Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,22 @@
import org.opendataloader.pdf.processors.DocumentProcessor;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* The main entry point for the opendataloader-pdf library.
* Use the static method {@link #processFile(String, Config)} to process a PDF.
*/
public final class OpenDataLoaderPDF {

// java.util.logging logger for this class, named after the class's canonical name.
private static final Logger LOGGER = Logger.getLogger(OpenDataLoaderPDF.class.getCanonicalName());

private OpenDataLoaderPDF() {
    // Intentionally empty and private: this final class exposes a static API only.
}

Expand All @@ -34,10 +43,52 @@ private OpenDataLoaderPDF() {
*
* @param inputPdfName The path to the input PDF file.
* @param config The configuration object specifying output formats and other options.
* @throws IOException If an error occurs during file reading or processing.
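* @throws IllegalArgumentException If the input path is null or blank, syntactically invalid, does not exist, is not a regular file, or does not end with {@code .pdf}.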
*/
public static void processFile(String inputPdfName, Config config) throws IOException {
    // Fail fast on an unusable input path before any processing work begins.
    validateInputFile(inputPdfName);
    // The path has passed validation; hand the actual work over to the document processor.
    DocumentProcessor.processFile(inputPdfName, config);
}

/**
 * Validates that the given path refers to an existing regular file whose name ends with
 * {@code .pdf} (compared case-insensitively).
 *
 * <p>Only the path and file name are checked; the file content is not inspected. Each
 * failure is logged at WARNING level without echoing the caller-supplied path, so local
 * filesystem details are not written to logs. The path is still included in the message
 * of the thrown exception where it was before.
 *
 * @param inputPdfName the path to the input file
 * @throws IllegalArgumentException if the path is null or blank, syntactically
 * invalid, does not exist, is not a regular file, or does not end with {@code .pdf}
 */
private static void validateInputFile(String inputPdfName) {

    if (inputPdfName == null || inputPdfName.isBlank()) {
        LOGGER.log(Level.WARNING,"Input PDF name is null or Empty");
        throw new IllegalArgumentException("Input PDF name is null or Empty");
    }

    final Path path;

    try {
        path = Paths.get(inputPdfName);
    } catch (InvalidPathException ex) {
        LOGGER.log(Level.WARNING, "Invalid input PDF path");
        // Chain the original exception so the reason the path failed to parse is not lost.
        throw new IllegalArgumentException("Invalid Path: " + inputPdfName, ex);
    }

    if (!Files.exists(path)) {
        LOGGER.log(Level.WARNING, "Input PDF file does not exist");
        throw new IllegalArgumentException("File not found at " + inputPdfName + " location");
    }

    // Rejects directories and other non-regular entries that exist at the path.
    if (!Files.isRegularFile(path)) {
        LOGGER.log(Level.WARNING, "Input PDF path is not a regular file");
        throw new IllegalArgumentException("Not a valid file " + inputPdfName);
    }

    // Locale.ROOT keeps the lower-casing independent of the default locale.
    if (!path.getFileName().toString().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
        LOGGER.log(Level.WARNING,"Not a PDF file");
        throw new IllegalArgumentException("Not a PDF file");
    }
Comment on lines +63 to +90
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid logging full user-supplied paths at WARNING level.

These warning logs include raw input paths, which can leak sensitive local filesystem details (e.g., usernames/home paths) into centralized logs.

🔧 Suggested hardening
-            LOGGER.log(Level.WARNING,"Input PDF name is null or Empty");
+            LOGGER.log(Level.WARNING, "Input PDF path is null or blank");

-            LOGGER.log(Level.WARNING,"Invalid Path: " + inputPdfName);
+            LOGGER.log(Level.WARNING, "Invalid input PDF path");

-            LOGGER.log(Level.WARNING,"File not found at " + inputPdfName + " location");
+            LOGGER.log(Level.WARNING, "Input PDF file does not exist");

-            LOGGER.log(Level.WARNING,"Not a valid file " + inputPdfName);
+            LOGGER.log(Level.WARNING, "Input PDF path is not a regular file");
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if (inputPdfName == null || inputPdfName.isBlank()) {
LOGGER.log(Level.WARNING,"Input PDF name is null or Empty");
throw new IllegalArgumentException("Input PDF name is null or Empty");
}
final Path path;
try {
path = Paths.get(inputPdfName);
} catch (InvalidPathException ex) {
LOGGER.log(Level.WARNING,"Invalid Path: " + inputPdfName);
throw new IllegalArgumentException("Invalid Path: " + inputPdfName);
}
if (!Files.exists(path)) {
LOGGER.log(Level.WARNING,"File not found at " + inputPdfName + " location");
throw new IllegalArgumentException("File not found at " + inputPdfName + " location");
}
if (!Files.isRegularFile(path)) {
LOGGER.log(Level.WARNING,"Not a valid file " + inputPdfName);
throw new IllegalArgumentException("Not a valid file " + inputPdfName);
}
if (!path.getFileName().toString().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
LOGGER.log(Level.WARNING,"Not a PDF file");
throw new IllegalArgumentException("Not a PDF file");
}
if (inputPdfName == null || inputPdfName.isBlank()) {
LOGGER.log(Level.WARNING, "Input PDF path is null or blank");
throw new IllegalArgumentException("Input PDF name is null or Empty");
}
final Path path;
try {
path = Paths.get(inputPdfName);
} catch (InvalidPathException ex) {
LOGGER.log(Level.WARNING, "Invalid input PDF path");
throw new IllegalArgumentException("Invalid Path: " + inputPdfName);
}
if (!Files.exists(path)) {
LOGGER.log(Level.WARNING, "Input PDF file does not exist");
throw new IllegalArgumentException("File not found at " + inputPdfName + " location");
}
if (!Files.isRegularFile(path)) {
LOGGER.log(Level.WARNING, "Input PDF path is not a regular file");
throw new IllegalArgumentException("Not a valid file " + inputPdfName);
}
if (!path.getFileName().toString().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
LOGGER.log(Level.WARNING, "Not a PDF file");
throw new IllegalArgumentException("Not a PDF file");
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OpenDataLoaderPDF.java`
around lines 61 - 88, The warning logs in OpenDataLoaderPDF (the block using
inputPdfName, path, LOGGER) currently print the full user-supplied path and must
be removed or sanitized; update each LOGGER.log call in that validation block to
avoid echoing inputPdfName or the full path and instead log a safe identifier
such as the file base name (path.getFileName().toString()) or a constant
placeholder like "<redacted-path>" (for the InvalidPathException case where path
is not available), e.g. replace messages that concatenate inputPdfName with ones
that use only the safeName or the placeholder so no absolute/user path is
emitted.


}

/**
Expand Down