diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OpenDataLoaderPDF.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OpenDataLoaderPDF.java index b0a13af27..3dced5fc3 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OpenDataLoaderPDF.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/OpenDataLoaderPDF.java @@ -19,6 +19,13 @@ import org.opendataloader.pdf.processors.DocumentProcessor; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Locale; +import java.util.logging.Level; +import java.util.logging.Logger; /** * The main entry point for the opendataloader-pdf library. @@ -26,20 +33,83 @@ */ public final class OpenDataLoaderPDF { + private static final Logger LOGGER = Logger.getLogger(OpenDataLoaderPDF.class.getName()); + + /** Utility class; do not instantiate. */ private OpenDataLoaderPDF() { } /** * Processes a PDF file to extract its content and structure based on the provided configuration. * + *

Input validation is performed before processing. Callers may catch + * {@link IllegalArgumentException} to skip invalid entries in a batch loop: + *

{@code
+     * for (String pdf : paths) {
+     *     try {
+     *         OpenDataLoaderPDF.processFile(pdf, config);
+     *     } catch (IllegalArgumentException e) {
+     *         // skip invalid path and continue
+     *     }
+     * }
+     * }
+     *
      * @param inputPdfName The path to the input PDF file.
      * @param config The configuration object specifying output formats and other options.
-     * @throws IOException If an error occurs during file reading or processing.
+     * @throws IllegalArgumentException if {@code inputPdfName} is null, blank, syntactically
+     *                                  invalid, does not exist, is not a regular file, or does
+     *                                  not have a {@code .pdf} extension.
+     * @throws IOException              If an error occurs during file reading or processing.
      */
     public static void processFile(String inputPdfName, Config config) throws IOException {
+        validateInputFile(inputPdfName);
         DocumentProcessor.processFile(inputPdfName, config);
     }

+    /**
+     * Rejects any input string that does not name an existing, regular {@code .pdf} file.
+     *
+     * <p>The checks run in a fixed order and the first one that fails decides the outcome:
+     * null/blank input, path syntax, presence of a file-name component, existence,
+     * regular-file status, and finally a case-insensitive {@code .pdf} suffix. Every failure
+     * is logged at {@code WARNING} before the exception is thrown. The messages produced by
+     * the file-system checks mention only the last path component.
+     *
+     * @param inputPdfName the path string to validate
+     * @throws IllegalArgumentException if any of the checks listed above fails
+     */
+    private static void validateInputFile(String inputPdfName) {
+        final boolean missingInput = inputPdfName == null || inputPdfName.isBlank();
+        if (missingInput) {
+            LOGGER.log(Level.WARNING, "Input PDF path is null or blank");
+            throw new IllegalArgumentException("Input PDF path must not be null or blank");
+        }
+
+        // Parsing can fail on characters the platform's file system does not accept.
+        final Path candidate;
+        try {
+            candidate = Paths.get(inputPdfName);
+        } catch (InvalidPathException ex) {
+            LOGGER.log(Level.WARNING, "Syntactically invalid path supplied");
+            throw new IllegalArgumentException("Invalid file path: " + ex.getReason(), ex);
+        }
+
+        // getFileName() yields null when the path has no name elements (e.g. a bare root).
+        final Path leaf = candidate.getFileName();
+        if (leaf == null) {
+            LOGGER.log(Level.WARNING, "Path has no file name component (root path not allowed)");
+            throw new IllegalArgumentException(
+                    "Path has no file name component (root paths are not allowed)");
+        }
+        final String leafName = leaf.toString();
+
+        // Each branch throws, so at most one of these conditions is ever reported.
+        if (!Files.exists(candidate)) {
+            LOGGER.log(Level.WARNING, "PDF file does not exist: {0}", leafName);
+            throw new IllegalArgumentException("File does not exist: " + leafName);
+        } else if (!Files.isRegularFile(candidate)) {
+            LOGGER.log(Level.WARNING, "Path does not point to a regular file: {0}", leafName);
+            throw new IllegalArgumentException("Path is not a regular file: " + leafName);
+        } else if (!leafName.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
+            LOGGER.log(Level.WARNING, "File does not have a .pdf extension: {0}", leafName);
+            throw new IllegalArgumentException("File must have a .pdf extension: " + leafName);
+        }
+    }
+
     /**
      * Shuts down any cached resources used by the library.
      *