Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,91 @@
import org.opendataloader.pdf.processors.DocumentProcessor;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* The main entry point for the opendataloader-pdf library.
* Use the static method {@link #processFile(String, Config)} to process a PDF.
*/
public final class OpenDataLoaderPDF {

// Class-scoped java.util.logging logger, named after this class; used below to report rejected input paths.
private static final Logger LOGGER = Logger.getLogger(OpenDataLoaderPDF.class.getName());

// Private no-op constructor: the class cannot be instantiated from outside; callers use its static methods.
private OpenDataLoaderPDF() {
}

/**
* Processes a PDF file to extract its content and structure based on the provided configuration.
*
* <p>Input validation is performed before processing. Callers may catch
* {@link IllegalArgumentException} to skip invalid entries in a batch loop:
* <pre>{@code
* for (String pdf : paths) {
* try {
* OpenDataLoaderPDF.processFile(pdf, config);
* } catch (IllegalArgumentException e) {
* // skip invalid path and continue
* }
* }
* }</pre>
*
* @param inputPdfName The path to the input PDF file.
* @param config The configuration object specifying output formats and other options.
* @throws IllegalArgumentException if {@code inputPdfName} is null, blank, syntactically
* invalid, does not exist, is not a regular file, or does
* not have a {@code .pdf} extension.
* @throws IOException If an error occurs during file reading or processing.
*/
public static void processFile(String inputPdfName, Config config) throws IOException {
// Validate first: an unusable path is rejected with IllegalArgumentException before any processing is attempted.
validateInputFile(inputPdfName);
// Delegate the actual work, passing the original path string and config through unchanged.
DocumentProcessor.processFile(inputPdfName, config);
}

/**
* Validates that {@code inputPdfName} refers to an existing, regular PDF file.
*
* @param inputPdfName the path string to validate
* @throws IllegalArgumentException if the path is null, blank, syntactically invalid,
* non-existent, not a regular file, or lacks a {@code .pdf} extension
*/
private static void validateInputFile(String inputPdfName) {
if (inputPdfName == null || inputPdfName.isBlank()) {
LOGGER.log(Level.WARNING, "Input PDF path is null or blank");
throw new IllegalArgumentException("Input PDF path must not be null or blank");
}

final Path path;
try {
path = Paths.get(inputPdfName);
} catch (InvalidPathException ex) {
LOGGER.log(Level.WARNING, "Syntactically invalid path supplied");
throw new IllegalArgumentException("Invalid file path: " + ex.getReason(), ex);
}

final String fileName = path.getFileName().toString();

if (!Files.exists(path)) {
LOGGER.log(Level.WARNING, "PDF file does not exist: {0}", fileName);
throw new IllegalArgumentException("File does not exist: " + fileName);
}

if (!Files.isRegularFile(path)) {
LOGGER.log(Level.WARNING, "Path does not point to a regular file: {0}", fileName);
throw new IllegalArgumentException("Path is not a regular file: " + fileName);
}

if (!fileName.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
LOGGER.log(Level.WARNING, "File does not have a .pdf extension: {0}", fileName);
throw new IllegalArgumentException("File must have a .pdf extension: " + fileName);
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
}

/**
* Shuts down any cached resources used by the library.
*
Expand Down