From 955f984e69e385fbf49c279e3fdd242695b8f830 Mon Sep 17 00:00:00 2001 From: Hamid Husain Date: Sat, 11 Apr 2026 14:16:40 +0530 Subject: [PATCH 1/6] Added support to close file and release lock after processing. --- .../pdf/processors/DocumentProcessor.java | 73 ++++++++++++++----- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index 2e8d138c2..cff2e0033 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -68,6 +68,38 @@ public class DocumentProcessor { private static final Logger LOGGER = Logger.getLogger(DocumentProcessor.class.getCanonicalName()); + /** + * Releases all PDF-related resources to prevent file locks. + * + * Why this is needed: + * - PDDocument holds a native file handle + * - On Windows, this causes file locking if not closed + * - Static containers may retain references → memory leaks + */ + private static void closePdfResources() { + try { + PDDocument document = StaticResources.getDocument(); + if (document != null) { + // releases OS-level file handle + document.close(); + } + } catch (Exception e) { + LOGGER.log(Level.WARNING, "Unable to close PDF document", e); + } + + try { + // Clear static/shared containers to avoid stale references + StaticResources.clear(); + StaticContainers.updateContainers(null); + StaticLayoutContainers.clearContainers(); + StaticStorages.clearAllContainers(); + StaticCoreContainers.clearAllContainers(); + StaticXmpCoreContainers.clearAllContainers(); + } catch (Exception e) { + LOGGER.log(Level.WARNING, "Error clearing static containers", e); + } + } + /** * Processes a PDF file and generates the configured outputs. * @@ -76,25 +108,30 @@ public class DocumentProcessor { * @throws IOException if unable to process the file */ public static void processFile(String inputPdfName, Config config) throws IOException { - preprocessing(inputPdfName, config); - calculateDocumentInfo(); - Set pagesToProcess = getValidPageNumbers(config); - List> contents; - if (StaticLayoutContainers.isUseStructTree()) { - contents = TaggedDocumentProcessor.processDocument(inputPdfName, config, pagesToProcess); - } else if (config.isHybridEnabled()) { - contents = HybridDocumentProcessor.processDocument(inputPdfName, config, pagesToProcess); - } else { - contents = processDocument(inputPdfName, config, pagesToProcess); - } - if (config.needsStructuredProcessing()) { - sortContents(contents, config); + try { + preprocessing(inputPdfName, config); + calculateDocumentInfo(); + Set pagesToProcess = getValidPageNumbers(config); + List> contents; + if (StaticLayoutContainers.isUseStructTree()) { + contents = TaggedDocumentProcessor.processDocument(inputPdfName, config, pagesToProcess); + } else if (config.isHybridEnabled()) { + contents = HybridDocumentProcessor.processDocument(inputPdfName, config, pagesToProcess); + } else { + contents = processDocument(inputPdfName, config, pagesToProcess); + } + if (config.needsStructuredProcessing()) { + sortContents(contents, config); + } + ContentSanitizer contentSanitizer = new ContentSanitizer(config.getFilterConfig().getFilterRules(), + config.getFilterConfig().isFilterSensitiveData()); + contentSanitizer.sanitizeContents(contents); + generateOutputs(inputPdfName, contents, config); + } finally { + // Ensures resources are always released, even if processing throws an exception + closePdfResources(); } - ContentSanitizer contentSanitizer = new ContentSanitizer(config.getFilterConfig().getFilterRules(), - config.getFilterConfig().isFilterSensitiveData()); - contentSanitizer.sanitizeContents(contents); - generateOutputs(inputPdfName, contents, config); - } + } /** * Validates and filters page numbers from config against actual document pages. From 0bc6416ecb6be6a9d2eb17a6676c08b72d42043d Mon Sep 17 00:00:00 2001 From: Hamid Husain Date: Sat, 11 Apr 2026 14:50:01 +0530 Subject: [PATCH 2/6] fix: isolate static cleanup steps to prevent partial cleanup on failure --- .../pdf/processors/DocumentProcessor.java | 67 ++++++++++++------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index cff2e0033..6e8cd028f 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -69,34 +69,38 @@ public class DocumentProcessor { private static final Logger LOGGER = Logger.getLogger(DocumentProcessor.class.getCanonicalName()); /** - * Releases all PDF-related resources to prevent file locks. - * - * Why this is needed: - * - PDDocument holds a native file handle - * - On Windows, this causes file locking if not closed - * - Static containers may retain references → memory leaks - */ - private static void closePdfResources() { - try { - PDDocument document = StaticResources.getDocument(); - if (document != null) { - // releases OS-level file handle - document.close(); - } - } catch (Exception e) { - LOGGER.log(Level.WARNING, "Unable to close PDF document", e); + * Releases PDF resources to prevent file locks and memory leaks. + * - Closes PDDocument to free OS file handles (required for file deletion) + * - Clears static containers to remove lingering references + * Should always be called in a finally block. + */ + private static void closePdfResources() throws Exception { + PDDocument document = StaticResources.getDocument(); + if (document != null) { + document.close(); } + // cleanup static containers + clearCleanupStep("StaticResources", StaticResources::clear); + clearCleanupStep("StaticContainers", () -> StaticContainers.updateContainers(null)); + clearCleanupStep("StaticLayoutContainers", StaticLayoutContainers::clearContainers); + clearCleanupStep("StaticStorages", StaticStorages::clearAllContainers); + clearCleanupStep("StaticCoreContainers", StaticCoreContainers::clearAllContainers); + clearCleanupStep("StaticXmpCoreContainers", StaticXmpCoreContainers::clearAllContainers); + } + + /** + * Executes a cleanup step safely without interrupting subsequent steps. + * + * Each cleanup action is isolated so that a failure in one step + * does not prevent the remaining cleanup operations from running. + * Errors are logged for debugging purposes. + */ + private static void clearCleanupStep(String name, Runnable cleanup) { try { - // Clear static/shared containers to avoid stale references - StaticResources.clear(); - StaticContainers.updateContainers(null); - StaticLayoutContainers.clearContainers(); - StaticStorages.clearAllContainers(); - StaticCoreContainers.clearAllContainers(); - StaticXmpCoreContainers.clearAllContainers(); + cleanup.run(); } catch (Exception e) { - LOGGER.log(Level.WARNING, "Error clearing static containers", e); + LOGGER.log(Level.WARNING, "Error clearing " + name, e); } } @@ -129,7 +133,20 @@ public static void processFile(String inputPdfName, Config config) throws IOExce generateOutputs(inputPdfName, contents, config); } finally { // Ensures resources are always released, even if processing throws an exception - closePdfResources(); + try { + closePdfResources(); + } catch (Exception closeException) { + LOGGER.log(Level.WARNING, "Error during PDF resource cleanup", closeException); + if (originalException != null) { + originalException.addSuppressed(closeException); + } else { + if (closeException instanceof IOException) { + throw (IOException) closeException; + } else { + throw new IOException("Failed to close PDF resources", closeException); + } + } + } } } From bd75177e3f7680334fbacd95f4977c6eeba17a01 Mon Sep 17 00:00:00 2001 From: Hamid Husain Date: Sat, 11 Apr 2026 15:27:56 +0530 Subject: [PATCH 3/6] review fixes. --- .../pdf/processors/DocumentProcessor.java | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index 6e8cd028f..56a1e3d28 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -73,20 +73,40 @@ public class DocumentProcessor { * - Closes PDDocument to free OS file handles (required for file deletion) * - Clears static containers to remove lingering references * Should always be called in a finally block. - */ + */ private static void closePdfResources() throws Exception { + Exception closeFailure = null; PDDocument document = StaticResources.getDocument(); if (document != null) { - document.close(); + try { + document.close(); + } catch (Exception e) { + closeFailure = e; + } + } + + try { + StaticLayoutContainers.closeContrastRatioConsumer(); + } catch (Exception e) { + if (closeFailure != null) { + closeFailure.addSuppressed(e); + } else { + closeFailure = e; + } } // cleanup static containers clearCleanupStep("StaticResources", StaticResources::clear); clearCleanupStep("StaticContainers", () -> StaticContainers.updateContainers(null)); + clearCleanupStep("GFStaticContainers", () -> org.verapdf.gf.model.impl.containers.StaticContainers.updateContainers(null)); clearCleanupStep("StaticLayoutContainers", StaticLayoutContainers::clearContainers); clearCleanupStep("StaticStorages", StaticStorages::clearAllContainers); clearCleanupStep("StaticCoreContainers", StaticCoreContainers::clearAllContainers); clearCleanupStep("StaticXmpCoreContainers", StaticXmpCoreContainers::clearAllContainers); + + if (closeFailure != null) { + throw closeFailure; + } } /** @@ -112,6 +132,7 @@ private static void clearCleanupStep(String name, Runnable cleanup) { * @throws IOException if unable to process the file */ public static void processFile(String inputPdfName, Config config) throws IOException { + Throwable processingFailure = null; try { preprocessing(inputPdfName, config); calculateDocumentInfo(); @@ -131,14 +152,17 @@ public static void processFile(String inputPdfName, Config config) throws IOExce config.getFilterConfig().isFilterSensitiveData()); contentSanitizer.sanitizeContents(contents); generateOutputs(inputPdfName, contents, config); + } catch (IOException | RuntimeException | Error e) { + processingFailure = e; + throw e; } finally { // Ensures resources are always released, even if processing throws an exception try { closePdfResources(); } catch (Exception closeException) { LOGGER.log(Level.WARNING, "Error during PDF resource cleanup", closeException); - if (originalException != null) { - originalException.addSuppressed(closeException); + if (processingFailure != null) { + processingFailure.addSuppressed(closeException); } else { if (closeException instanceof IOException) { throw (IOException) closeException; From 8dd0f2269b3dff2f0be4ce72abd9277fc0b88e48 Mon Sep 17 00:00:00 2001 From: Hamid Husain Date: Sat, 11 Apr 2026 15:44:27 +0530 Subject: [PATCH 4/6] Review fixes --- .../org/opendataloader/pdf/processors/DocumentProcessor.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index 56a1e3d28..1679268eb 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -98,7 +98,10 @@ private static void closePdfResources() throws Exception { // cleanup static containers clearCleanupStep("StaticResources", StaticResources::clear); clearCleanupStep("StaticContainers", () -> StaticContainers.updateContainers(null)); - clearCleanupStep("GFStaticContainers", () -> org.verapdf.gf.model.impl.containers.StaticContainers.updateContainers(null)); + clearCleanupStep( + "GFStaticContainers", + org.verapdf.gf.model.impl.containers.StaticContainers::clearAllContainers + ); clearCleanupStep("StaticLayoutContainers", StaticLayoutContainers::clearContainers); clearCleanupStep("StaticStorages", StaticStorages::clearAllContainers); clearCleanupStep("StaticCoreContainers", StaticCoreContainers::clearAllContainers); From 96db4a77a18d9366e47b8993e53ce0a9af9388cf Mon Sep 17 00:00:00 2001 From: Hamid Husain <41366327+hamid17amu@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:51:42 +0530 Subject: [PATCH 5/6] Correct license URL formatting in DocumentProcessor.java Fixed formatting of the license URL in the comments. --- .../org/opendataloader/pdf/processors/DocumentProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index 143452ec2..00f817c81 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0* http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, From bc15271bb034d344bb3a1187432b087e0baa5ca3 Mon Sep 17 00:00:00 2001 From: Hamid Husain <41366327+hamid17amu@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:52:06 +0530 Subject: [PATCH 6/6] Fix formatting of license URL in DocumentProcessor.java --- .../org/opendataloader/pdf/processors/DocumentProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index 00f817c81..9daa9eafc 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0* http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS,