opendataloader-project · the-waste-land · Apr 11, 2026 · Apr 11, 2026 · Apr 11, 2026 · coderabbitai
@@ -329,8 +329,9 @@ private static void generateOutputs(String inputPdfName, List<List<IObject>> con
                 imagesDirectory = config.getOutputFolder() + File.separator + baseName + MarkdownSyntax.IMAGES_DIRECTORY_SUFFIX;
             }
             StaticLayoutContainers.setImagesDirectory(imagesDirectory);
-            ImagesUtils imagesUtils = new ImagesUtils();
-            imagesUtils.write(contents, inputPdfName, config.getPassword());
+            try (ImagesUtils imagesUtils = new ImagesUtils()) {
+                imagesUtils.write(contents, inputPdfName, config.getPassword());
+            }
         }
         if (config.isGeneratePDF()) {
             PDFWriter pdfWriter = new PDFWriter();

@@ -15,6 +15,12 @@
  */
 package org.opendataloader.pdf.utils;
 
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.opendataloader.pdf.containers.StaticLayoutContainers;
 import org.opendataloader.pdf.entities.SemanticPicture;
 import org.opendataloader.pdf.markdown.MarkdownSyntax;
@@ -27,24 +33,28 @@
 import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorder;
 import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderCell;
 import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderRow;
-import org.verapdf.wcag.algorithms.semanticalgorithms.consumers.ContrastRatioConsumer;
 import org.verapdf.wcag.algorithms.semanticalgorithms.containers.StaticContainers;
 
 import javax.imageio.ImageIO;
+import java.awt.geom.Point2D;
 import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Objects;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-public class ImagesUtils {
+public class ImagesUtils implements AutoCloseable {
     private static final Logger LOGGER = Logger.getLogger(ImagesUtils.class.getCanonicalName());
-    private ContrastRatioConsumer contrastRatioConsumer;
 
-    public ContrastRatioConsumer getContrastRatioConsumer() {
-        return contrastRatioConsumer;
-    }
+    private org.apache.pdfbox.pdmodel.PDDocument pdfBoxDocument;
+    private final Map<Integer, List<PageImageReference>> pageImagesByNumber = new HashMap<>();
+    private String loadedPdfFilePath;
+    private String loadedPassword;
 
     public void createImagesDirectory(String path) {
         File directory = new File(path);
@@ -55,8 +65,12 @@ public void createImagesDirectory(String path) {
 
     public void write(List<List<IObject>> contents, String pdfFilePath, String password) {
         for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) {
-            for (IObject content : contents.get(pageNumber)) {
-                writeFromContents(content, pdfFilePath, password);
+            try {
+                for (IObject content : contents.get(pageNumber)) {
+                    writeFromContents(content, pdfFilePath, password);
+                }
+            } finally {
+                releasePageImages(pageNumber);
             }
         }
     }
@@ -95,29 +109,27 @@ protected void writeImage(ImageChunk chunk, String pdfFilePath, String password)
         int currentImageIndex = StaticLayoutContainers.incrementImageIndex();
         if (currentImageIndex == 1) {
             createImagesDirectory(StaticLayoutContainers.getImagesDirectory());
-            contrastRatioConsumer = StaticLayoutContainers.getContrastRatioConsumer(pdfFilePath, password, false, null);
         }
         String imageFormat = StaticLayoutContainers.getImageFormat();
         String fileName = String.format(MarkdownSyntax.IMAGE_FILE_NAME_FORMAT, StaticLayoutContainers.getImagesDirectory(), File.separator, currentImageIndex, imageFormat);
         chunk.setIndex(currentImageIndex);
-        createImageFile(chunk.getBoundingBox(), fileName, imageFormat);
+        createImageFile(chunk.getBoundingBox(), fileName, imageFormat, pdfFilePath, password);
     }
 
     protected void writePicture(SemanticPicture picture, String pdfFilePath, String password) {
         int pictureIndex = picture.getPictureIndex();
-        if (contrastRatioConsumer == null) {
+        if (pictureIndex == 1) {
             createImagesDirectory(StaticLayoutContainers.getImagesDirectory());
-            contrastRatioConsumer = StaticLayoutContainers.getContrastRatioConsumer(pdfFilePath, password, false, null);
         }
         String imageFormat = StaticLayoutContainers.getImageFormat();
         String fileName = String.format(MarkdownSyntax.IMAGE_FILE_NAME_FORMAT, StaticLayoutContainers.getImagesDirectory(), File.separator, pictureIndex, imageFormat);
-        createImageFile(picture.getBoundingBox(), fileName, imageFormat);
+        createImageFile(picture.getBoundingBox(), fileName, imageFormat, pdfFilePath, password);
     }
 
-    private void createImageFile(BoundingBox imageBox, String fileName, String imageFormat) {
+    private void createImageFile(BoundingBox imageBox, String fileName, String imageFormat, String pdfFilePath, String password) {
         try {
             File outputFile = new File(fileName);
-            BufferedImage targetImage = contrastRatioConsumer != null ? contrastRatioConsumer.getPageSubImage(imageBox) : null;
+            BufferedImage targetImage = extractPageImage(imageBox, pdfFilePath, password);
             if (targetImage == null) {
                 return;
             }
@@ -127,8 +139,271 @@ private void createImageFile(BoundingBox imageBox, String fileName, String image
         }
     }
 
+    protected BufferedImage extractPageImage(BoundingBox imageBox, String pdfFilePath, String password) throws IOException {
+        if (imageBox == null || imageBox.getPageNumber() == null) {
+            return null;
+        }
+
+        ensurePdfDocument(pdfFilePath, password);
+        int pageIndex = imageBox.getPageNumber();
+        if (pageIndex < 0 || pageIndex >= pdfBoxDocument.getNumberOfPages()) {
+            return null;
+        }
+
+        PageImageReference pageImage = findBestPageImage(imageBox, pageIndex);
+        if (pageImage == null) {
+            return null;
+        }
+
+        pageImage.markUsed();
+        return pageImage.getImage();
+    }
+
+    private PageImageReference findBestPageImage(BoundingBox targetBoundingBox, int pageIndex) throws IOException {
+        List<PageImageReference> pageImages = getPageImages(pageIndex);
+        PageImageReference bestOverlapMatch = null;
+        double bestOverlapArea = 0.0;
+        double bestAreaDelta = Double.MAX_VALUE;
+        for (PageImageReference pageImage : pageImages) {
+            if (pageImage.isUsed()) {
+                continue;
+            }
+            double overlapArea = getIntersectionArea(targetBoundingBox, pageImage.getBoundingBox());
+            if (overlapArea <= 0.0) {
+                continue;
+            }
+            double areaDelta = getAreaDeltaRatio(targetBoundingBox, pageImage.getBoundingBox());
+            if (bestOverlapMatch == null || overlapArea > bestOverlapArea
+                || (Double.compare(overlapArea, bestOverlapArea) == 0 && areaDelta < bestAreaDelta)) {
+                bestOverlapMatch = pageImage;
+                bestOverlapArea = overlapArea;
+                bestAreaDelta = areaDelta;
+            }
+        }
+        if (bestOverlapMatch != null) {
+            return bestOverlapMatch;
+        }
+        return null;
+    }
+
+    private List<PageImageReference> getPageImages(int pageIndex) throws IOException {
+        List<PageImageReference> pageImages = pageImagesByNumber.get(pageIndex);
+        if (pageImages != null) {
+            return pageImages;
+        }
+
+        List<PageImageReference> extractedImages = new ArrayList<>();
+        PDPage page = pdfBoxDocument.getPage(pageIndex);
+        new PageImageCollector(page, pageIndex, extractedImages).processPage(page);
+        pageImagesByNumber.put(pageIndex, extractedImages);
+        return extractedImages;
+    }
+
+    private double getIntersectionArea(BoundingBox first, BoundingBox second) {
+        double left = Math.max(first.getLeftX(), second.getLeftX());
+        double right = Math.min(first.getRightX(), second.getRightX());
+        double bottom = Math.max(first.getBottomY(), second.getBottomY());
+        double top = Math.min(first.getTopY(), second.getTopY());
+        if (right <= left || top <= bottom) {
+            return 0.0;
+        }
+        return (right - left) * (top - bottom);
+    }
+
+    private double getAreaDeltaRatio(BoundingBox first, BoundingBox second) {
+        double firstArea = getBoundingBoxArea(first);
+        double secondArea = getBoundingBoxArea(second);
+        double largerArea = Math.max(1.0, Math.max(firstArea, secondArea));
+        return Math.abs(firstArea - secondArea) / largerArea;
+    }
+
+    private double getBoundingBoxArea(BoundingBox boundingBox) {
+        return Math.max(0.0, boundingBox.getWidth()) * Math.max(0.0, boundingBox.getHeight());
+    }
+
+    private void ensurePdfDocument(String pdfFilePath, String password) throws IOException {
+        if (pdfBoxDocument != null) {
+            if (Objects.equals(loadedPdfFilePath, pdfFilePath) && Objects.equals(loadedPassword, password)) {
+                return;
+            }
+            releaseAllPageImages();
+            closePdfDocument();
+        }
+        File pdfFile = new File(pdfFilePath);
+        pdfBoxDocument = password != null && !password.isEmpty()
+            ? Loader.loadPDF(pdfFile, password)
+            : Loader.loadPDF(pdfFile);
+        loadedPdfFilePath = pdfFilePath;
+        loadedPassword = password;
+    }
+
+    private void releasePageImages(int pageIndex) {
+        List<PageImageReference> pageImages = pageImagesByNumber.remove(pageIndex);
+        if (pageImages == null) {
+            return;
+        }
+        for (PageImageReference pageImage : pageImages) {
+            pageImage.release();
+        }
+    }
+
+    private void releaseAllPageImages() {
+        for (List<PageImageReference> pageImages : pageImagesByNumber.values()) {
+            for (PageImageReference pageImage : pageImages) {
+                pageImage.release();
+            }
+        }
+        pageImagesByNumber.clear();
+    }
+
+    private void closePdfDocument() throws IOException {
+        if (pdfBoxDocument == null) {
+            return;
+        }
+        try {
+            pdfBoxDocument.close();
+        } finally {
+            pdfBoxDocument = null;
+            loadedPdfFilePath = null;
+            loadedPassword = null;
+        }
+    }
+
     public static boolean isImageFileExists(String fileName) {
         File outputFile = new File(fileName);
         return outputFile.exists();
     }
+
+    @Override
+    public void close() {
+        releaseAllPageImages();
+        if (pdfBoxDocument != null) {
+            try {
+                closePdfDocument();
+            } catch (IOException e) {
+                LOGGER.log(Level.WARNING, "Unable to close image extraction document: " + e.getMessage());
+            }
+        }
+    }
+
+    private static final class PageImageReference {
+        private final PDImage image;
+        private final BoundingBox boundingBox;
+        private BufferedImage bufferedImage;
+        private boolean used;
+
+        private PageImageReference(PDImage image, BoundingBox boundingBox) {
+            this.image = image;
+            this.boundingBox = boundingBox;
+        }
+
+        private BoundingBox getBoundingBox() {
+            return boundingBox;
+        }
+
+        private boolean isUsed() {
+            return used;
+        }
+
+        private void markUsed() {
+            used = true;
+        }
+
+        private BufferedImage getImage() throws IOException {
+            if (bufferedImage == null) {
+                if (image instanceof PDImageXObject) {
+                    bufferedImage = ((PDImageXObject) image).getOpaqueImage();
+                } else {
+                    bufferedImage = image.getImage();
+                }
+            }
+            return bufferedImage;
+        }
+
+        private void release() {
+            if (bufferedImage != null) {
+                bufferedImage.flush();
+                bufferedImage = null;
+            }
+        }
+    }
+
+    private static final class PageImageCollector extends PDFGraphicsStreamEngine {
+        private final int pageIndex;
+        private final List<PageImageReference> pageImages;
+        private Point2D currentPoint = new Point2D.Float();
+
+        private PageImageCollector(PDPage page, int pageIndex, List<PageImageReference> pageImages) {
+            super(page);
+            this.pageIndex = pageIndex;
+            this.pageImages = pageImages;
+        }
+
+        @Override
+        public void drawImage(PDImage pdImage) {
+            Point2D lowerLeft = transformedPoint(0, 0);
+            Point2D lowerRight = transformedPoint(1, 0);
+            Point2D upperLeft = transformedPoint(0, 1);
+            Point2D upperRight = transformedPoint(1, 1);
+
+            double left = Math.min(Math.min(lowerLeft.getX(), lowerRight.getX()), Math.min(upperLeft.getX(), upperRight.getX()));
+            double right = Math.max(Math.max(lowerLeft.getX(), lowerRight.getX()), Math.max(upperLeft.getX(), upperRight.getX()));
+            double bottom = Math.min(Math.min(lowerLeft.getY(), lowerRight.getY()), Math.min(upperLeft.getY(), upperRight.getY()));
+            double top = Math.max(Math.max(lowerLeft.getY(), lowerRight.getY()), Math.max(upperLeft.getY(), upperRight.getY()));
+
+            pageImages.add(new PageImageReference(pdImage, new BoundingBox(pageIndex, left, bottom, right, top)));
+        }
+
+        @Override
+        public void appendRectangle(Point2D point0, Point2D point1, Point2D point2, Point2D point3) {
+        }
+
+        @Override
+        public void clip(int windingRule) {
+        }
+
+        @Override
+        public void moveTo(float x, float y) {
+            currentPoint = new Point2D.Float(x, y);
+        }
+
+        @Override
+        public void lineTo(float x, float y) {
+            currentPoint = new Point2D.Float(x, y);
+        }
+
+        @Override
+        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+            currentPoint = new Point2D.Float(x3, y3);
+        }
+
+        @Override
+        public Point2D getCurrentPoint() {
+            return currentPoint;
+        }
+
+        @Override
+        public void closePath() {
+        }
+
+        @Override
+        public void endPath() {
+        }
+
+        @Override
+        public void strokePath() {
+        }
+
+        @Override
+        public void fillPath(int windingRule) {
+        }
+
+        @Override
+        public void fillAndStrokePath(int windingRule) {
+        }
+
+        @Override
+        public void shadingFill(COSName shadingName) {
+        }
+    }
 }