Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -329,8 +329,9 @@ private static void generateOutputs(String inputPdfName, List<List<IObject>> con
imagesDirectory = config.getOutputFolder() + File.separator + baseName + MarkdownSyntax.IMAGES_DIRECTORY_SUFFIX;
}
StaticLayoutContainers.setImagesDirectory(imagesDirectory);
ImagesUtils imagesUtils = new ImagesUtils();
imagesUtils.write(contents, inputPdfName, config.getPassword());
try (ImagesUtils imagesUtils = new ImagesUtils()) {
imagesUtils.write(contents, inputPdfName, config.getPassword());
}
}
if (config.isGeneratePDF()) {
PDFWriter pdfWriter = new PDFWriter();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
*/
package org.opendataloader.pdf.utils;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.opendataloader.pdf.containers.StaticLayoutContainers;
import org.opendataloader.pdf.entities.SemanticPicture;
import org.opendataloader.pdf.markdown.MarkdownSyntax;
Expand All @@ -27,24 +33,28 @@
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorder;
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderCell;
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderRow;
import org.verapdf.wcag.algorithms.semanticalgorithms.consumers.ContrastRatioConsumer;
import org.verapdf.wcag.algorithms.semanticalgorithms.containers.StaticContainers;

import javax.imageio.ImageIO;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.logging.Level;
import java.util.logging.Logger;

public class ImagesUtils {
public class ImagesUtils implements AutoCloseable {
private static final Logger LOGGER = Logger.getLogger(ImagesUtils.class.getCanonicalName());
private ContrastRatioConsumer contrastRatioConsumer;

public ContrastRatioConsumer getContrastRatioConsumer() {
return contrastRatioConsumer;
}
private org.apache.pdfbox.pdmodel.PDDocument pdfBoxDocument;
private final Map<Integer, List<PageImageReference>> pageImagesByNumber = new HashMap<>();
private String loadedPdfFilePath;
private String loadedPassword;

Comment thread
coderabbitai[bot] marked this conversation as resolved.
public void createImagesDirectory(String path) {
File directory = new File(path);
Expand All @@ -55,8 +65,12 @@ public void createImagesDirectory(String path) {

public void write(List<List<IObject>> contents, String pdfFilePath, String password) {
for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) {
for (IObject content : contents.get(pageNumber)) {
writeFromContents(content, pdfFilePath, password);
try {
for (IObject content : contents.get(pageNumber)) {
writeFromContents(content, pdfFilePath, password);
}
} finally {
releasePageImages(pageNumber);
}
}
}
Expand Down Expand Up @@ -95,29 +109,27 @@ protected void writeImage(ImageChunk chunk, String pdfFilePath, String password)
int currentImageIndex = StaticLayoutContainers.incrementImageIndex();
if (currentImageIndex == 1) {
createImagesDirectory(StaticLayoutContainers.getImagesDirectory());
contrastRatioConsumer = StaticLayoutContainers.getContrastRatioConsumer(pdfFilePath, password, false, null);
}
String imageFormat = StaticLayoutContainers.getImageFormat();
String fileName = String.format(MarkdownSyntax.IMAGE_FILE_NAME_FORMAT, StaticLayoutContainers.getImagesDirectory(), File.separator, currentImageIndex, imageFormat);
chunk.setIndex(currentImageIndex);
createImageFile(chunk.getBoundingBox(), fileName, imageFormat);
createImageFile(chunk.getBoundingBox(), fileName, imageFormat, pdfFilePath, password);
}

protected void writePicture(SemanticPicture picture, String pdfFilePath, String password) {
int pictureIndex = picture.getPictureIndex();
if (contrastRatioConsumer == null) {
if (pictureIndex == 1) {
createImagesDirectory(StaticLayoutContainers.getImagesDirectory());
contrastRatioConsumer = StaticLayoutContainers.getContrastRatioConsumer(pdfFilePath, password, false, null);
}
String imageFormat = StaticLayoutContainers.getImageFormat();
String fileName = String.format(MarkdownSyntax.IMAGE_FILE_NAME_FORMAT, StaticLayoutContainers.getImagesDirectory(), File.separator, pictureIndex, imageFormat);
createImageFile(picture.getBoundingBox(), fileName, imageFormat);
createImageFile(picture.getBoundingBox(), fileName, imageFormat, pdfFilePath, password);
}
Comment on lines 119 to 127
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Directory creation relies on external picture indexing.

The writePicture method assumes picture.getPictureIndex() returns 1 for the first picture to trigger directory creation. If SemanticPicture indices don't start at 1 or are not sequential, the images directory may not be created before attempting to write the first picture file.

Consider using a flag or checking if the directory exists rather than relying on index values.

Suggested improvement
 protected void writePicture(SemanticPicture picture, String pdfFilePath, String password) {
     int pictureIndex = picture.getPictureIndex();
-    if (pictureIndex == 1) {
-        createImagesDirectory(StaticLayoutContainers.getImagesDirectory());
-    }
+    createImagesDirectory(StaticLayoutContainers.getImagesDirectory());
     String imageFormat = StaticLayoutContainers.getImageFormat();

Note: createImagesDirectory already checks !directory.exists() so calling it unconditionally is safe and more robust.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java`
around lines 119 - 127, The writePicture method depends on
picture.getPictureIndex() == 1 to create the images directory which is brittle;
instead, always ensure the directory exists before writing: call
createImagesDirectory(StaticLayoutContainers.getImagesDirectory())
unconditionally at the start of writePicture (or explicitly check File dir = new
File(StaticLayoutContainers.getImagesDirectory()) and create it if
!dir.exists()) so createImageFile won’t fail when indices are non-sequential;
update the writePicture method (referenced symbols: writePicture,
createImagesDirectory, StaticLayoutContainers.getImagesDirectory,
createImageFile, SemanticPicture.getPictureIndex) accordingly.


private void createImageFile(BoundingBox imageBox, String fileName, String imageFormat) {
private void createImageFile(BoundingBox imageBox, String fileName, String imageFormat, String pdfFilePath, String password) {
try {
File outputFile = new File(fileName);
BufferedImage targetImage = contrastRatioConsumer != null ? contrastRatioConsumer.getPageSubImage(imageBox) : null;
BufferedImage targetImage = extractPageImage(imageBox, pdfFilePath, password);
if (targetImage == null) {
return;
}
Expand All @@ -127,8 +139,271 @@ private void createImageFile(BoundingBox imageBox, String fileName, String image
}
}

/**
 * Returns the decoded page image that best matches the given bounding box.
 *
 * @param imageBox    target area; its page number selects the page to search
 * @param pdfFilePath path of the PDF to read images from
 * @param password    password for the PDF, may be {@code null} or empty
 * @return the matched image, or {@code null} when the box or its page number is
 *         {@code null}, the page index is out of range, or no unused page image matches
 * @throws IOException if the document cannot be loaded or the image cannot be decoded
 */
protected BufferedImage extractPageImage(BoundingBox imageBox, String pdfFilePath, String password) throws IOException {
    Integer boxPage = imageBox == null ? null : imageBox.getPageNumber();
    if (boxPage == null) {
        return null;
    }

    ensurePdfDocument(pdfFilePath, password);

    int pageIndex = boxPage;
    boolean outOfRange = pageIndex < 0 || pageIndex >= pdfBoxDocument.getNumberOfPages();
    if (outOfRange) {
        return null;
    }

    PageImageReference match = findBestPageImage(imageBox, pageIndex);
    if (match != null) {
        // Mark before decoding so the reference is not offered again to a later box.
        match.markUsed();
        return match.getImage();
    }
    return null;
}

/**
 * Picks the unused page image whose bounds overlap the target the most.
 * When two candidates have exactly the same overlap area, the one whose own area is
 * closest to the target's area wins.
 *
 * @param targetBoundingBox area to match against
 * @param pageIndex         index of the page whose images are considered
 * @return the best candidate, or {@code null} if no unused image overlaps the target
 * @throws IOException if the page's images cannot be collected
 */
private PageImageReference findBestPageImage(BoundingBox targetBoundingBox, int pageIndex) throws IOException {
    PageImageReference winner = null;
    double winnerOverlap = 0.0;
    double winnerDelta = Double.MAX_VALUE;

    for (PageImageReference candidate : getPageImages(pageIndex)) {
        if (candidate.isUsed()) {
            continue;
        }
        BoundingBox candidateBox = candidate.getBoundingBox();
        double overlap = getIntersectionArea(targetBoundingBox, candidateBox);
        if (overlap <= 0.0) {
            continue;
        }
        double delta = getAreaDeltaRatio(targetBoundingBox, candidateBox);

        boolean largerOverlap = overlap > winnerOverlap;
        boolean sameOverlapCloserSize = Double.compare(overlap, winnerOverlap) == 0 && delta < winnerDelta;
        if (winner == null || largerOverlap || sameOverlapCloserSize) {
            winner = candidate;
            winnerOverlap = overlap;
            winnerDelta = delta;
        }
    }
    return winner;
}

/**
 * Returns the image references of a page, collecting and caching them on first request.
 *
 * @param pageIndex index of the page in the loaded PDFBox document
 * @return the cached list of image references for that page
 * @throws IOException if processing the page content fails; nothing is cached in that case
 */
private List<PageImageReference> getPageImages(int pageIndex) throws IOException {
    List<PageImageReference> cached = pageImagesByNumber.get(pageIndex);
    if (cached == null) {
        List<PageImageReference> collected = new ArrayList<>();
        PDPage page = pdfBoxDocument.getPage(pageIndex);
        PageImageCollector collector = new PageImageCollector(page, pageIndex, collected);
        collector.processPage(page);
        pageImagesByNumber.put(pageIndex, collected);
        cached = collected;
    }
    return cached;
}

/**
 * Computes the area shared by two bounding boxes.
 *
 * @param first  one box
 * @param second the other box
 * @return the overlap area, or {@code 0.0} when the boxes do not overlap or only touch
 */
private double getIntersectionArea(BoundingBox first, BoundingBox second) {
    double overlapLeft = Math.max(first.getLeftX(), second.getLeftX());
    double overlapRight = Math.min(first.getRightX(), second.getRightX());
    double overlapBottom = Math.max(first.getBottomY(), second.getBottomY());
    double overlapTop = Math.min(first.getTopY(), second.getTopY());

    boolean disjoint = overlapRight <= overlapLeft || overlapTop <= overlapBottom;
    return disjoint ? 0.0 : (overlapRight - overlapLeft) * (overlapTop - overlapBottom);
}

/**
 * Measures how different two boxes are in size: the absolute area difference divided
 * by the larger of the two areas. The divisor is never smaller than 1.0.
 *
 * @param first  one box
 * @param second the other box
 * @return the relative area difference
 */
private double getAreaDeltaRatio(BoundingBox first, BoundingBox second) {
    double areaOfFirst = getBoundingBoxArea(first);
    double areaOfSecond = getBoundingBoxArea(second);
    double difference = Math.abs(areaOfFirst - areaOfSecond);
    // Floor the divisor at 1.0 so degenerate (zero-area) boxes cannot cause a division by zero.
    double divisor = Math.max(1.0, Math.max(areaOfFirst, areaOfSecond));
    return difference / divisor;
}

/**
 * Returns the area of a bounding box, treating a negative width or height as zero.
 *
 * @param boundingBox box to measure
 * @return the non-negative area
 */
private double getBoundingBoxArea(BoundingBox boundingBox) {
    double width = Math.max(0.0, boundingBox.getWidth());
    double height = Math.max(0.0, boundingBox.getHeight());
    return width * height;
}

/**
 * Makes sure the PDFBox document for the given path and password is loaded.
 * A document already loaded for the same path and password is reused; a document loaded
 * for anything else is closed (and its cached page images released) before reloading.
 *
 * @param pdfFilePath path of the PDF to load
 * @param password    password for the PDF; {@code null} or empty loads without a password
 * @throws IOException if closing the previous document or loading the new one fails
 */
private void ensurePdfDocument(String pdfFilePath, String password) throws IOException {
    boolean documentOpen = pdfBoxDocument != null;
    if (documentOpen) {
        boolean sameSource = Objects.equals(loadedPdfFilePath, pdfFilePath)
                && Objects.equals(loadedPassword, password);
        if (sameSource) {
            return;
        }
        releaseAllPageImages();
        closePdfDocument();
    }

    File source = new File(pdfFilePath);
    if (password == null || password.isEmpty()) {
        pdfBoxDocument = Loader.loadPDF(source);
    } else {
        pdfBoxDocument = Loader.loadPDF(source, password);
    }
    loadedPdfFilePath = pdfFilePath;
    loadedPassword = password;
}

/**
 * Drops the cached image references of one page and releases their decoded images.
 * Does nothing when the page has no cache entry.
 *
 * @param pageIndex index of the page whose cache entry is removed
 */
private void releasePageImages(int pageIndex) {
    List<PageImageReference> evicted = pageImagesByNumber.remove(pageIndex);
    if (evicted != null) {
        evicted.forEach(PageImageReference::release);
    }
}

/**
 * Releases the decoded images of every cached page and empties the cache.
 */
private void releaseAllPageImages() {
    pageImagesByNumber.values()
            .forEach(references -> references.forEach(PageImageReference::release));
    pageImagesByNumber.clear();
}

/**
 * Closes the loaded PDFBox document, if any, and forgets which file and password it
 * was loaded with. The bookkeeping fields are reset even when closing throws.
 *
 * @throws IOException if closing the document fails
 */
private void closePdfDocument() throws IOException {
    if (pdfBoxDocument != null) {
        try {
            pdfBoxDocument.close();
        } finally {
            // Reset in all cases so a failed close is never retried on a stale handle.
            loadedPassword = null;
            loadedPdfFilePath = null;
            pdfBoxDocument = null;
        }
    }
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

/**
 * Tells whether a file system entry exists at the given path.
 *
 * @param fileName path to check
 * @return {@code true} if something exists at that path
 */
public static boolean isImageFileExists(String fileName) {
    return new File(fileName).exists();
}

@Override
public void close() {
releaseAllPageImages();
if (pdfBoxDocument != null) {
try {
closePdfDocument();
} catch (IOException e) {
LOGGER.log(Level.WARNING, "Unable to close image extraction document: " + e.getMessage());
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
}
}

/**
 * A single image drawn on a page, together with the bounds it was drawn at.
 * The pixel data is decoded lazily and kept until {@link #release()} is called.
 */
private static final class PageImageReference {
    private final PDImage source;
    private final BoundingBox bounds;
    private BufferedImage decoded;
    private boolean consumed;

    private PageImageReference(PDImage image, BoundingBox boundingBox) {
        this.source = image;
        this.bounds = boundingBox;
    }

    /** Returns the bounds this image was drawn at. */
    private BoundingBox getBoundingBox() {
        return bounds;
    }

    /** Tells whether this image has already been handed out for export. */
    private boolean isUsed() {
        return consumed;
    }

    /** Flags this image as handed out so it is skipped by later matches. */
    private void markUsed() {
        consumed = true;
    }

    /**
     * Returns the decoded image, decoding it on the first call and caching the result.
     * Image XObjects are decoded via {@code getOpaqueImage()}, any other image via
     * {@code getImage()}.
     *
     * @throws IOException if decoding fails
     */
    private BufferedImage getImage() throws IOException {
        if (decoded != null) {
            return decoded;
        }
        decoded = source instanceof PDImageXObject
                ? ((PDImageXObject) source).getOpaqueImage()
                : source.getImage();
        return decoded;
    }

    /** Flushes and drops the cached decoded image, if there is one. */
    private void release() {
        if (decoded == null) {
            return;
        }
        decoded.flush();
        decoded = null;
    }
}

private static final class PageImageCollector extends PDFGraphicsStreamEngine {
private final int pageIndex;
private final List<PageImageReference> pageImages;
private Point2D currentPoint = new Point2D.Float();

/**
 * Creates a collector for one page.
 *
 * @param page       page handed to the graphics stream engine
 * @param pageIndex  page index stamped on every bounding box this collector creates
 * @param pageImages list that receives one reference per image drawn on the page
 */
private PageImageCollector(PDPage page, int pageIndex, List<PageImageReference> pageImages) {
super(page);
this.pageIndex = pageIndex;
this.pageImages = pageImages;
}

/**
 * Records an image drawn on the page together with the axis-aligned bounds of its
 * transformed unit square. The active clipping path is not applied to these bounds.
 *
 * @param pdImage image being drawn
 */
@Override
public void drawImage(PDImage pdImage) {
    // An image occupies the unit square in its own space; transform all four corners.
    Point2D[] corners = {
        transformedPoint(0, 0),
        transformedPoint(1, 0),
        transformedPoint(0, 1),
        transformedPoint(1, 1)
    };

    double minX = corners[0].getX();
    double maxX = minX;
    double minY = corners[0].getY();
    double maxY = minY;
    for (int i = 1; i < corners.length; i++) {
        double x = corners[i].getX();
        double y = corners[i].getY();
        minX = Math.min(minX, x);
        maxX = Math.max(maxX, x);
        minY = Math.min(minY, y);
        maxY = Math.max(maxY, y);
    }

    BoundingBox drawnBounds = new BoundingBox(pageIndex, minX, minY, maxX, maxY);
    pageImages.add(new PageImageReference(pdImage, drawnBounds));
}
Comment on lines +339 to +351
Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai Bot Apr 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

fd -t f -name "ImagesUtils.java"

Repository: opendataloader-project/opendataloader-pdf

Length of output: 315


🏁 Script executed:

fd "ImagesUtils.java"

Repository: opendataloader-project/opendataloader-pdf

Length of output: 173


🏁 Script executed:

wc -l java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 177


🏁 Script executed:

sed -n '330,380p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | cat -n

Repository: opendataloader-project/opendataloader-pdf

Length of output: 2497


🏁 Script executed:

sed -n '1,100p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | cat -n

Repository: opendataloader-project/opendataloader-pdf

Length of output: 5261


🏁 Script executed:

sed -n '300,370p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | cat -n

Repository: opendataloader-project/opendataloader-pdf

Length of output: 3141


🏁 Script executed:

sed -n '330,409p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | cat -n

Repository: opendataloader-project/opendataloader-pdf

Length of output: 3229


🏁 Script executed:

rg "getClippingPath\|getGraphicsState\|clipping" java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 67


🏁 Script executed:

rg "PDFGraphicsStreamEngine" --type java -A 20 | head -50

Repository: opendataloader-project/opendataloader-pdf

Length of output: 6165


🏁 Script executed:

web_search
PDFGraphicsStreamEngine clipping state API

Repository: opendataloader-project/opendataloader-pdf

Length of output: 197


🌐 Web query:

PDFGraphicsStreamEngine clipping path getClippingPath method PDFBox

💡 Result:

No, the PDFGraphicsStreamEngine class in Apache PDFBox does not have a direct getClippingPath method. PDFGraphicsStreamEngine is an abstract subclass of PDFStreamEngine designed for advanced graphics processing, where subclasses must implement abstract methods like clip(int windingRule), endPath, etc., to handle path construction and clipping operations. To access the current clipping path: - Call getGraphicsState (inherited from PDFStreamEngine), which returns a PDGraphicsState object. - On PDGraphicsState, call getCurrentClippingPath, which returns an java.awt.geom.Area representing the current clipping path. Note: Do not modify the returned Area directly. This is confirmed across multiple official PDFBox API documentations (versions 2.0.7 through 3.0-alpha3) and source code analysis. Subclasses like CustomGraphicsStreamEngine (an example in PDFBox) do not override or add a getClippingPath method but inherit access via getGraphicsState.

Citations:


🏁 Script executed:

rg "pageImages" java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | head -20

Repository: opendataloader-project/opendataloader-pdf

Length of output: 1189


🏁 Script executed:

sed -n '100,200p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | grep -A 15 "getPageImages"

Repository: opendataloader-project/opendataloader-pdf

Length of output: 1486


🏁 Script executed:

sed -n '100,250p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | cat -n

Repository: opendataloader-project/opendataloader-pdf

Length of output: 7659


🏁 Script executed:

sed -n '260,330p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java | cat -n

Repository: opendataloader-project/opendataloader-pdf

Length of output: 2518


Account for PDF clipping when caching image bounds.

The drawImage() method calculates full transformed bounds but ignores any clipping state. The clip() and appendRectangle() methods that should track clipping paths are empty no-ops. When PDFs contain clipped or masked images, the stored bounds will be larger than the visible region, causing findBestPageImage() to match incorrectly and potentially export pixels outside the clipped area. Use getGraphicsState().getCurrentClippingPath() to intersect the transformed bounds with the active clipping region before storing the BoundingBox.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java`
around lines 343 - 355, drawImage currently stores full transformed image bounds
without applying the page clipping path, and clip()/appendRectangle() are
no-ops; update clip() and appendRectangle() to track/maintain the current
clipping path used by getGraphicsState().getCurrentClippingPath(), then in
drawImage(PDImage pdImage) obtain that clipping shape, transform it into the
same coordinate space as transformedPoint(...) and intersect it with the
computed left/right/top/bottom box (or skip storing if the intersection is
empty); finally create the PageImageReference with the intersected BoundingBox
(falling back to the original transformed bounds only if no clipping path is
present) so findBestPageImage() matches the visible, clipped image region.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR is scoped to the native crash during image extraction. The current change replaces the old sub-image path with direct PDFBox image-object extraction and keeps the matching logic minimal for that bugfix.

Clipping-aware bounds would change the image-matching semantics and needs representative clipped-image samples to validate behavior across PDFs, so I am leaving that out of this PR. If maintainers want it, I can follow up separately with targeted samples and tests.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@the-waste-land, that's a fair and well-reasoned scope decision. The native-crash fix stands on its own and the clipping-aware matching is a separate concern that warrants dedicated samples and validation.

Would you like me to open a follow-up GitHub issue to track the clipping-aware bounds improvement so it isn't lost? I can include a backlink to this PR and the relevant context.


✏️ Learnings added
Learnt from: the-waste-land
Repo: opendataloader-project/opendataloader-pdf PR: 409
File: java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java:339-351
Timestamp: 2026-04-11T14:50:32.262Z
Learning: In `ImagesUtils.java` (Java, opendataloader-pdf), the `drawImage()` method in `PageImageCollector` intentionally stores full transformed bounds without applying the clipping path. The PR author (the-waste-land) explicitly deferred clipping-aware bound intersection to a follow-up, as it requires representative clipped-image PDF samples and targeted tests. Do not flag this as a bug in PRs that are scoped only to the native-crash fix (direct PDFBox image-object extraction path).

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.


🧠 Learnings used
Learnt from: hamid17amu
Repo: opendataloader-project/opendataloader-pdf PR: 0
File: :0-0
Timestamp: 2026-04-11T10:19:08.964Z
Learning: In `DocumentProcessor.java` (Java, opendataloader-pdf), `StaticLayoutContainers.closeContrastRatioConsumer()` does not expose verification APIs (e.g., `getContrastRatioConsumer()` or `hasContrastRatioConsumer()`). Verifying the closed state of the consumer would require changes to `StaticLayoutContainers` internals, which is out of scope for resource-cleanup-only changes.


/** No-op: rectangles appended to the path are not tracked by this collector. */
@Override
public void appendRectangle(Point2D point0, Point2D point1, Point2D point2, Point2D point3) {
}

/** No-op: clipping requests are ignored, so no clipping state is tracked here. */
@Override
public void clip(int windingRule) {
}
Comment on lines +338 to +359
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check if there are any test PDFs or documentation about clipping support
rg -i "clip" --type java -C3 java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/

Repository: opendataloader-project/opendataloader-pdf

Length of output: 809


🏁 Script executed:

wc -l java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 177


🏁 Script executed:

cat -n java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 18564


🏁 Script executed:

rg "ForkJoinPool" java/opendataloader-pdf-core/

Repository: opendataloader-project/opendataloader-pdf

Length of output: 385


🏁 Script executed:

sed -n '1,100p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 4578


🏁 Script executed:

sed -n '100,250p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 7885


🏁 Script executed:

sed -n '250,350p' java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 5030


🏁 Script executed:

rg -A5 -B5 "new PageImageCollector" java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java

Repository: opendataloader-project/opendataloader-pdf

Length of output: 516


Clipping path not applied to stored image bounds.

The drawImage() method stores the full transformed bounding box without intersecting it with the active clipping path. The clip() and appendRectangle() methods are no-ops, so clipping state is never tracked. For PDFs with clipped or masked images, the stored bounds will be larger than the visible region, which may cause findBestPageImage() to select incorrect candidates when multiple images overlap the target area.

If this is an acceptable limitation for the current use case (e.g., scanned PDFs typically don't use clipping on images), consider adding a code comment documenting this constraint.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/ImagesUtils.java`
around lines 338 - 359, drawImage currently records the full transformed image
bbox via PageImageReference and BoundingBox without considering any clipping;
implement tracking of the current clipping region (update state in
appendRectangle and clip) and intersect that clip with the transformed bbox
before adding to pageImages so stored bounds reflect visible area (use methods
around drawImage, appendRectangle, clip, pageImages, PageImageReference,
BoundingBox and pageIndex to locate code); if you decide not to support
clipping, add a clear TODO/comment in drawImage explaining that
appendRectangle/clip are no-ops and that stored bounds ignore clipping so
callers must handle clipped images.


/** Remembers (x, y) as the current point; no path geometry is stored. */
@Override
public void moveTo(float x, float y) {
currentPoint = new Point2D.Float(x, y);
}

/** Remembers the line's end point (x, y) as the current point; the line itself is not stored. */
@Override
public void lineTo(float x, float y) {
currentPoint = new Point2D.Float(x, y);
}

/** Remembers the curve's end point (x3, y3) as the current point; the other arguments are ignored. */
@Override
public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
currentPoint = new Point2D.Float(x3, y3);
}

/** Returns the point last recorded by moveTo, lineTo or curveTo (initially a default Point2D.Float). */
@Override
public Point2D getCurrentPoint() {
return currentPoint;
}

/** No-op: this collector keeps no path to close. */
@Override
public void closePath() {
}

/** No-op: this collector keeps no path to end. */
@Override
public void endPath() {
}

/** No-op: path stroking is ignored by this collector. */
@Override
public void strokePath() {
}

/** No-op: path filling is ignored by this collector. */
@Override
public void fillPath(int windingRule) {
}

/** No-op: combined fill-and-stroke painting is ignored by this collector. */
@Override
public void fillAndStrokePath(int windingRule) {
}

/** No-op: shading fills are ignored by this collector. */
@Override
public void shadingFill(COSName shadingName) {
}
}
}
Loading