From 26a6e388cfd99feb88998708efca027048caf048 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 09:07:45 +0000 Subject: [PATCH 01/11] Expand openpdf-core operator coverage in OpenPdfCorePageRenderer Second pass at using openpdf-core as the rendering engine in openpdf-renderer. Extends the Java2D rasterizer driven by PdfContentParser with the operators most commonly missing on real-world PDFs: - CMYK colors (k, K) and color-space-aware fills/strokes (cs, CS, sc, SC, scn, SCN) for DeviceGray / DeviceRGB / DeviceCMYK. - Clipping (W, W*) with proper save/restore through q/Q. - Line styling (J, j, M, d) plumbed into the BasicStroke; the flatness operator (i) is accepted as a no-op because Java2D handles flattening internally. - Extended graphics state (gs) honoring CA/ca alpha and LW/ML/LC/LJ. - Text rise (Ts). - Marked content / compatibility operators (BMC, BDC, EMC, MP, DP, BX, EX) parsed as no-ops so content inside them still renders. Adds two new conveniences on OpenPdfCoreRenderer: - renderPage(int, Graphics2D, int, int) draws directly onto a caller-supplied Graphics2D without allocating a BufferedImage, and saves/restores the caller's transform and clip. - renderAllPages(float) returns one BufferedImage per page. Adds OpenPdfCorePageRendererOperatorsTest that builds synthetic PDFs with PdfContentByte and renders them back to verify CMYK fills, dashed strokes, clipping, marked content and text rise all drive the renderer end-to-end. README updated to reflect the broader operator table. 
--- openpdf-renderer/README.md | 59 ++- .../core/OpenPdfCorePageRenderer.java | 335 +++++++++++++++++- .../renderer/core/OpenPdfCoreRenderer.java | 75 ++++ .../OpenPdfCorePageRendererOperatorsTest.java | 230 ++++++++++++ .../core/OpenPdfCoreRendererTest.java | 70 ++++ 5 files changed, 742 insertions(+), 27 deletions(-) create mode 100644 openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index c79434e07..94acee4b3 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -95,17 +95,26 @@ in-tree legacy parser (`PDFFile`, `PDFPage`, `PDFParser`, | Content-stream operator listing (`getContentOperators`) | `openpdf-core` (`PdfContentParser`) | | Page rasterization (`renderPage`) | `openpdf-core` (`PdfContentParser`) → Java2D via `OpenPdfCorePageRenderer` | -The Java2D rasterizer (`OpenPdfCorePageRenderer`) supports the standard subset -of PDF operators needed for typical text + simple-vector PDFs: graphics state -(`q`/`Q`/`cm`), path construction (`m`/`l`/`c`/`v`/`y`/`re`/`h`), path -painting (`S`/`s`/`f`/`f*`/`B`/`B*`/`b`/`b*`/`n`), line width (`w`), -DeviceGray/DeviceRGB colors (`g`/`G`/`rg`/`RG`), and the full text-object -machinery (`BT`/`ET`/`Tf`/`Tc`/`Tw`/`TL`/`Tz`/`Td`/`TD`/`Tm`/`T*`/`Tj`/`TJ`/`'`/`"`). -Operators outside this subset (extended graphics state `gs`, CMYK / pattern / -shading colors, XObject `Do`, inline images, marked content, clipping -`W`/`W*`, ...) are parsed but currently ignored — pages that rely heavily on -them may render with missing content. Adding more operators is a localized -change in `OpenPdfCorePageRenderer`. 
+The Java2D rasterizer (`OpenPdfCorePageRenderer`) supports a broad subset of +PDF content-stream operators — sufficient for typical text + vector PDFs: + +| Category | Operators | +|---|---| +| Graphics state | `q`, `Q`, `cm`, `gs` (alpha `CA`/`ca`, line styling) | +| Line style | `w`, `J`, `j`, `M`, `d`, `i` | +| Path construction | `m`, `l`, `c`, `v`, `y`, `re`, `h` | +| Path painting | `S`, `s`, `f`, `F`, `f*`, `B`, `B*`, `b`, `b*`, `n` | +| Clipping | `W`, `W*` | +| Colors (DeviceGray / DeviceRGB / DeviceCMYK) | `g`, `G`, `rg`, `RG`, `k`, `K`, `cs`, `CS`, `sc`, `SC`, `scn`, `SCN` | +| Text state | `BT`, `ET`, `Tf`, `Tc`, `Tw`, `TL`, `Tz`, `Td`, `TD`, `Tm`, `T*`, `Ts` | +| Text showing | `Tj`, `TJ`, `'`, `"` | +| Marked content / compatibility (no-op) | `BMC`, `BDC`, `EMC`, `MP`, `DP`, `BX`, `EX` | + +Operators outside this subset (XObject `Do` for forms and images, inline +images `BI`/`ID`/`EI`, shading `sh`, pattern / shading colors, type 3 font +glyph operators) are parsed but currently ignored — pages that rely +heavily on them may render with missing content. Adding more operators is a +localized change in `OpenPdfCorePageRenderer`. 
For pages that exercise features outside the supported subset and need pixel-perfect output today, the deprecated `PDFFile` / `PDFPage.getImage(...)` @@ -155,6 +164,34 @@ try (OpenPdfCoreRenderer renderer = new OpenPdfCoreRenderer(new File("document.p } ``` +### Rendering directly to a `Graphics2D` + +Avoid the intermediate `BufferedImage` when the caller already has a target +surface (Swing component, printer, SVG-backed graphics, ...): + +```java +try (OpenPdfCoreRenderer renderer = new OpenPdfCoreRenderer(new File("document.pdf"))) { + BufferedImage out = new BufferedImage(800, 1000, BufferedImage.TYPE_INT_ARGB); + Graphics2D g2 = out.createGraphics(); + try { + renderer.renderPage(1, g2, 800, 1000); // fit page to the box, preserve aspect + } finally { + g2.dispose(); + } +} +``` + +### Batch rendering + +```java +try (OpenPdfCoreRenderer renderer = new OpenPdfCoreRenderer(new File("document.pdf"))) { + List pages = renderer.renderAllPages(150f); + for (int i = 0; i < pages.size(); i++) { + ImageIO.write(pages.get(i), "png", new File("page-" + (i + 1) + ".png")); + } +} +``` + ## Using the legacy `PDFFile` / `PDFPage` API (deprecated) The pre-3.0.5 entry point still works but is now `@Deprecated`. 
New code should diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index 2f6553e10..9a243d8ee 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -14,6 +14,7 @@ import java.awt.Font; import java.awt.Graphics2D; import java.awt.RenderingHints; +import java.awt.Shape; import java.awt.geom.AffineTransform; import java.awt.geom.Path2D; import java.io.IOException; @@ -21,6 +22,7 @@ import java.util.ArrayList; import java.util.Deque; import java.util.HashMap; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.logging.Level; @@ -49,24 +51,32 @@ * *

Supported operator subset

* * - *

Other operators (extended graphics state {@code gs}, CMYK colors, - * {@code cs}/{@code CS}/{@code scn}/{@code SCN}, clipping {@code W}/{@code W*}, - * XObject {@code Do}, inline images, shading patterns, marked content, ...) - * are silently ignored. Pages that rely heavily on those features may render - * with missing content; this renderer is intentionally a focused subset that - * handles typical text + simple-vector PDFs correctly.

+ *

Operators outside this subset (XObject {@code Do} for forms and images, + * inline images {@code BI}/{@code ID}/{@code EI}, shading {@code sh}, + * pattern / shading colors, type 3 font glyph operators) are silently ignored. + * Pages that rely heavily on those features may render with missing content; + * this renderer is intentionally a focused subset that handles typical text + + * simple-vector PDFs correctly.

* *

Coordinates

*

The PDF user space has its origin at the bottom-left and Y growing up; @@ -82,12 +92,20 @@ final class OpenPdfCorePageRenderer { /** Default user-space resolution of a PDF, in DPI. */ private static final float PDF_USER_SPACE_DPI = 72f; + // ExtGState dictionary keys not pre-defined as PdfName constants in openpdf-core. + private static final PdfName EXTGS_LW = new PdfName("LW"); + private static final PdfName EXTGS_ML = new PdfName("ML"); + private static final PdfName EXTGS_LC = new PdfName("LC"); + private static final PdfName EXTGS_LJ = new PdfName("LJ"); + private final Graphics2D g2; private final PdfDictionary resources; private final Map fontCache = new HashMap<>(); private final Deque stateStack = new ArrayDeque<>(); private final Deque ctmStack = new ArrayDeque<>(); + // LinkedList because the PDF clip may be null (no clip), which ArrayDeque rejects. + private final Deque clipStack = new LinkedList<>(); private GState state; private Path2D.Float currentPath = new Path2D.Float(); @@ -205,12 +223,14 @@ private void dispatch(String op, List operands) throws IOException { case "q": stateStack.push(state); ctmStack.push(g2.getTransform()); + clipStack.push(g2.getClip()); state = new GState(state); break; case "Q": if (!stateStack.isEmpty()) { state = stateStack.pop(); g2.setTransform(ctmStack.pop()); + g2.setClip(clipStack.pop()); } break; case "cm": @@ -218,23 +238,79 @@ private void dispatch(String op, List operands) throws IOException { num(operands, 3), num(operands, 4), num(operands, 5)); break; - // --- Line width --- + // --- Line style --- case "w": state.lineWidth = num(operands, 0); break; + case "J": + state.lineCap = (int) num(operands, 0); + break; + case "j": + state.lineJoin = (int) num(operands, 0); + break; + case "M": + state.miterLimit = Math.max(num(operands, 0), 1f); + break; + case "d": + applyDashPattern((PdfArray) operands.get(0), num(operands, 1)); + break; + case "i": + // Flatness tolerance: Java2D handles flattening internally. 
+ break; + + // --- Extended graphics state --- + case "gs": + if (operands.get(0) instanceof PdfName extName) { + applyExtGState(extName.toString().substring(1)); + } + break; // --- Colors --- case "g": - state.fillColor = gray(num(operands, 0)); + state.fillColorSpace = ColorSpaceKind.GRAY; + state.fillColor = applyAlpha(gray(num(operands, 0)), state.fillAlpha); break; case "G": - state.strokeColor = gray(num(operands, 0)); + state.strokeColorSpace = ColorSpaceKind.GRAY; + state.strokeColor = applyAlpha(gray(num(operands, 0)), state.strokeAlpha); break; case "rg": - state.fillColor = rgb(num(operands, 0), num(operands, 1), num(operands, 2)); + state.fillColorSpace = ColorSpaceKind.RGB; + state.fillColor = applyAlpha( + rgb(num(operands, 0), num(operands, 1), num(operands, 2)), state.fillAlpha); break; case "RG": - state.strokeColor = rgb(num(operands, 0), num(operands, 1), num(operands, 2)); + state.strokeColorSpace = ColorSpaceKind.RGB; + state.strokeColor = applyAlpha( + rgb(num(operands, 0), num(operands, 1), num(operands, 2)), state.strokeAlpha); + break; + case "k": + state.fillColorSpace = ColorSpaceKind.CMYK; + state.fillColor = applyAlpha( + cmyk(num(operands, 0), num(operands, 1), num(operands, 2), num(operands, 3)), + state.fillAlpha); + break; + case "K": + state.strokeColorSpace = ColorSpaceKind.CMYK; + state.strokeColor = applyAlpha( + cmyk(num(operands, 0), num(operands, 1), num(operands, 2), num(operands, 3)), + state.strokeAlpha); + break; + case "cs": + state.fillColorSpace = colorSpaceFromName(operands.get(0)); + state.fillColor = applyAlpha(defaultColorFor(state.fillColorSpace), state.fillAlpha); + break; + case "CS": + state.strokeColorSpace = colorSpaceFromName(operands.get(0)); + state.strokeColor = applyAlpha(defaultColorFor(state.strokeColorSpace), state.strokeAlpha); + break; + case "sc": + case "scn": + state.fillColor = applyAlpha(colorFromOperands(state.fillColorSpace, operands), state.fillAlpha); + break; + case "SC": + case "SCN": + 
state.strokeColor = applyAlpha(colorFromOperands(state.strokeColorSpace, operands), state.strokeAlpha); break; // --- Path construction --- @@ -338,6 +414,16 @@ private void dispatch(String op, List operands) throws IOException { resetPath(); break; + // --- Clipping --- + case "W": + state.pendingClipRule = Path2D.WIND_NON_ZERO; + state.hasPendingClip = true; + break; + case "W*": + state.pendingClipRule = Path2D.WIND_EVEN_ODD; + state.hasPendingClip = true; + break; + // --- Text state --- case "BT": inTextObject = true; @@ -370,6 +456,9 @@ private void dispatch(String op, List operands) throws IOException { case "Tz": state.horizontalScaling = num(operands, 0) / 100f; break; + case "Ts": + state.textRise = num(operands, 0); + break; case "Td": textMoveTo(num(operands, 0), num(operands, 1)); break; @@ -407,6 +496,16 @@ private void dispatch(String op, List operands) throws IOException { showText(decodeString((PdfString) operands.get(2))); break; + // --- Marked content / compatibility (parsed, no rendering effect) --- + case "BMC": + case "BDC": + case "EMC": + case "MP": + case "DP": + case "BX": + case "EX": + break; + default: // Unsupported operator: ignore quietly so partial pages still render. 
break; @@ -417,7 +516,10 @@ private void dispatch(String op, List operands) throws IOException { private void strokePath() { g2.setColor(state.strokeColor); - g2.setStroke(new BasicStroke(Math.max(state.lineWidth, 0.001f))); + g2.setStroke(new BasicStroke( + Math.max(state.lineWidth, 0.001f), + state.lineCap, state.lineJoin, state.miterLimit, + state.dashPattern, state.dashPhase)); g2.draw(currentPath); } @@ -429,9 +531,93 @@ private void fillPath(int windingRule) { } private void resetPath() { + if (state.hasPendingClip) { + Path2D.Float clip = (Path2D.Float) currentPath.clone(); + clip.setWindingRule(state.pendingClipRule); + Shape existing = g2.getClip(); + if (existing == null) { + g2.setClip(clip); + } else { + g2.clip(clip); + } + state.hasPendingClip = false; + } currentPath = new Path2D.Float(); } + private void applyDashPattern(PdfArray array, float phase) { + if (array == null || array.size() == 0) { + state.dashPattern = null; + state.dashPhase = 0f; + return; + } + float[] dash = new float[array.size()]; + boolean allZero = true; + for (int i = 0; i < array.size(); i++) { + PdfObject e = array.getPdfObject(i); + dash[i] = e instanceof PdfNumber n ? Math.max(n.floatValue(), 0f) : 0f; + if (dash[i] > 0f) { + allZero = false; + } + } + if (allZero) { + state.dashPattern = null; + state.dashPhase = 0f; + } else { + state.dashPattern = dash; + state.dashPhase = phase; + } + } + + private void applyExtGState(String name) { + if (resources == null) { + return; + } + PdfDictionary gsResources = resources.getAsDict(PdfName.EXTGSTATE); + if (gsResources == null) { + return; + } + PdfObject obj = gsResources.get(new PdfName(name)); + PdfDictionary dict; + if (obj instanceof PdfDictionary d) { + dict = d; + } else if (obj instanceof PRIndirectReference ref) { + PdfObject resolved = PdfReader.getPdfObject(ref); + dict = resolved instanceof PdfDictionary ? 
(PdfDictionary) resolved : null; + } else { + dict = null; + } + if (dict == null) { + return; + } + PdfNumber ca = dict.getAsNumber(PdfName.ca); + if (ca != null) { + state.fillAlpha = clamp01(ca.floatValue()); + state.fillColor = applyAlpha(state.fillColor, state.fillAlpha); + } + PdfNumber upperCA = dict.getAsNumber(PdfName.CA); + if (upperCA != null) { + state.strokeAlpha = clamp01(upperCA.floatValue()); + state.strokeColor = applyAlpha(state.strokeColor, state.strokeAlpha); + } + PdfNumber lw = dict.getAsNumber(EXTGS_LW); + if (lw != null) { + state.lineWidth = lw.floatValue(); + } + PdfNumber ml = dict.getAsNumber(EXTGS_ML); + if (ml != null) { + state.miterLimit = Math.max(ml.floatValue(), 1f); + } + PdfNumber lc = dict.getAsNumber(EXTGS_LC); + if (lc != null) { + state.lineCap = lc.intValue(); + } + PdfNumber lj = dict.getAsNumber(EXTGS_LJ); + if (lj != null) { + state.lineJoin = lj.intValue(); + } + } + private void concatCtm(float a, float b, float c, float d, float e, float f) { AffineTransform m = new AffineTransform(a, b, c, d, e, f); g2.transform(m); @@ -463,6 +649,9 @@ private void showText(String text) { // because Graphics2D's font baseline is drawn in image-Y orientation. g2.transform(textMatrix); g2.scale(state.horizontalScaling, 1.0); + if (state.textRise != 0f) { + g2.translate(0, state.textRise); + } g2.scale(1, -1); g2.drawString(text, 0f, 0f); } finally { @@ -587,6 +776,92 @@ private static Color rgb(float r, float g, float b) { return new Color(clamp01(r), clamp01(g), clamp01(b)); } + /** Naive CMYK to sRGB approximation: r = (1-c)(1-k), g = (1-m)(1-k), b = (1-y)(1-k). 
*/ + private static Color cmyk(float c, float m, float y, float k) { + float cc = clamp01(c); + float mm = clamp01(m); + float yy = clamp01(y); + float kk = clamp01(k); + float r = (1f - cc) * (1f - kk); + float gg = (1f - mm) * (1f - kk); + float b = (1f - yy) * (1f - kk); + return new Color(r, gg, b); + } + + private static Color applyAlpha(Color color, float alpha) { + int a = Math.round(clamp01(alpha) * 255f); + if (a == color.getAlpha()) { + return color; + } + return new Color(color.getRed(), color.getGreen(), color.getBlue(), a); + } + + private static ColorSpaceKind colorSpaceFromName(PdfObject operand) { + if (!(operand instanceof PdfName name)) { + return ColorSpaceKind.UNKNOWN; + } + String n = name.toString(); + switch (n) { + case "/DeviceGray": + case "/G": + case "/CalGray": + return ColorSpaceKind.GRAY; + case "/DeviceRGB": + case "/RGB": + case "/CalRGB": + return ColorSpaceKind.RGB; + case "/DeviceCMYK": + case "/CMYK": + return ColorSpaceKind.CMYK; + default: + return ColorSpaceKind.UNKNOWN; + } + } + + private static Color defaultColorFor(ColorSpaceKind kind) { + return kind == ColorSpaceKind.CMYK ? cmyk(0, 0, 0, 1f) : Color.BLACK; + } + + /** Picks numeric operands matching the active color space; non-numeric operands (e.g. pattern names) yield default. */ + private static Color colorFromOperands(ColorSpaceKind kind, List operands) { + int numericCount = 0; + for (int i = 0; i < operands.size() - 1; i++) { + if (operands.get(i) instanceof PdfNumber) { + numericCount++; + } + } + switch (kind) { + case GRAY: + if (numericCount >= 1) { + return gray(num(operands, 0)); + } + break; + case RGB: + if (numericCount >= 3) { + return rgb(num(operands, 0), num(operands, 1), num(operands, 2)); + } + break; + case CMYK: + if (numericCount >= 4) { + return cmyk(num(operands, 0), num(operands, 1), num(operands, 2), num(operands, 3)); + } + break; + default: + // Fall through: infer from operand count when color space is unknown / unsupported. 
+ if (numericCount >= 4) { + return cmyk(num(operands, 0), num(operands, 1), num(operands, 2), num(operands, 3)); + } + if (numericCount == 3) { + return rgb(num(operands, 0), num(operands, 1), num(operands, 2)); + } + if (numericCount == 1) { + return gray(num(operands, 0)); + } + break; + } + return defaultColorFor(kind); + } + private static float clamp01(float v) { if (v < 0f) { return 0f; @@ -597,11 +872,26 @@ private static float clamp01(float v) { return v; } + private enum ColorSpaceKind { GRAY, RGB, CMYK, UNKNOWN } + /** Mutable per-graphics-state snapshot. Not thread-safe. */ private static final class GState { Color fillColor = Color.BLACK; Color strokeColor = Color.BLACK; + ColorSpaceKind fillColorSpace = ColorSpaceKind.GRAY; + ColorSpaceKind strokeColorSpace = ColorSpaceKind.GRAY; + float fillAlpha = 1.0f; + float strokeAlpha = 1.0f; + float lineWidth = 1.0f; + int lineCap = BasicStroke.CAP_BUTT; + int lineJoin = BasicStroke.JOIN_MITER; + float miterLimit = 10.0f; + float[] dashPattern; + float dashPhase; + + boolean hasPendingClip; + int pendingClipRule = Path2D.WIND_NON_ZERO; CMapAwareDocumentFont font; float fontSize; @@ -609,6 +899,7 @@ private static final class GState { float wordSpacing; float leading; float horizontalScaling = 1.0f; + float textRise; GState() { } @@ -616,13 +907,25 @@ private static final class GState { GState(GState other) { this.fillColor = other.fillColor; this.strokeColor = other.strokeColor; + this.fillColorSpace = other.fillColorSpace; + this.strokeColorSpace = other.strokeColorSpace; + this.fillAlpha = other.fillAlpha; + this.strokeAlpha = other.strokeAlpha; this.lineWidth = other.lineWidth; + this.lineCap = other.lineCap; + this.lineJoin = other.lineJoin; + this.miterLimit = other.miterLimit; + this.dashPattern = other.dashPattern == null ? 
null : other.dashPattern.clone(); + this.dashPhase = other.dashPhase; this.font = other.font; this.fontSize = other.fontSize; this.charSpacing = other.charSpacing; this.wordSpacing = other.wordSpacing; this.leading = other.leading; this.horizontalScaling = other.horizontalScaling; + this.textRise = other.textRise; + // hasPendingClip / pendingClipRule are intentionally not copied: + // the W / W* operators apply to the current path before any q/Q boundary. } } } diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java index 4dd6ccb88..3c8c702d1 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java @@ -280,6 +280,81 @@ public BufferedImage renderPage(int pageNumber, float dpi) throws IOException { return out; } + /** + * Renders the requested page directly onto the supplied {@link Graphics2D}, + * scaled to fit a target box of {@code targetWidth x targetHeight} pixels. + * + *

Unlike {@link #renderPage(int, float)} this does not allocate + * a {@link BufferedImage} — callers in charge of their own target + * surface (e.g. a Swing component's paint method, an SVG-backed graphics, + * a printer graphics) can use this overload to avoid the intermediate + * raster.

+ * + *

The DPI is derived from the target size and the page size, picking + * the smaller of the horizontal / vertical scales so the page fits without + * distortion. The {@link Graphics2D} is left in the state it was in on + * entry (current transform and clip are saved and restored).

+ * + * @param pageNumber 1-based page number + * @param g2 the destination graphics + * @param targetWidth target width in pixels + * @param targetHeight target height in pixels + * @throws IOException if reading the page fails + * @throws IllegalArgumentException if {@code targetWidth} or {@code targetHeight} + * is non-positive, or {@code pageNumber} is out of range + * @throws IllegalStateException if this renderer has been closed + * @since 3.0.5 + */ + public void renderPage(int pageNumber, Graphics2D g2, int targetWidth, int targetHeight) + throws IOException { + ensureOpen(); + Objects.requireNonNull(g2, "g2"); + if (targetWidth <= 0 || targetHeight <= 0) { + throw new IllegalArgumentException( + "target size must be > 0, was " + targetWidth + "x" + targetHeight); + } + int numPages = getNumPages(); + if (pageNumber < 1 || pageNumber > numPages) { + throw new IllegalArgumentException( + "pageNumber " + pageNumber + " out of range [1, " + numPages + "]"); + } + Rectangle2D size = getPageSize(pageNumber); + float scaleX = targetWidth / (float) size.getWidth(); + float scaleY = targetHeight / (float) size.getHeight(); + float scale = Math.min(scaleX, scaleY); + float dpi = scale * PDF_USER_SPACE_DPI; + + java.awt.geom.AffineTransform savedTx = g2.getTransform(); + java.awt.Shape savedClip = g2.getClip(); + try { + OpenPdfCorePageRenderer.render(reader, pageNumber, g2, targetWidth, targetHeight, dpi); + } finally { + g2.setTransform(savedTx); + g2.setClip(savedClip); + } + } + + /** + * Renders all pages of the document to a list of {@link BufferedImage}s at + * the given DPI. Convenience for batch use cases; for large documents, + * prefer streaming page-by-page via {@link #renderPage(int, float)} to + * avoid holding every page rasterization in memory at once. 
+ * + * @param dpi target resolution in dots per inch + * @return one image per page, in document order + * @throws IOException if any page fails to render + * @since 3.0.5 + */ + public List renderAllPages(float dpi) throws IOException { + ensureOpen(); + int pages = getNumPages(); + List images = new ArrayList<>(pages); + for (int i = 1; i <= pages; i++) { + images.add(renderPage(i, dpi)); + } + return images; + } + /** * @return the underlying {@link PdfReader}, for advanced callers that need * direct access to {@code openpdf-core} parsing APIs diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java new file mode 100644 index 000000000..49a503a1b --- /dev/null +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java @@ -0,0 +1,230 @@ +/* + * Copyright 2026 the OpenPDF contributors. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ */ +package org.openpdf.renderer.core; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.awt.Color; +import java.awt.image.BufferedImage; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.util.List; + +import javax.imageio.ImageIO; + +import org.junit.jupiter.api.Test; +import org.openpdf.text.Document; +import org.openpdf.text.PageSize; +import org.openpdf.text.Rectangle; +import org.openpdf.text.pdf.PdfContentByte; +import org.openpdf.text.pdf.PdfWriter; + +/** + * Exercises operators added in the second integration pass — CMYK colors, + * clipping ({@code W} / {@code W*}), line styling ({@code J}, {@code j}, + * {@code M}, {@code d}), text rise ({@code Ts}), and marked content + * ({@code BMC} / {@code BDC} / {@code EMC}) — by writing a tiny PDF with + * {@code openpdf-core}'s {@link PdfContentByte} and rendering it back with the + * {@code openpdf-core}-driven Java2D renderer. + * + *

These tests don't assert pixel-perfect output: they assert that the + * renderer drives the operators without throwing, and that the resulting image + * actually contains the colored marks (i.e. operators were honored end-to-end).

+ */ +class OpenPdfCorePageRendererOperatorsTest { + + private static byte[] buildPdf(ContentWriter writer) throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (Document doc = new Document(new Rectangle(PageSize.A6))) { + PdfWriter pdf = PdfWriter.getInstance(doc, baos); + doc.open(); + PdfContentByte cb = pdf.getDirectContent(); + writer.write(cb); + } + return baos.toByteArray(); + } + + @FunctionalInterface + interface ContentWriter { + void write(PdfContentByte cb) throws Exception; + } + + private static int countPixelsMatching(BufferedImage img, ColorPredicate p) { + int count = 0; + for (int y = 0; y < img.getHeight(); y += 2) { + for (int x = 0; x < img.getWidth(); x += 2) { + int argb = img.getRGB(x, y); + int r = (argb >> 16) & 0xFF; + int g = (argb >> 8) & 0xFF; + int b = argb & 0xFF; + if (p.matches(r, g, b)) { + count++; + } + } + } + return count; + } + + @FunctionalInterface + interface ColorPredicate { + boolean matches(int r, int g, int b); + } + + private static void saveForInspection(BufferedImage img, String name) throws IOException { + File outDir = new File("target/test-outputs"); + outDir.mkdirs(); + ImageIO.write(img, "png", new File(outDir, name)); + } + + @Test + void rendersCmykFillAsApproximatedRgb() throws Exception { + // 100% cyan in CMYK ~= (0, 1, 1) in RGB. Draw a big rectangle and check + // that the rendered image contains pixels that look cyan-ish. 
+ byte[] pdf = buildPdf(cb -> { + cb.setCMYKColorFillF(1f, 0f, 0f, 0f); + cb.rectangle(20, 20, 200, 200); + cb.fill(); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "cmyk-fill.png"); + int cyanish = countPixelsMatching(img, (red, green, blue) -> + red < 80 && green > 150 && blue > 150); + assertThat(cyanish) + .as("CMYK (1,0,0,0) fill must produce cyan-ish pixels") + .isGreaterThan(100); + } + } + + @Test + void rendersDashedStrokeWithCustomLineCap() throws Exception { + // Red dashed line with round caps. Verify the renderer at least produces + // some red pixels (the stroke was drawn). + byte[] pdf = buildPdf(cb -> { + cb.setRGBColorStrokeF(1f, 0f, 0f); + cb.setLineWidth(4f); + cb.setLineCap(PdfContentByte.LINE_CAP_ROUND); + cb.setLineDash(new float[]{8f, 6f}, 0f); + cb.moveTo(20, 100); + cb.lineTo(200, 100); + cb.stroke(); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "dashed-stroke.png"); + int redish = countPixelsMatching(img, (red, green, blue) -> + red > 200 && green < 80 && blue < 80); + assertThat(redish) + .as("dashed red stroke must produce red pixels") + .isGreaterThan(20); + } + } + + @Test + void rendersClippedFillWithoutSpillingOutside() throws Exception { + // Clip to a small rectangle, then try to fill a much larger green rect. + // Pixels outside the clip must remain the opaque-white background. 
+ byte[] pdf = buildPdf(cb -> { + cb.rectangle(50, 50, 60, 60); + cb.clip(); + cb.newPath(); + cb.setRGBColorFillF(0f, 1f, 0f); + cb.rectangle(0, 0, 400, 400); + cb.fill(); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "clipped-fill.png"); + + int greenish = countPixelsMatching(img, (red, green, blue) -> + green > 200 && red < 80 && blue < 80); + assertThat(greenish) + .as("clip must allow the green fill inside the clip rect") + .isGreaterThan(50); + + // Sample a corner well outside the clip rectangle; it must still be the + // opaque-white background. + int corner = img.getRGB(img.getWidth() - 4, img.getHeight() - 4); + int alpha = (corner >>> 24) & 0xFF; + int rch = (corner >> 16) & 0xFF; + int gch = (corner >> 8) & 0xFF; + int bch = corner & 0xFF; + assertThat(alpha).isEqualTo(0xFF); + assertThat(rch).isEqualTo(0xFF); + assertThat(gch).isEqualTo(0xFF); + assertThat(bch).isEqualTo(0xFF); + } + } + + @Test + void rendersMarkedContentWithoutFailing() throws Exception { + // Marked content blocks (BMC / EMC) must be parsed as no-ops; any text + // inside them must still be drawn. + byte[] pdf = buildPdf(cb -> { + cb.beginMarkedContentSequence(new org.openpdf.text.pdf.PdfName("Span")); + cb.setColorFill(Color.BLUE); + cb.rectangle(40, 40, 100, 50); + cb.fill(); + cb.endMarkedContentSequence(); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + List ops = r.getContentOperators(1); + // Sanity: openpdf-core must have emitted BMC / EMC for us. 
+ assertThat(ops).contains("BMC", "EMC"); + + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "marked-content.png"); + + int bluish = countPixelsMatching(img, (red, green, blue) -> + blue > 200 && red < 80 && green < 80); + assertThat(bluish) + .as("content inside a marked-content sequence must still render") + .isGreaterThan(50); + } + } + + @Test + void rendersTextRiseAsVerticalOffset() throws Exception { + // Two glyphs at the same Td, one with Ts=10 (raised). They must render + // at different rows. We don't need a font setup: PdfContentByte with a + // built-in font is fine, since the renderer falls back to a Java2D font + // when it can't load the PDF font outline. + byte[] pdf = buildPdf(cb -> { + org.openpdf.text.pdf.BaseFont bf = org.openpdf.text.pdf.BaseFont + .createFont(org.openpdf.text.pdf.BaseFont.HELVETICA, + org.openpdf.text.pdf.BaseFont.WINANSI, + org.openpdf.text.pdf.BaseFont.NOT_EMBEDDED); + cb.beginText(); + cb.setFontAndSize(bf, 24f); + cb.setTextMatrix(40f, 200f); + cb.showText("A"); + cb.setTextRise(20f); + cb.showText("B"); + cb.endText(); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + List ops = r.getContentOperators(1); + assertThat(ops).contains("Ts", "Tj"); + + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "text-rise.png"); + + // At least *some* dark pixels (glyphs) must have been drawn. 
+ int darkPixels = countPixelsMatching(img, (red, green, blue) -> + red < 80 && green < 80 && blue < 80); + assertThat(darkPixels).isGreaterThan(20); + } + } +} diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCoreRendererTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCoreRendererTest.java index fe5290673..0cc0db476 100644 --- a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCoreRendererTest.java +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCoreRendererTest.java @@ -3,6 +3,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.awt.Graphics2D; import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; @@ -204,5 +205,74 @@ void exposesUnderlyingPdfReader() throws IOException { assertThat(r.getReader().getNumberOfPages()).isEqualTo(r.getNumPages()); } } + + @Test + void renderAllPagesReturnsOneImagePerPage() throws IOException { + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdfBytes)) { + List images = r.renderAllPages(72f); + assertThat(images).hasSize(r.getNumPages()); + assertThat(images).allSatisfy(img -> { + assertThat(img.getWidth()).isPositive(); + assertThat(img.getHeight()).isPositive(); + }); + } + } + + @Test + void renderPageOntoGraphics2DDrawsContentAndRestoresState() throws IOException { + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdfBytes)) { + int w = 200; + int h = 260; + BufferedImage target = new BufferedImage(w, h, BufferedImage.TYPE_INT_ARGB); + Graphics2D g2 = target.createGraphics(); + try { + java.awt.geom.AffineTransform before = g2.getTransform(); + java.awt.Shape beforeClip = g2.getClip(); + r.renderPage(1, g2, w, h); + // Caller-supplied Graphics2D state must be unchanged after rendering. 
+ assertThat(g2.getTransform()).isEqualTo(before); + assertThat(g2.getClip()).isEqualTo(beforeClip); + } finally { + g2.dispose(); + } + // Something must have been drawn on the target surface. + int nonBackground = 0; + for (int y = 0; y < h; y += 4) { + for (int x = 0; x < w; x += 4) { + int argb = target.getRGB(x, y); + int a = (argb >>> 24) & 0xFF; + int rch = (argb >> 16) & 0xFF; + int gch = (argb >> 8) & 0xFF; + int bch = argb & 0xFF; + if (!(a == 0xFF && rch == 0xFF && gch == 0xFF && bch == 0xFF)) { + nonBackground++; + } + } + } + assertThat(nonBackground).isPositive(); + } + } + + @Test + void renderPageOntoGraphics2DRejectsBadArguments() throws IOException { + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdfBytes)) { + BufferedImage target = new BufferedImage(10, 10, BufferedImage.TYPE_INT_ARGB); + Graphics2D g2 = target.createGraphics(); + try { + assertThatThrownBy(() -> r.renderPage(1, null, 10, 10)) + .isInstanceOf(NullPointerException.class); + assertThatThrownBy(() -> r.renderPage(1, g2, 0, 10)) + .isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> r.renderPage(1, g2, 10, -1)) + .isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> r.renderPage(0, g2, 10, 10)) + .isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> r.renderPage(r.getNumPages() + 1, g2, 10, 10)) + .isInstanceOf(IllegalArgumentException.class); + } finally { + g2.dispose(); + } + } + } } From 5c7f0af1f09f48070237fd36565f8137de326682 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 09:31:23 +0000 Subject: [PATCH 02/11] Add XObject support (forms + images) and inline-image safety Completes the openpdf-core-driven Java2D renderer by handling the operators most commonly missing on real-world PDFs: - Do: Form XObjects render recursively, applying the form's own /Matrix and /BBox under the current CTM with state save/restore. 
Image XObjects decode via: * ImageIO for DCTDecode (JPEG) and JPXDecode (JPEG 2000, when supported by the runtime), * a manual raster builder for uncompressed / Flate-decoded 8-bit DeviceGray, DeviceRGB and DeviceCMYK streams (CMYK is approximated to sRGB on the fly, since Java2D can't natively draw a CMYK raster). Image XObjects honor the current fill alpha (ca from ExtGState) and the CTM, drawing into the standard (0,0)-(1,1) unit square. - Inline images (BI/ID/EI) are now pre-stripped from the content stream before PdfContentParser sees them; the parser had no inline-image handling and the raw image bytes after ID would otherwise derail tokenization for the rest of the page. - The content-stream parse loop now treats parser-level failures (malformed dictionaries, unterminated arrays) as "stop early" rather than aborting the whole renderer, matching how operator- level errors were already handled. Tests added to OpenPdfCorePageRendererOperatorsTest: - rendersJpegImageXObject builds a red JPEG, embeds it via PdfContentByte.addImage, and checks the page contains red pixels. - rendersFormXObjectViaNestedContentStream stamps a PdfTemplate with an orange fill and checks the form's content reaches the rasterizer. - inlineImagesDoNotBreakPageRendering writes a hand-rolled stream with a BI/ID/EI block followed by a red rectangle and checks the trailing rectangle still renders. README updated; module test suite: 84 tests, 0 failures. 
--- openpdf-renderer/README.md | 14 +- .../core/OpenPdfCorePageRenderer.java | 402 +++++++++++++++++- .../OpenPdfCorePageRendererOperatorsTest.java | 91 ++++ 3 files changed, 491 insertions(+), 16 deletions(-) diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index 94acee4b3..1fe19c632 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -108,15 +108,17 @@ PDF content-stream operators — sufficient for typical text + vector PDFs: | Colors (DeviceGray / DeviceRGB / DeviceCMYK) | `g`, `G`, `rg`, `RG`, `k`, `K`, `cs`, `CS`, `sc`, `SC`, `scn`, `SCN` | | Text state | `BT`, `ET`, `Tf`, `Tc`, `Tw`, `TL`, `Tz`, `Td`, `TD`, `Tm`, `T*`, `Ts` | | Text showing | `Tj`, `TJ`, `'`, `"` | +| XObjects | `Do` (Form XObjects recursively; Image XObjects: JPEG/`DCTDecode`, JPEG2000/`JPXDecode` where `ImageIO` supports it, and uncompressed / Flate-decoded 8-bit DeviceGray / DeviceRGB / DeviceCMYK) | | Marked content / compatibility (no-op) | `BMC`, `BDC`, `EMC`, `MP`, `DP`, `BX`, `EX` | -Operators outside this subset (XObject `Do` for forms and images, inline -images `BI`/`ID`/`EI`, shading `sh`, pattern / shading colors, type 3 font -glyph operators) are parsed but currently ignored — pages that rely -heavily on them may render with missing content. Adding more operators is a -localized change in `OpenPdfCorePageRenderer`. +Inline images (`BI`/`ID`/`EI`) are stripped from the content stream before +parsing — they aren't rendered, but they don't derail the rest of the +page either. Shading (`sh`), pattern / shading colors and type 3 font glyph +operators are silently ignored. Pages that rely heavily on those features +may render with missing content. Adding more operators is a localized change +in `OpenPdfCorePageRenderer`. 
-For pages that exercise features outside the supported subset and need +For pages that need features outside this supported subset and you want pixel-perfect output today, the deprecated `PDFFile` / `PDFPage.getImage(...)` API still works. diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index 9a243d8ee..e3301f38d 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -11,12 +11,17 @@ import java.awt.AlphaComposite; import java.awt.BasicStroke; import java.awt.Color; +import java.awt.Composite; import java.awt.Font; import java.awt.Graphics2D; import java.awt.RenderingHints; import java.awt.Shape; import java.awt.geom.AffineTransform; import java.awt.geom.Path2D; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferByte; +import java.awt.image.WritableRaster; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.ArrayDeque; import java.util.ArrayList; @@ -28,8 +33,11 @@ import java.util.logging.Level; import java.util.logging.Logger; +import javax.imageio.ImageIO; + import org.openpdf.text.pdf.CMapAwareDocumentFont; import org.openpdf.text.pdf.PRIndirectReference; +import org.openpdf.text.pdf.PRStream; import org.openpdf.text.pdf.PRTokeniser; import org.openpdf.text.pdf.PdfArray; import org.openpdf.text.pdf.PdfContentParser; @@ -69,14 +77,18 @@ *
  • Text showing: {@code Tj}, {@code TJ}, {@code '}, {@code "}
  • *
  • Marked content / compatibility (no-op): {@code BMC}, {@code BDC}, * {@code EMC}, {@code MP}, {@code DP}, {@code BX}, {@code EX}
  • + *
  • XObjects ({@code Do}): Form XObjects (recursive content streams with + * their own {@code BBox} / {@code Matrix}) and Image XObjects + * (JPEG via {@code DCTDecode}, JPEG2000 via {@code JPXDecode} when + * supported by {@code ImageIO}, and uncompressed / Flate-decoded + * 8-bit DeviceGray / DeviceRGB / DeviceCMYK images).
  • * * - *

    Operators outside this subset (XObject {@code Do} for forms and images, - * inline images {@code BI}/{@code ID}/{@code EI}, shading {@code sh}, - * pattern / shading colors, type 3 font glyph operators) are silently ignored. - * Pages that rely heavily on those features may render with missing content; - * this renderer is intentionally a focused subset that handles typical text + - * simple-vector PDFs correctly.

    + *

    Inline images ({@code BI}/{@code ID}/{@code EI}) are stripped from the + * content stream before parsing — they're not rendered, but they don't + * derail the rest of the page either. Shading {@code sh}, pattern / shading + * colors and type 3 font glyph operators are silently ignored. Pages that + * rely heavily on those features may render with missing content.

    * *

    Coordinates

    *

    The PDF user space has its origin at the bottom-left and Y growing up; @@ -202,12 +214,25 @@ static void render(PdfReader reader, int pageNumber, Graphics2D g2, } private void processContent(byte[] contentBytes) throws IOException { - PdfContentParser parser = new PdfContentParser(new PRTokeniser(contentBytes)); + byte[] sanitized = stripInlineImages(contentBytes); + PdfContentParser parser = new PdfContentParser(new PRTokeniser(sanitized)); List operands = new ArrayList<>(); - while (!parser.parse(operands).isEmpty()) { - PdfLiteral op = (PdfLiteral) operands.get(operands.size() - 1); + while (true) { + List parsed; + try { + parsed = parser.parse(operands); + } catch (IOException | RuntimeException e) { + // Malformed token sequence (e.g. unbalanced dict, unknown construct). + // Stop processing rather than abort the whole page rendering. + LOG.log(Level.FINE, "Aborting content stream early due to: {0}", e); + return; + } + if (parsed.isEmpty()) { + return; + } + PdfLiteral op = (PdfLiteral) parsed.get(parsed.size() - 1); try { - dispatch(op.toString(), operands); + dispatch(op.toString(), parsed); } catch (RuntimeException e) { // A malformed operator must not abort the whole page; log for diagnostics. LOG.log(Level.FINE, "Skipping operator ''{0}'' due to: {1}", @@ -216,6 +241,72 @@ private void processContent(byte[] contentBytes) throws IOException { } } + /** + * Returns a copy of {@code content} with every inline-image block + * ({@code BI ... ID ... EI}) removed. {@code PdfContentParser} has no + * special handling for inline images, so the raw image bytes between + * {@code ID} and {@code EI} would derail tokenization. Removing the + * block keeps the rest of the page parseable; the inline image itself + * isn't rendered. 
+ */ + private static byte[] stripInlineImages(byte[] content) { + if (content == null || content.length == 0) { + return content; + } + java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream(content.length); + int i = 0; + while (i < content.length) { + int biStart = findToken(content, i, 'B', 'I'); + if (biStart < 0) { + out.write(content, i, content.length - i); + break; + } + out.write(content, i, biStart - i); + // Find the matching EI (preceded by whitespace, followed by whitespace or EOF). + int eiEnd = findEndInlineImage(content, biStart + 2); + if (eiEnd < 0) { + // No EI found: bail out, drop the rest of the stream. + break; + } + i = eiEnd; + } + return out.toByteArray(); + } + + /** Finds the offset of a two-byte token (e.g. {@code "BI"}) bounded by whitespace. */ + private static int findToken(byte[] buf, int from, char c1, char c2) { + for (int i = from; i < buf.length - 1; i++) { + if (buf[i] != c1 || buf[i + 1] != c2) { + continue; + } + boolean leftOk = i == 0 || isPdfWhitespace(buf[i - 1]); + boolean rightOk = i + 2 >= buf.length || isPdfWhitespace(buf[i + 2]); + if (leftOk && rightOk) { + return i; + } + } + return -1; + } + + /** Returns the index just past the closing {@code EI} of an inline image starting after {@code BI}. */ + private static int findEndInlineImage(byte[] buf, int from) { + for (int i = from; i < buf.length - 1; i++) { + if (buf[i] != 'E' || buf[i + 1] != 'I') { + continue; + } + boolean leftOk = i > 0 && isPdfWhitespace(buf[i - 1]); + boolean rightOk = i + 2 >= buf.length || isPdfWhitespace(buf[i + 2]); + if (leftOk && rightOk) { + return i + 2; + } + } + return -1; + } + + private static boolean isPdfWhitespace(byte b) { + return b == ' ' || b == '\t' || b == '\n' || b == '\r' || b == '\f' || b == 0; + } + /** Dispatches one operator. Operands include the trailing operator literal at index size-1. 
*/ private void dispatch(String op, List operands) throws IOException { switch (op) { @@ -496,6 +587,13 @@ private void dispatch(String op, List operands) throws IOException { showText(decodeString((PdfString) operands.get(2))); break; + // --- XObject invocation --- + case "Do": + if (operands.get(0) instanceof PdfName xobjName) { + doXObject(xobjName.toString().substring(1)); + } + break; + // --- Marked content / compatibility (parsed, no rendering effect) --- case "BMC": case "BDC": @@ -623,6 +721,290 @@ private void concatCtm(float a, float b, float c, float d, float e, float f) { g2.transform(m); } + // ---------- XObject helpers ---------- + + /** + * Resolves an XObject by its resource name and dispatches to the right + * sub-renderer (Form or Image). Unknown subtypes are silently ignored. + */ + private void doXObject(String name) { + if (resources == null) { + return; + } + PdfDictionary xobjects = resources.getAsDict(PdfName.XOBJECT); + if (xobjects == null) { + return; + } + PdfObject ref = xobjects.get(new PdfName(name)); + PdfObject resolved = ref instanceof PRIndirectReference ind + ? PdfReader.getPdfObject(ind) : ref; + if (!(resolved instanceof PRStream stream)) { + return; + } + PdfName subtype = stream.getAsName(PdfName.SUBTYPE); + if (PdfName.FORM.equals(subtype)) { + renderForm(stream); + } else if (PdfName.IMAGE.equals(subtype)) { + renderImage(stream); + } + } + + /** + * Renders a Form XObject by parsing its content stream recursively, with + * the form's own resources (falling back to the parent page's) and any + * {@code /Matrix} entry applied on top of the current CTM. The current + * graphics state and CTM are saved and restored around the call so the + * form's content can't leak out. 
+ */ + private void renderForm(PRStream form) { + AffineTransform savedTx = g2.getTransform(); + Shape savedClip = g2.getClip(); + GState savedState = state; + try { + state = new GState(state); + PdfArray matrix = form.getAsArray(PdfName.MATRIX); + if (matrix != null && matrix.size() >= 6) { + AffineTransform m = new AffineTransform( + floatAt(matrix, 0), floatAt(matrix, 1), + floatAt(matrix, 2), floatAt(matrix, 3), + floatAt(matrix, 4), floatAt(matrix, 5)); + g2.transform(m); + } + PdfArray bbox = form.getAsArray(PdfName.BBOX); + if (bbox != null && bbox.size() >= 4) { + float x = floatAt(bbox, 0); + float y = floatAt(bbox, 1); + float w = floatAt(bbox, 2) - x; + float h = floatAt(bbox, 3) - y; + if (w > 0 && h > 0) { + g2.clip(new java.awt.geom.Rectangle2D.Float(x, y, w, h)); + } + } + PdfDictionary formResources = form.getAsDict(PdfName.RESOURCES); + PdfDictionary effective = formResources != null ? formResources : resources; + OpenPdfCorePageRenderer nested = new OpenPdfCorePageRenderer(g2, effective); + nested.state = state; + byte[] body = PdfReader.getStreamBytes(form); + nested.processContent(body); + } catch (IOException | RuntimeException e) { + LOG.log(Level.FINE, "Skipping Form XObject due to: {0}", e); + } finally { + state = savedState; + g2.setTransform(savedTx); + g2.setClip(savedClip); + } + } + + /** + * Renders an Image XObject under the current CTM. The image occupies the + * unit square (0,0)-(1,1) in user space, per the PDF spec, with the CTM + * supplying the actual placement/size. + */ + private void renderImage(PRStream image) { + BufferedImage img = decodeImage(image); + if (img == null) { + return; + } + AffineTransform saved = g2.getTransform(); + try { + // PDF images map (0,0)-(1,1) in user space to the full image, with Y running up. + // Java2D draws top-to-bottom, so we translate up by 1 and flip Y back. 
+ g2.translate(0, 1); + g2.scale(1.0 / img.getWidth(), -1.0 / img.getHeight()); + Composite saveComposite = null; + if (state.fillAlpha < 1f) { + saveComposite = g2.getComposite(); + g2.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_OVER, state.fillAlpha)); + } + try { + g2.drawImage(img, 0, 0, null); + } finally { + if (saveComposite != null) { + g2.setComposite(saveComposite); + } + } + } finally { + g2.setTransform(saved); + } + } + + private BufferedImage decodeImage(PRStream stream) { + PdfNumber widthN = stream.getAsNumber(PdfName.WIDTH); + PdfNumber heightN = stream.getAsNumber(PdfName.HEIGHT); + if (widthN == null || heightN == null) { + return null; + } + int width = widthN.intValue(); + int height = heightN.intValue(); + if (width <= 0 || height <= 0) { + return null; + } + PdfObject filterObj = stream.get(PdfName.FILTER); + if (hasFilter(filterObj, PdfName.DCTDECODE) || hasFilter(filterObj, PdfName.JPXDECODE)) { + return decodeViaImageIO(stream); + } + return decodeRawRaster(stream, width, height); + } + + private static boolean hasFilter(PdfObject filterObj, PdfName name) { + if (filterObj == null) { + return false; + } + if (filterObj instanceof PdfName n) { + return n.equals(name); + } + if (filterObj instanceof PdfArray arr) { + for (PdfObject e : arr.getElements()) { + PdfObject direct = e instanceof PRIndirectReference ref + ? PdfReader.getPdfObject(ref) : e; + if (name.equals(direct)) { + return true; + } + } + } + return false; + } + + /** Decodes DCT/JPX-encoded image streams via the JRE's {@link ImageIO}. */ + private BufferedImage decodeViaImageIO(PRStream stream) { + try { + byte[] raw = PdfReader.getStreamBytesRaw(stream); + BufferedImage img = ImageIO.read(new ByteArrayInputStream(raw)); + if (img != null) { + return img; + } + // Some PDFs apply additional filters before DCTDecode; fall back to fully decoded bytes. 
+ byte[] decoded = PdfReader.getStreamBytes(stream); + return ImageIO.read(new ByteArrayInputStream(decoded)); + } catch (IOException | RuntimeException e) { + LOG.log(Level.FINE, "Skipping JPEG/JPX image XObject due to: {0}", e); + return null; + } + } + + /** + * Decodes an uncompressed / Flate-decoded image XObject into a {@link BufferedImage}. + * Supports 8-bit DeviceGray, DeviceRGB and DeviceCMYK; bit depths and color spaces + * outside that set yield {@code null}. + */ + private BufferedImage decodeRawRaster(PRStream stream, int width, int height) { + try { + PdfNumber bpcN = stream.getAsNumber(PdfName.BITSPERCOMPONENT); + int bpc = bpcN == null ? 8 : bpcN.intValue(); + if (bpc != 8) { + return null; + } + int components = imageComponents(stream.get(PdfName.COLORSPACE)); + if (components <= 0) { + return null; + } + byte[] decoded = PdfReader.getStreamBytes(stream); + int rowBytes = width * components; + int expected = rowBytes * height; + if (decoded.length < expected) { + return null; + } + switch (components) { + case 1: + return buildGrayImage(decoded, width, height); + case 3: + return buildRgbImage(decoded, width, height); + case 4: + return buildCmykImage(decoded, width, height); + default: + return null; + } + } catch (IOException | RuntimeException e) { + LOG.log(Level.FINE, "Skipping raw image XObject due to: {0}", e); + return null; + } + } + + /** Returns the number of color components for a {@code /ColorSpace} entry, or 0 if unsupported. 
*/ + private static int imageComponents(PdfObject csObj) { + if (csObj instanceof PRIndirectReference ind) { + csObj = PdfReader.getPdfObject(ind); + } + if (csObj instanceof PdfName n) { + if (PdfName.DEVICEGRAY.equals(n) || new PdfName("G").equals(n) || new PdfName("CalGray").equals(n)) { + return 1; + } + if (PdfName.DEVICERGB.equals(n) || new PdfName("RGB").equals(n) || new PdfName("CalRGB").equals(n)) { + return 3; + } + if (PdfName.DEVICECMYK.equals(n) || new PdfName("CMYK").equals(n)) { + return 4; + } + } + if (csObj instanceof PdfArray arr && arr.size() >= 1) { + PdfObject head = arr.getDirectObject(0); + // ICCBased: [/ICCBased <<.../N n>>] + if (new PdfName("ICCBased").equals(head) && arr.size() >= 2) { + PdfObject paramsObj = arr.getDirectObject(1); + if (paramsObj instanceof PdfDictionary params) { + PdfNumber n = params.getAsNumber(new PdfName("N")); + if (n != null) { + return n.intValue(); + } + } + } + // CalGray / CalRGB / Lab + if (new PdfName("CalGray").equals(head)) { + return 1; + } + if (new PdfName("CalRGB").equals(head) || new PdfName("Lab").equals(head)) { + return 3; + } + } + return 0; + } + + private static BufferedImage buildGrayImage(byte[] data, int width, int height) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY); + byte[] target = ((DataBufferByte) img.getRaster().getDataBuffer()).getData(); + System.arraycopy(data, 0, target, 0, Math.min(target.length, data.length)); + return img; + } + + private static BufferedImage buildRgbImage(byte[] data, int width, int height) { + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_3BYTE_BGR); + // PDF DeviceRGB stores R,G,B; BufferedImage.TYPE_3BYTE_BGR stores B,G,R. + // Swap on the fly while copying so the image displays with correct colors. 
+ WritableRaster raster = img.getRaster(); + byte[] dst = ((DataBufferByte) raster.getDataBuffer()).getData(); + int pixels = width * height; + for (int p = 0, di = 0, si = 0; p < pixels; p++, di += 3, si += 3) { + dst[di] = data[si + 2]; // B + dst[di + 1] = data[si + 1]; // G + dst[di + 2] = data[si]; // R + } + return img; + } + + private static BufferedImage buildCmykImage(byte[] data, int width, int height) { + // Build an sRGB BufferedImage and approximate CMYK -> RGB per pixel, + // since Java2D can't natively draw a 4-component CMYK raster. + BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_3BYTE_BGR); + byte[] dst = ((DataBufferByte) img.getRaster().getDataBuffer()).getData(); + int pixels = width * height; + for (int p = 0, di = 0, si = 0; p < pixels; p++, di += 3, si += 4) { + float c = (data[si] & 0xFF) / 255f; + float m = (data[si + 1] & 0xFF) / 255f; + float y = (data[si + 2] & 0xFF) / 255f; + float k = (data[si + 3] & 0xFF) / 255f; + float oneMinusK = 1f - k; + dst[di] = (byte) Math.round((1f - y) * oneMinusK * 255f); // B + dst[di + 1] = (byte) Math.round((1f - m) * oneMinusK * 255f); // G + dst[di + 2] = (byte) Math.round((1f - c) * oneMinusK * 255f); // R + } + return img; + } + + private static float floatAt(PdfArray arr, int idx) { + PdfObject obj = arr.getPdfObject(idx); + return obj instanceof PdfNumber n ? 
n.floatValue() : 0f; + } + // ---------- Text helpers ---------- private void textMoveTo(float tx, float ty) { diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java index 49a503a1b..97b6600b9 100644 --- a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java @@ -194,6 +194,97 @@ void rendersMarkedContentWithoutFailing() throws Exception { } } + @Test + void rendersJpegImageXObject() throws Exception { + // Generate a 32x16 solid-red JPEG, embed it as an Image XObject in a PDF, + // then verify the rendered page contains red pixels (the image was drawn). + BufferedImage jpegSource = new BufferedImage(32, 16, BufferedImage.TYPE_INT_RGB); + java.awt.Graphics2D gs = jpegSource.createGraphics(); + try { + gs.setColor(Color.RED); + gs.fillRect(0, 0, 32, 16); + } finally { + gs.dispose(); + } + ByteArrayOutputStream jpegOut = new ByteArrayOutputStream(); + ImageIO.write(jpegSource, "jpg", jpegOut); + org.openpdf.text.Image pdfImage = org.openpdf.text.Image.getInstance(jpegOut.toByteArray()); + + byte[] pdf = buildPdf(cb -> { + // Scale the image up so it covers a recognizable area of the page. 
+ cb.addImage(pdfImage, 160f, 0f, 0f, 80f, 30f, 100f); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + List ops = r.getContentOperators(1); + assertThat(ops).contains("Do"); + + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "image-xobject.png"); + int redish = countPixelsMatching(img, (red, green, blue) -> + red > 180 && green < 100 && blue < 100); + assertThat(redish) + .as("JPEG Image XObject must produce red pixels on the page") + .isGreaterThan(200); + } + } + + @Test + void rendersFormXObjectViaNestedContentStream() throws Exception { + // Form XObjects embed their own content stream. Wrap a colored rectangle + // in a PdfTemplate and stamp it onto the page; the renderer must recurse + // into the form's content and draw the rectangle. + byte[] pdf = buildPdf(cb -> { + org.openpdf.text.pdf.PdfTemplate tpl = cb.createTemplate(100f, 60f); + tpl.setRGBColorFillF(1f, 0.5f, 0f); // orange + tpl.rectangle(0f, 0f, 100f, 60f); + tpl.fill(); + cb.addTemplate(tpl, 40f, 120f); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "form-xobject.png"); + int orangeish = countPixelsMatching(img, (red, green, blue) -> + red > 200 && green > 80 && green < 200 && blue < 80); + assertThat(orangeish) + .as("Form XObject content must be rendered onto the page") + .isGreaterThan(100); + } + } + + @Test + void inlineImagesDoNotBreakPageRendering() throws Exception { + // Build a content stream by hand that contains a tiny inline image + // followed by a normal vector draw. The inline image isn't expected + // to render, but the rest of the page must still parse and draw. + byte[] pdf = buildPdf(cb -> { + // BI/ID/EI inline image: 2x2, 8bpc, DeviceGray, 4 raw bytes. 
+ String inline = "q\n" + + "BI /W 2 /H 2 /CS /G /BPC 8 ID\n" + + new String(new byte[]{(byte) 0xFF, 0x00, 0x00, (byte) 0xFF}, + java.nio.charset.StandardCharsets.ISO_8859_1) + + "\nEI\n" + + "Q\n" + + "1 0 0 rg\n" + + "50 50 80 80 re f\n"; + cb.setLiteral(inline); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + // Even though BI/ID/EI is in the stream, getContentOperators sees the + // BI block as raw tokens — but renderer must not crash and the trailing + // red rectangle must still be drawn. + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "inline-image-stripped.png"); + int redish = countPixelsMatching(img, (red, green, blue) -> + red > 200 && green < 80 && blue < 80); + assertThat(redish) + .as("after stripping inline image, the trailing red rectangle must still render") + .isGreaterThan(50); + } + } + @Test void rendersTextRiseAsVerticalOffset() throws Exception { // Two glyphs at the same Td, one with Ts=10 (raised). They must render From 029c7da1feeaa244dfea3b38436777373aad489d Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 09:35:56 +0000 Subject: [PATCH 03/11] Expand single-line Javadocs to multi-line Addresses the checkstyle 'single-line Javadoc comment should be multi-line' rule on the new openpdf-core renderer code. Affects nine one-line Javadocs in OpenPdfCorePageRenderer and one in OpenPdfCoreRenderer (ten in total); behavior unchanged. 
--- .../core/OpenPdfCorePageRenderer.java | 36 ++++++++++++++----- .../renderer/core/OpenPdfCoreRenderer.java | 4 ++- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index e3301f38d..a1827c539 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -101,7 +101,9 @@ final class OpenPdfCorePageRenderer { private static final Logger LOG = Logger.getLogger(OpenPdfCorePageRenderer.class.getName()); - /** Default user-space resolution of a PDF, in DPI. */ + /** + * Default user-space resolution of a PDF, in DPI. + */ private static final float PDF_USER_SPACE_DPI = 72f; // ExtGState dictionary keys not pre-defined as PdfName constants in openpdf-core. @@ -273,7 +275,9 @@ private static byte[] stripInlineImages(byte[] content) { return out.toByteArray(); } - /** Finds the offset of a two-byte token (e.g. {@code "BI"}) bounded by whitespace. */ + /** + * Finds the offset of a two-byte token (e.g. {@code "BI"}) bounded by whitespace. + */ private static int findToken(byte[] buf, int from, char c1, char c2) { for (int i = from; i < buf.length - 1; i++) { if (buf[i] != c1 || buf[i + 1] != c2) { @@ -288,7 +292,9 @@ private static int findToken(byte[] buf, int from, char c1, char c2) { return -1; } - /** Returns the index just past the closing {@code EI} of an inline image starting after {@code BI}. */ + /** + * Returns the index just past the closing {@code EI} of an inline image starting after {@code BI}. 
+ */ private static int findEndInlineImage(byte[] buf, int from) { for (int i = from; i < buf.length - 1; i++) { if (buf[i] != 'E' || buf[i + 1] != 'I') { @@ -307,7 +313,9 @@ private static boolean isPdfWhitespace(byte b) { return b == ' ' || b == '\t' || b == '\n' || b == '\r' || b == '\f' || b == 0; } - /** Dispatches one operator. Operands include the trailing operator literal at index size-1. */ + /** + * Dispatches one operator. Operands include the trailing operator literal at index size-1. + */ private void dispatch(String op, List operands) throws IOException { switch (op) { // --- Graphics state --- @@ -865,7 +873,9 @@ private static boolean hasFilter(PdfObject filterObj, PdfName name) { return false; } - /** Decodes DCT/JPX-encoded image streams via the JRE's {@link ImageIO}. */ + /** + * Decodes DCT/JPX-encoded image streams via the JRE's {@link ImageIO}. + */ private BufferedImage decodeViaImageIO(PRStream stream) { try { byte[] raw = PdfReader.getStreamBytesRaw(stream); @@ -920,7 +930,9 @@ private BufferedImage decodeRawRaster(PRStream stream, int width, int height) { } } - /** Returns the number of color components for a {@code /ColorSpace} entry, or 0 if unsupported. */ + /** + * Returns the number of color components for a {@code /ColorSpace} entry, or 0 if unsupported. + */ private static int imageComponents(PdfObject csObj) { if (csObj instanceof PRIndirectReference ind) { csObj = PdfReader.getPdfObject(ind); @@ -1158,7 +1170,9 @@ private static Color rgb(float r, float g, float b) { return new Color(clamp01(r), clamp01(g), clamp01(b)); } - /** Naive CMYK to sRGB approximation: r = (1-c)(1-k), g = (1-m)(1-k), b = (1-y)(1-k). */ + /** + * Naive CMYK to sRGB approximation: r = (1-c)(1-k), g = (1-m)(1-k), b = (1-y)(1-k). + */ private static Color cmyk(float c, float m, float y, float k) { float cc = clamp01(c); float mm = clamp01(m); @@ -1204,7 +1218,9 @@ private static Color defaultColorFor(ColorSpaceKind kind) { return kind == ColorSpaceKind.CMYK ? 
cmyk(0, 0, 0, 1f) : Color.BLACK; } - /** Picks numeric operands matching the active color space; non-numeric operands (e.g. pattern names) yield default. */ + /** + * Picks numeric operands matching the active color space; non-numeric operands (e.g. pattern names) yield default. + */ private static Color colorFromOperands(ColorSpaceKind kind, List operands) { int numericCount = 0; for (int i = 0; i < operands.size() - 1; i++) { @@ -1256,7 +1272,9 @@ private static float clamp01(float v) { private enum ColorSpaceKind { GRAY, RGB, CMYK, UNKNOWN } - /** Mutable per-graphics-state snapshot. Not thread-safe. */ + /** + * Mutable per-graphics-state snapshot. Not thread-safe. + */ private static final class GState { Color fillColor = Color.BLACK; Color strokeColor = Color.BLACK; diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java index 3c8c702d1..87edaa05d 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCoreRenderer.java @@ -74,7 +74,9 @@ */ public class OpenPdfCoreRenderer implements Closeable { - /** Default user-space resolution of a PDF, in DPI. */ + /** + * Default user-space resolution of a PDF, in DPI. + */ private static final float PDF_USER_SPACE_DPI = 72f; private final PdfReader reader; From 840637133b55f1102c06913cd66533d504cc51de Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 09:43:44 +0000 Subject: [PATCH 04/11] Reduce NPath complexity and avoid parameter reassignment Splits the two over-branchy helpers Codacy flagged into smaller focused methods, and stops reassigning a method parameter: - applyExtGState(String) was a flat list of seven null checks driving an NPath of 2048. Split into resolveExtGStateDict, applyExtGStateAlpha and applyExtGStateLineStyle. 
- imageComponents(PdfObject) was a chain of PdfName.equals checks on freshly-allocated PdfNames (NPath 3136). Now uses static Set lookups (DEVICE_GRAY_NAMES / DEVICE_RGB_NAMES / DEVICE_CMYK_NAMES) with named PdfName constants, split across componentsForNamedColorSpace, componentsForArrayColorSpace and iccBasedComponents. - imageComponents no longer reassigns its csObj parameter; uses a local `direct` reference instead. Also wraps the long XObject row in openpdf-renderer/README.md that was exceeding the 120-column limit (was 207). No behavior change; module test suite still 84/84 green. --- openpdf-renderer/README.md | 10 +- .../core/OpenPdfCorePageRenderer.java | 123 +++++++++++------- 2 files changed, 87 insertions(+), 46 deletions(-) diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index 1fe19c632..08cea3cad 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -108,7 +108,15 @@ PDF content-stream operators — sufficient for typical text + vector PDFs: | Colors (DeviceGray / DeviceRGB / DeviceCMYK) | `g`, `G`, `rg`, `RG`, `k`, `K`, `cs`, `CS`, `sc`, `SC`, `scn`, `SCN` | | Text state | `BT`, `ET`, `Tf`, `Tc`, `Tw`, `TL`, `Tz`, `Td`, `TD`, `Tm`, `T*`, `Ts` | | Text showing | `Tj`, `TJ`, `'`, `"` | -| XObjects | `Do` (Form XObjects recursively; Image XObjects: JPEG/`DCTDecode`, JPEG2000/`JPXDecode` where `ImageIO` supports it, and uncompressed / Flate-decoded 8-bit DeviceGray / DeviceRGB / DeviceCMYK) | +| XObjects | `Do` (see below) | + +XObject coverage: +- Form XObjects render recursively, applying their own `/Matrix` and `/BBox` + under the current CTM with full state save/restore. +- Image XObjects decode via `ImageIO` for JPEG (`DCTDecode`) and JPEG 2000 + (`JPXDecode`, where the runtime supports it), and via a manual raster + builder for uncompressed / Flate-decoded 8-bit DeviceGray, DeviceRGB and + DeviceCMYK streams (CMYK approximated to sRGB on the fly). 
| Marked content / compatibility (no-op) | `BMC`, `BDC`, `EMC`, `MP`, `DP`, `BX`, `EX` | Inline images (`BI`/`ID`/`EI`) are stripped from the content stream before diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index a1827c539..d629aa263 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -112,6 +112,19 @@ final class OpenPdfCorePageRenderer { private static final PdfName EXTGS_LC = new PdfName("LC"); private static final PdfName EXTGS_LJ = new PdfName("LJ"); + // Color-space identifiers used by imageComponents(). + private static final PdfName CS_ICC_BASED = new PdfName("ICCBased"); + private static final PdfName CS_CAL_GRAY = new PdfName("CalGray"); + private static final PdfName CS_CAL_RGB = new PdfName("CalRGB"); + private static final PdfName CS_LAB = new PdfName("Lab"); + private static final PdfName CS_N = new PdfName("N"); + private static final java.util.Set DEVICE_GRAY_NAMES = java.util.Set.of( + PdfName.DEVICEGRAY, new PdfName("G"), CS_CAL_GRAY); + private static final java.util.Set DEVICE_RGB_NAMES = java.util.Set.of( + PdfName.DEVICERGB, new PdfName("RGB"), CS_CAL_RGB); + private static final java.util.Set DEVICE_CMYK_NAMES = java.util.Set.of( + PdfName.DEVICECMYK, new PdfName("CMYK")); + private final Graphics2D g2; private final PdfDictionary resources; private final Map fontCache = new HashMap<>(); @@ -676,26 +689,28 @@ private void applyDashPattern(PdfArray array, float phase) { } private void applyExtGState(String name) { - if (resources == null) { + PdfDictionary dict = resolveExtGStateDict(name); + if (dict == null) { return; } + applyExtGStateAlpha(dict); + applyExtGStateLineStyle(dict); + } + + private PdfDictionary resolveExtGStateDict(String name) { + if (resources 
== null) { + return null; + } PdfDictionary gsResources = resources.getAsDict(PdfName.EXTGSTATE); if (gsResources == null) { - return; + return null; } PdfObject obj = gsResources.get(new PdfName(name)); - PdfDictionary dict; - if (obj instanceof PdfDictionary d) { - dict = d; - } else if (obj instanceof PRIndirectReference ref) { - PdfObject resolved = PdfReader.getPdfObject(ref); - dict = resolved instanceof PdfDictionary ? (PdfDictionary) resolved : null; - } else { - dict = null; - } - if (dict == null) { - return; - } + PdfObject direct = obj instanceof PRIndirectReference ref ? PdfReader.getPdfObject(ref) : obj; + return direct instanceof PdfDictionary d ? d : null; + } + + private void applyExtGStateAlpha(PdfDictionary dict) { PdfNumber ca = dict.getAsNumber(PdfName.ca); if (ca != null) { state.fillAlpha = clamp01(ca.floatValue()); @@ -706,6 +721,9 @@ private void applyExtGState(String name) { state.strokeAlpha = clamp01(upperCA.floatValue()); state.strokeColor = applyAlpha(state.strokeColor, state.strokeAlpha); } + } + + private void applyExtGStateLineStyle(PdfDictionary dict) { PdfNumber lw = dict.getAsNumber(EXTGS_LW); if (lw != null) { state.lineWidth = lw.floatValue(); @@ -934,43 +952,58 @@ private BufferedImage decodeRawRaster(PRStream stream, int width, int height) { * Returns the number of color components for a {@code /ColorSpace} entry, or 0 if unsupported. */ private static int imageComponents(PdfObject csObj) { - if (csObj instanceof PRIndirectReference ind) { - csObj = PdfReader.getPdfObject(ind); + PdfObject direct = csObj instanceof PRIndirectReference ind ? 
PdfReader.getPdfObject(ind) : csObj; + if (direct instanceof PdfName n) { + return componentsForNamedColorSpace(n); } - if (csObj instanceof PdfName n) { - if (PdfName.DEVICEGRAY.equals(n) || new PdfName("G").equals(n) || new PdfName("CalGray").equals(n)) { - return 1; - } - if (PdfName.DEVICERGB.equals(n) || new PdfName("RGB").equals(n) || new PdfName("CalRGB").equals(n)) { - return 3; - } - if (PdfName.DEVICECMYK.equals(n) || new PdfName("CMYK").equals(n)) { - return 4; - } + if (direct instanceof PdfArray arr) { + return componentsForArrayColorSpace(arr); } - if (csObj instanceof PdfArray arr && arr.size() >= 1) { - PdfObject head = arr.getDirectObject(0); - // ICCBased: [/ICCBased <<.../N n>>] - if (new PdfName("ICCBased").equals(head) && arr.size() >= 2) { - PdfObject paramsObj = arr.getDirectObject(1); - if (paramsObj instanceof PdfDictionary params) { - PdfNumber n = params.getAsNumber(new PdfName("N")); - if (n != null) { - return n.intValue(); - } - } - } - // CalGray / CalRGB / Lab - if (new PdfName("CalGray").equals(head)) { - return 1; - } - if (new PdfName("CalRGB").equals(head) || new PdfName("Lab").equals(head)) { - return 3; - } + return 0; + } + + private static int componentsForNamedColorSpace(PdfName name) { + if (DEVICE_GRAY_NAMES.contains(name)) { + return 1; + } + if (DEVICE_RGB_NAMES.contains(name)) { + return 3; + } + if (DEVICE_CMYK_NAMES.contains(name)) { + return 4; } return 0; } + private static int componentsForArrayColorSpace(PdfArray arr) { + if (arr.size() < 1) { + return 0; + } + PdfObject head = arr.getDirectObject(0); + if (CS_ICC_BASED.equals(head)) { + return iccBasedComponents(arr); + } + if (CS_CAL_GRAY.equals(head)) { + return 1; + } + if (CS_CAL_RGB.equals(head) || CS_LAB.equals(head)) { + return 3; + } + return 0; + } + + private static int iccBasedComponents(PdfArray arr) { + if (arr.size() < 2) { + return 0; + } + PdfObject paramsObj = arr.getDirectObject(1); + if (!(paramsObj instanceof PdfDictionary params)) { + return 
0; + } + PdfNumber n = params.getAsNumber(CS_N); + return n == null ? 0 : n.intValue(); + } + private static BufferedImage buildGrayImage(byte[] data, int width, int height) { BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY); byte[] target = ((DataBufferByte) img.getRaster().getDataBuffer()).getData(); From b064aed7b89ba661909fd2e6708e5e91ea7f9d61 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 09:51:50 +0000 Subject: [PATCH 05/11] Render text with the PDF's embedded TrueType font program Biggest correctness gap on real-world PDFs has been text rendering: mapFont() picked a generic Java2D family (Serif/Sans/Mono) from the PostScript font name, so PDFs that embedded their own subsetted fonts drew with the wrong glyph shapes (and missed glyphs whenever the name heuristic chose a family that didn't cover the Unicode chars). This commit closes that gap for the dominant case (embedded TrueType / FontFile2): - mapFont() now first calls embeddedFontFor(...), which pulls the CMapAwareDocumentFont's FontDescriptor via openpdf-core, finds the embedded font program stream (FontFile2 / FontFile3 / FontFile in that preference order), and loads it with Font.createFont. The resulting AWT Font is cached by FontDescriptor identity so the same font program isn't re-parsed for every Tj/TJ call. - When no font program is embedded, or parsing fails, falls back to the previous name-heuristic path (now mapFontByName(...)). - Failures are cached (as a null Font) so we don't retry every glyph. Test: - rendersTextUsingEmbeddedTrueTypeFont embeds LiberationSans-Regular (shipped with openpdf-core for font-fallback) in a freshly built PDF, renders the page back and verifies dark pixels appear in the text region. The embedded program is required: no name-based AWT family would match "LiberationSans". README's "Status" section updated and a candid "Honest limitations & roadmap" subsection added. 
It calls out the remaining gaps in priority order (Type 1 / CFF fonts, Type 3 fonts, ICC color management, patterns and shadings, inline images, soft masks, indexed/Separation/DeviceN, encryption) so future contributors know which gap to grab next. Module test suite: 85 tests, 0 failures. --- openpdf-renderer/README.md | 44 +++++++++- .../core/OpenPdfCorePageRenderer.java | 88 ++++++++++++++++++- .../OpenPdfCorePageRendererOperatorsTest.java | 35 ++++++++ 3 files changed, 165 insertions(+), 2 deletions(-) diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index 08cea3cad..3fc2a1dd2 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -110,6 +110,8 @@ PDF content-stream operators — sufficient for typical text + vector PDFs: | Text showing | `Tj`, `TJ`, `'`, `"` | | XObjects | `Do` (see below) | +| Marked content / compatibility (no-op) | `BMC`, `BDC`, `EMC`, `MP`, `DP`, `BX`, `EX` | + XObject coverage: - Form XObjects render recursively, applying their own `/Matrix` and `/BBox` under the current CTM with full state save/restore. @@ -117,7 +119,15 @@ XObject coverage: (`JPXDecode`, where the runtime supports it), and via a manual raster builder for uncompressed / Flate-decoded 8-bit DeviceGray, DeviceRGB and DeviceCMYK streams (CMYK approximated to sRGB on the fly). -| Marked content / compatibility (no-op) | `BMC`, `BDC`, `EMC`, `MP`, `DP`, `BX`, `EX` | + +Text rendering: for each `Tf`-selected font, the renderer pulls the +embedded font program (`FontFile2`/`FontFile3`/`FontFile`) out of the +FontDescriptor and loads it via `java.awt.Font.createFont`. Embedded +TrueType fonts therefore render with their own glyph shapes. When a +font isn't embedded (or the embedded program can't be loaded), the +renderer falls back to a generic Java2D family picked by PostScript-name +heuristics — glyph widths from the PDF font are still respected, +but shapes are only approximate. 
Inline images (`BI`/`ID`/`EI`) are stripped from the content stream before parsing — they aren't rendered, but they don't derail the rest of the @@ -130,6 +140,38 @@ For pages that need features outside this supported subset and you want pixel-perfect output today, the deprecated `PDFFile` / `PDFPage.getImage(...)` API still works. +### Honest limitations & roadmap + +`OpenPdfCoreRenderer` is intentionally a focused, lightweight renderer. +The legacy in-tree parser still wins on real-world PDFs that exercise: + +- **Embedded Type 1 / CFF / OpenType-CFF fonts.** `Font.createFont` only + loads TrueType reliably; `FontFile3` (CFF/OpenType) is attempted but + often falls back to the name-heuristic path. Subsetted TrueType fonts + with non-Unicode CMaps draw `.notdef` for codes their `cmap` table + doesn't list. Real fix: drive glyph dispatch from the PDF's encoding / + CMap to glyph IDs and render via `Font#createGlyphVector(int[])`. +- **Type 3 fonts.** Glyph operators (`d0`, `d1` + nested content streams) + are ignored. +- **Color management.** CMYK uses the textbook `(1-c)(1-k)` approximation; + no ICC profile, no UCR/BG. Anything color-managed will look noticeably + wrong. Real fix: respect the ICCBased profile via `java.awt.color.ICC_Profile`. +- **Pattern and shading paint** (`pattern`, `sh`). Ignored. +- **Inline images.** Currently dropped; the parser-level strip keeps the + rest of the page rendering. Re-implementing them on the existing raster + helpers is straightforward. +- **Soft masks (`SMask`) and transparency groups.** Ignored; image alpha + honors `ca` only, not per-pixel masks. +- **Indexed / Separation / DeviceN color spaces** for images and paths. + Ignored; falls back to filling with the color-space default. +- **Encrypted PDFs.** Out of scope for this module (see "Encryption: removed" + below). + +These gaps are why the legacy `PDFFile` / `PDFPage` path remains the +production renderer for the time being. 
Each item above is a fairly +localized addition to `OpenPdfCorePageRenderer`; the order above is +roughly highest-impact first. + ## Quick Start ### Basic PDF to Image Conversion diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index d629aa263..e13101095 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -13,6 +13,7 @@ import java.awt.Color; import java.awt.Composite; import java.awt.Font; +import java.awt.FontFormatException; import java.awt.Graphics2D; import java.awt.RenderingHints; import java.awt.Shape; @@ -90,6 +91,17 @@ * colors and type 3 font glyph operators are silently ignored. Pages that * rely heavily on those features may render with missing content.

 * + * <h2>Text rendering</h2> + * <p>

    For each {@code Tf}-selected font, the renderer pulls the embedded font + * program ({@code FontFile2}, {@code FontFile3} or {@code FontFile} on the + * FontDescriptor) out via {@code openpdf-core}'s {@code PdfReader} and hands + * the bytes to {@link java.awt.Font#createFont}. The resulting AWT font is + * cached and used to draw glyphs, so subsetted / embedded TrueType fonts + * render with their own glyph shapes. When no font program is embedded (or + * loading it fails), the renderer falls back to a generic Java2D family + * picked by PostScript-name heuristics — correct shape only by + * accident, but the glyph widths from the PDF font are still respected.

+ * * <h2>Coordinates</h2> * <p>

    The PDF user space has its origin at the bottom-left and Y growing up; * the {@link Graphics2D} target has its origin at the top-left and Y growing @@ -112,6 +124,13 @@ final class OpenPdfCorePageRenderer { private static final PdfName EXTGS_LC = new PdfName("LC"); private static final PdfName EXTGS_LJ = new PdfName("LJ"); + // FontDescriptor entries that may hold an embedded font program, in preference order: + // FontFile2 (TrueType) is by far the most common in modern PDFs; FontFile3 holds CFF / + // OpenType subsets which TYPE-1-flagged AWT Font.createFont also accepts in many cases; + // FontFile is legacy Type 1 (rarely loadable as TRUETYPE_FONT but worth trying). + private static final java.util.List EMBEDDED_FONT_KEYS = java.util.List.of( + PdfName.FONTFILE2, PdfName.FONTFILE3, PdfName.FONTFILE); + // Color-space identifiers used by imageComponents(). private static final PdfName CS_ICC_BASED = new PdfName("ICCBased"); private static final PdfName CS_CAL_GRAY = new PdfName("CalGray"); @@ -128,6 +147,12 @@ final class OpenPdfCorePageRenderer { private final Graphics2D g2; private final PdfDictionary resources; private final Map fontCache = new HashMap<>(); + /** + * Embedded-font program cache keyed by the FontDescriptor's identity. Re-parsing a + * TrueType program for every {@code Tj} call would be wasteful; a single page can + * reference the same font hundreds of times. 
+ */ + private final Map awtFontCache = new HashMap<>(); private final Deque stateStack = new ArrayDeque<>(); private final Deque ctmStack = new ArrayDeque<>(); @@ -1140,6 +1165,67 @@ private float computeTextAdvance(String text) { } private Font mapFont(CMapAwareDocumentFont docFont, float size) { + float pointSize = Math.max(size, 0.1f); + Font embedded = embeddedFontFor(docFont); + if (embedded != null) { + return embedded.deriveFont(pointSize); + } + return mapFontByName(docFont).deriveFont(pointSize); + } + + /** + * Tries to load the font program embedded in the PDF and turn it into a Java AWT + * {@link Font}. Returns {@code null} for fonts that don't embed a TrueType / OpenType + * program, or for which the program fails to parse — callers should fall back + * to {@link #mapFontByName(CMapAwareDocumentFont)}. + */ + private Font embeddedFontFor(CMapAwareDocumentFont docFont) { + if (docFont == null) { + return null; + } + PdfDictionary descriptor = docFont.getFontDescriptor(); + if (descriptor == null) { + return null; + } + if (awtFontCache.containsKey(descriptor)) { + return awtFontCache.get(descriptor); // may be null = previously failed to load + } + // Try TrueType first (FontFile2), then OpenType / CFF (FontFile3), then Type1 (FontFile). 
+ PRStream program = fontProgramStream(descriptor); + if (program == null) { + awtFontCache.put(descriptor, null); + return null; + } + try { + byte[] bytes = PdfReader.getStreamBytes(program); + Font font = Font.createFont(Font.TRUETYPE_FONT, new ByteArrayInputStream(bytes)); + awtFontCache.put(descriptor, font); + return font; + } catch (FontFormatException | IOException | RuntimeException e) { + LOG.log(Level.FINE, "Falling back from embedded font program due to: {0}", e); + awtFontCache.put(descriptor, null); + return null; + } + } + + private static PRStream fontProgramStream(PdfDictionary descriptor) { + for (PdfName key : EMBEDDED_FONT_KEYS) { + PdfObject raw = descriptor.get(key); + PdfObject direct = raw instanceof PRIndirectReference ref + ? PdfReader.getPdfObject(ref) : raw; + if (direct instanceof PRStream stream) { + return stream; + } + } + return null; + } + + /** + * Maps a PDF font to a generic Java2D font family using the PostScript font name as + * a hint. This is the last-resort path used when the PDF doesn't embed a font program + * or the program can't be loaded. 
+ */ + private static Font mapFontByName(CMapAwareDocumentFont docFont) { String family = Font.SERIF; int style = Font.PLAIN; if (docFont != null) { @@ -1160,7 +1246,7 @@ private Font mapFont(CMapAwareDocumentFont docFont, float size) { } } } - return new Font(family, style, 1).deriveFont(Math.max(size, 0.1f)); + return new Font(family, style, 1); } private CMapAwareDocumentFont lookupFont(String name) { diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java index 97b6600b9..369861633 100644 --- a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java @@ -285,6 +285,41 @@ void inlineImagesDoNotBreakPageRendering() throws Exception { } } + @Test + void rendersTextUsingEmbeddedTrueTypeFont() throws Exception { + // Build a PDF that embeds a real TrueType font and writes a glyph that's + // not the .notdef of any built-in AWT fallback. The renderer must extract + // the FontFile2 byte stream and use Font.createFont under the hood; we can + // detect this by checking that the rendered text region contains dark pixels + // (i.e. the glyph was drawn) while the renderer was forced to load an + // embedded program (no name-based AWT family would match this font). 
+ java.net.URL ttf = OpenPdfCorePageRendererOperatorsTest.class.getClassLoader() + .getResource("font-fallback/LiberationSans-Regular.ttf"); + assertThat(ttf).as("LiberationSans-Regular.ttf must be on the classpath via openpdf-core").isNotNull(); + + byte[] pdf = buildPdf(cb -> { + org.openpdf.text.pdf.BaseFont bf = org.openpdf.text.pdf.BaseFont + .createFont(ttf.toString(), org.openpdf.text.pdf.BaseFont.WINANSI, + org.openpdf.text.pdf.BaseFont.EMBEDDED); + cb.beginText(); + cb.setFontAndSize(bf, 32f); + cb.setTextMatrix(40f, 200f); + cb.showText("Hello PDF"); + cb.endText(); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "embedded-truetype.png"); + + int darkPixels = countPixelsMatching(img, (red, green, blue) -> + red < 80 && green < 80 && blue < 80); + assertThat(darkPixels) + .as("text drawn with an embedded TrueType font must produce glyph pixels") + .isGreaterThan(200); + } + } + @Test void rendersTextRiseAsVerticalOffset() throws Exception { // Two glyphs at the same Td, one with Ts=10 (raised). They must render From 2db7c3fe9aabb3d67ff23e3487e8b9c9b789072b Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 09:52:29 +0000 Subject: [PATCH 06/11] Add blank line before XObject-coverage list in README Codacy markdownlint flagged the bullet list under "XObject coverage:" for missing a leading blank line (lists should be surrounded by blank lines). Single-line fix. 
--- openpdf-renderer/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index 3fc2a1dd2..f1cc9e2a5 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -113,6 +113,7 @@ PDF content-stream operators — sufficient for typical text + vector PDFs: | Marked content / compatibility (no-op) | `BMC`, `BDC`, `EMC`, `MP`, `DP`, `BX`, `EX` | XObject coverage: + - Form XObjects render recursively, applying their own `/Matrix` and `/BBox` under the current CTM with full state save/restore. - Image XObjects decode via `ImageIO` for JPEG (`DCTDecode`) and JPEG 2000 From 45c2bd0aa5dd0cb982c7d531db338bff922b0a48 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 10:10:52 +0000 Subject: [PATCH 07/11] Render inline images (BI/ID/EI) and clean up FQN code-style issues Implements the inline-image roadmap item: instead of pre-stripping inline image blocks, the preprocessor now promotes each one into a synthetic Image XObject and substitutes a `/__inline_image__N Do` invocation into the content stream. The rest of the renderer treats it exactly like a regular Image XObject and reuses the existing buildGrayImage / buildRgbImage / buildCmykImage / ImageIO decode paths. Two framing strategies are used so the parser doesn't get confused by binary data: - For DCT / DCTDecode / JPXDecode filters, find the JPEG end-of-image marker (FFD9) instead of scanning for "EI" bounded by whitespace, since JPEG payloads routinely contain byte sequences that look like EI by accident. - For other filters (including no filter and FlateDecode), keep the whitespace-bounded EI heuristic but stop trimming "trailing whitespace" greedily -- image bytes can legitimately be 0x00 or 0x0A and the spec guarantees exactly one whitespace byte before EI. Abbreviated dict keys (/W, /H, /BPC, /CS, /F) and full names (/Width, /Height, ...) 
are both accepted; abbreviated colorspace values (/G, /RGB, /CMYK) and full names map to component counts. Tests: - inlineImageRendersAtCtmLocation builds a 2x2 DeviceGray inline image with a [black, white; white, black] checker, scales it 120x via a cm, and asserts the rendered page contains dark pixels in the right region. - jpegInlineImageDecodes uses PdfContentByte.addImage(image, ..., true) to embed a green JPEG as an inline image, then asserts the rendered page contains green pixels. README's status section now says inline images render, and the limitations list no longer mentions them. Also addresses Codacy's "unnecessary fully qualified name" warning on java.util.List / java.util.Set usage. The class now imports List, Set, Arrays, ByteArrayOutputStream, StandardCharsets and Rectangle2D directly instead of inlining the FQNs; 7 call sites simplified. Module test suite: 86 tests, 0 failures. --- openpdf-renderer/README.md | 18 +- .../core/OpenPdfCorePageRenderer.java | 354 ++++++++++++++++-- .../OpenPdfCorePageRendererOperatorsTest.java | 65 +++- 3 files changed, 392 insertions(+), 45 deletions(-) diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index f1cc9e2a5..38fa0f5f8 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -130,12 +130,15 @@ renderer falls back to a generic Java2D family picked by PostScript-name heuristics — glyph widths from the PDF font are still respected, but shapes are only approximate. -Inline images (`BI`/`ID`/`EI`) are stripped from the content stream before -parsing — they aren't rendered, but they don't derail the rest of the -page either. Shading (`sh`), pattern / shading colors and type 3 font glyph -operators are silently ignored. Pages that rely heavily on those features -may render with missing content. Adding more operators is a localized change -in `OpenPdfCorePageRenderer`. 
+Inline images (`BI`/`ID`/`EI`) are now rendered: a preprocess pass promotes +each inline image into a synthetic Image XObject (with JPEG framing detected +by the JPEG `FFD9` end-of-image marker when the filter is `DCTDecode` to +sidestep the ambiguous whitespace-bounded `EI` heuristic), then the rest of +the renderer treats it like any other XObject. Uncompressed, Flate-decoded +and JPEG inline images are supported. Shading (`sh`), pattern / shading +colors and type 3 font glyph operators are silently ignored. Pages that +rely heavily on those features may render with missing content. Adding more +operators is a localized change in `OpenPdfCorePageRenderer`. For pages that need features outside this supported subset and you want pixel-perfect output today, the deprecated `PDFFile` / `PDFPage.getImage(...)` @@ -158,9 +161,6 @@ The legacy in-tree parser still wins on real-world PDFs that exercise: no ICC profile, no UCR/BG. Anything color-managed will look noticeably wrong. Real fix: respect the ICCBased profile via `java.awt.color.ICC_Profile`. - **Pattern and shading paint** (`pattern`, `sh`). Ignored. -- **Inline images.** Currently dropped; the parser-level strip keeps the - rest of the page rendering. Re-implementing them on the existing raster - helpers is straightforward. - **Soft masks (`SMask`) and transparency groups.** Ignored; image alpha honors `ca` only, not per-pixel masks. - **Indexed / Separation / DeviceN color spaces** for images and paths. 
diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index e13101095..27564577f 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -19,18 +19,23 @@ import java.awt.Shape; import java.awt.geom.AffineTransform; import java.awt.geom.Path2D; +import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import java.awt.image.DataBufferByte; import java.awt.image.WritableRaster; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.ArrayDeque; import java.util.ArrayList; +import java.util.Arrays; import java.util.Deque; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; @@ -85,11 +90,13 @@ * 8-bit DeviceGray / DeviceRGB / DeviceCMYK images). * * - *

    Inline images ({@code BI}/{@code ID}/{@code EI}) are stripped from the - * content stream before parsing — they're not rendered, but they don't - * derail the rest of the page either. Shading {@code sh}, pattern / shading - * colors and type 3 font glyph operators are silently ignored. Pages that - * rely heavily on those features may render with missing content.

    + *

    Inline images ({@code BI}/{@code ID}/{@code EI}) are promoted out of the + * content stream into synthetic Image XObjects during a preprocess pass, then + * rendered via the same code path as regular Image XObjects. Uncompressed, + * Flate-decoded and JPEG inline images all work. Shading {@code sh}, + * pattern / shading colors and type 3 font glyph operators are silently + * ignored. Pages that rely heavily on those features may render with missing + * content.

 * * <h2>Text rendering</h2> * <p>

    For each {@code Tf}-selected font, the renderer pulls the embedded font @@ -128,7 +135,7 @@ final class OpenPdfCorePageRenderer { // FontFile2 (TrueType) is by far the most common in modern PDFs; FontFile3 holds CFF / // OpenType subsets which TYPE-1-flagged AWT Font.createFont also accepts in many cases; // FontFile is legacy Type 1 (rarely loadable as TRUETYPE_FONT but worth trying). - private static final java.util.List EMBEDDED_FONT_KEYS = java.util.List.of( + private static final List EMBEDDED_FONT_KEYS = List.of( PdfName.FONTFILE2, PdfName.FONTFILE3, PdfName.FONTFILE); // Color-space identifiers used by imageComponents(). @@ -137,13 +144,20 @@ final class OpenPdfCorePageRenderer { private static final PdfName CS_CAL_RGB = new PdfName("CalRGB"); private static final PdfName CS_LAB = new PdfName("Lab"); private static final PdfName CS_N = new PdfName("N"); - private static final java.util.Set DEVICE_GRAY_NAMES = java.util.Set.of( + private static final Set DEVICE_GRAY_NAMES = Set.of( PdfName.DEVICEGRAY, new PdfName("G"), CS_CAL_GRAY); - private static final java.util.Set DEVICE_RGB_NAMES = java.util.Set.of( + private static final Set DEVICE_RGB_NAMES = Set.of( PdfName.DEVICERGB, new PdfName("RGB"), CS_CAL_RGB); - private static final java.util.Set DEVICE_CMYK_NAMES = java.util.Set.of( + private static final Set DEVICE_CMYK_NAMES = Set.of( PdfName.DEVICECMYK, new PdfName("CMYK")); + /** + * Synthetic XObject-name prefix used for inline images that have been promoted out + * of the content stream into {@link #inlineImages}. Keeping a clearly distinctive + * prefix avoids collisions with real {@code /XObject} resource names. + */ + private static final String INLINE_IMAGE_PREFIX = "__inline_image__"; + private final Graphics2D g2; private final PdfDictionary resources; private final Map fontCache = new HashMap<>(); @@ -153,6 +167,11 @@ final class OpenPdfCorePageRenderer { * reference the same font hundreds of times. 
*/ private final Map awtFontCache = new HashMap<>(); + /** + * Decoded inline images, keyed by the synthetic XObject name we substitute into + * the content stream in place of the original {@code BI...EI} block. + */ + private final Map inlineImages = new HashMap<>(); private final Deque stateStack = new ArrayDeque<>(); private final Deque ctmStack = new ArrayDeque<>(); @@ -254,7 +273,7 @@ static void render(PdfReader reader, int pageNumber, Graphics2D g2, } private void processContent(byte[] contentBytes) throws IOException { - byte[] sanitized = stripInlineImages(contentBytes); + byte[] sanitized = preprocessInlineImages(contentBytes); PdfContentParser parser = new PdfContentParser(new PRTokeniser(sanitized)); List operands = new ArrayList<>(); while (true) { @@ -282,18 +301,21 @@ private void processContent(byte[] contentBytes) throws IOException { } /** - * Returns a copy of {@code content} with every inline-image block - * ({@code BI ... ID ... EI}) removed. {@code PdfContentParser} has no - * special handling for inline images, so the raw image bytes between - * {@code ID} and {@code EI} would derail tokenization. Removing the - * block keeps the rest of the page parseable; the inline image itself - * isn't rendered. + * Walks {@code content} looking for inline-image blocks ({@code BI ... ID ... EI}) + * and rewrites each one into a synthetic {@code /name Do} invocation, with the decoded + * pixel data stashed in {@link #inlineImages}. Blocks that can't be decoded are + * dropped from the stream so the rest of the page still parses. + * + *

    {@code PdfContentParser} has no native inline-image handling: the raw image + * bytes between {@code ID} and {@code EI} would derail tokenization. Promoting them + * to synthetic XObjects keeps the parser on a well-defined token grammar and lets + * the rest of the renderer treat them exactly like any other image XObject.

    */ - private static byte[] stripInlineImages(byte[] content) { + private byte[] preprocessInlineImages(byte[] content) { if (content == null || content.length == 0) { return content; } - java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream(content.length); + ByteArrayOutputStream out = new ByteArrayOutputStream(content.length); int i = 0; while (i < content.length) { int biStart = findToken(content, i, 'B', 'I'); @@ -302,17 +324,237 @@ private static byte[] stripInlineImages(byte[] content) { break; } out.write(content, i, biStart - i); - // Find the matching EI (preceded by whitespace, followed by whitespace or EOF). - int eiEnd = findEndInlineImage(content, biStart + 2); - if (eiEnd < 0) { - // No EI found: bail out, drop the rest of the stream. + int idStart = findToken(content, biStart + 2, 'I', 'D'); + if (idStart < 0) { break; } + // Dictionary tokens sit between "BI" (exclusive) and "ID" (exclusive). + byte[] header = Arrays.copyOfRange(content, biStart + 2, idStart); + int dataStart = idStart + 3; // skip "ID" + one whitespace byte + int dataEnd = locateInlineImageDataEnd(content, dataStart, header); + int eiEnd; + if (dataEnd < 0) { + // Couldn't locate a valid data end (e.g. malformed JPEG). Drop the + // rest of the stream rather than risk a runaway parser. + break; + } + // dataEnd points at the whitespace byte before "EI"; advance past "EI". + eiEnd = dataEnd + 3; + byte[] data = dataEnd > dataStart + ? Arrays.copyOfRange(content, dataStart, dataEnd) + : new byte[0]; + String synthName = registerInlineImage(header, data); + if (synthName != null) { + String invocation = " /" + synthName + " Do "; + byte[] subst = invocation.getBytes(StandardCharsets.ISO_8859_1); + out.write(subst, 0, subst.length); + } i = eiEnd; } return out.toByteArray(); } + /** + * Parses an inline-image header and decodes the image data, registering the result in + * {@link #inlineImages}. 
Returns the synthetic XObject name to substitute into the + * content stream, or {@code null} if the image couldn't be decoded. + */ + private String registerInlineImage(byte[] header, byte[] data) { + try { + InlineImageHeader hdr = InlineImageHeader.parse(header); + if (hdr.width <= 0 || hdr.height <= 0) { + return null; + } + // For non-JPEG paths we need a known component count to size the raster; + // JPEG decode goes through ImageIO which figures it out from the stream. + if (!hdr.isJpeg() && hdr.components <= 0) { + return null; + } + byte[] decoded = decodeInlineImageData(data, hdr); + if (decoded == null) { + return null; + } + BufferedImage img = buildImageForComponents(decoded, hdr.width, hdr.height, hdr.components); + if (img == null) { + return null; + } + String name = INLINE_IMAGE_PREFIX + inlineImages.size(); + inlineImages.put(name, img); + return name; + } catch (IOException | RuntimeException e) { + LOG.log(Level.FINE, "Skipping inline image: {0}", e); + return null; + } + } + + private static byte[] decodeInlineImageData(byte[] data, InlineImageHeader hdr) throws IOException { + if (hdr.isJpeg()) { + // ImageIO consumes the JPEG end-to-end; let the caller route via buildJpegInline. + return data; + } + if (hdr.isFlate()) { + return PdfReader.FlateDecode(data); + } + if (hdr.filter == null) { + return data; + } + return null; // Unsupported filter (CCITT, LZW, ...) -- skip. + } + + private BufferedImage buildImageForComponents(byte[] decoded, int width, int height, int components) + throws IOException { + // Synthetic JPEG path: hdr told us to defer to ImageIO. 
+ if (components == 0) { + return ImageIO.read(new ByteArrayInputStream(decoded)); + } + int rowBytes = width * components; + if (decoded.length < rowBytes * height) { + return null; + } + switch (components) { + case 1: + return buildGrayImage(decoded, width, height); + case 3: + return buildRgbImage(decoded, width, height); + case 4: + return buildCmykImage(decoded, width, height); + default: + return null; + } + } + + /** + * Parsed form of an inline-image header dictionary. Only the entries the renderer + * needs are extracted; everything else is ignored. + */ + private static final class InlineImageHeader { + int width; + int height; + int bitsPerComponent = 8; + int components; + String filter; // null = no filter + boolean jpeg; + + boolean isJpeg() { + return jpeg; + } + + boolean isFlate() { + return "FlateDecode".equals(filter) || "Fl".equals(filter); + } + + static InlineImageHeader parse(byte[] header) { + InlineImageHeader h = new InlineImageHeader(); + List tokens = tokenizeHeader(header); + for (int i = 0; i + 1 < tokens.size(); i += 2) { + String key = tokens.get(i); + String value = tokens.get(i + 1); + if (!key.startsWith("/")) { + return h; + } + applyHeaderEntry(h, key.substring(1), value); + } + return h; + } + + private static void applyHeaderEntry(InlineImageHeader h, String key, String value) { + switch (key) { + case "W": + case "Width": + h.width = parseIntSafe(value); + break; + case "H": + case "Height": + h.height = parseIntSafe(value); + break; + case "BPC": + case "BitsPerComponent": + h.bitsPerComponent = parseIntSafe(value); + break; + case "CS": + case "ColorSpace": + h.components = componentsForAbbreviatedColorSpace(value); + break; + case "F": + case "Filter": + h.filter = stripLeadingSlash(value); + if ("DCT".equals(h.filter) || "DCTDecode".equals(h.filter) + || "JPXDecode".equals(h.filter)) { + h.jpeg = true; + h.components = 0; // signals "let ImageIO decide" + } + break; + default: + // Ignored: Decode, Mask, ImageMask, 
Interpolate, DecodeParms, ... + break; + } + } + + private static int parseIntSafe(String s) { + try { + return Integer.parseInt(s); + } catch (NumberFormatException e) { + return 0; + } + } + + private static String stripLeadingSlash(String s) { + return s.startsWith("/") ? s.substring(1) : s; + } + + private static int componentsForAbbreviatedColorSpace(String value) { + String name = stripLeadingSlash(value); + switch (name) { + case "G": + case "DeviceGray": + case "CalGray": + return 1; + case "RGB": + case "DeviceRGB": + case "CalRGB": + return 3; + case "CMYK": + case "DeviceCMYK": + return 4; + default: + return 0; + } + } + } + + /** + * Splits an inline-image header dictionary into whitespace-separated tokens. + * Arrays in the source (rare but legal) are kept as single bracketed tokens so the + * key/value pairing stays consistent. + */ + private static List tokenizeHeader(byte[] header) { + List tokens = new ArrayList<>(); + int i = 0; + while (i < header.length) { + while (i < header.length && isPdfWhitespace(header[i])) { + i++; + } + if (i >= header.length) { + break; + } + int start = i; + if (header[i] == '[') { + while (i < header.length && header[i] != ']') { + i++; + } + if (i < header.length) { + i++; + } + } else { + while (i < header.length && !isPdfWhitespace(header[i])) { + i++; + } + } + tokens.add(new String(header, start, i - start, StandardCharsets.ISO_8859_1)); + } + return tokens; + } + /** * Finds the offset of a two-byte token (e.g. {@code "BI"}) bounded by whitespace. */ @@ -330,6 +572,58 @@ private static int findToken(byte[] buf, int from, char c1, char c2) { return -1; } + /** + * Returns the index of the whitespace byte preceding the closing {@code EI} for the + * inline image whose data starts at {@code dataStart}, using filter-aware framing. 
+ * For JPEG inline images (Filter = DCT / DCTDecode) we scan for the JPEG end-of-image + * marker {@code FFD9} rather than a whitespace-bounded {@code EI}, since the JPEG + * payload routinely contains byte sequences that look like {@code EI} by accident. + * For everything else we fall back to whitespace-bounded {@code EI} search. + * Returns -1 when no usable end can be found. + */ + private static int locateInlineImageDataEnd(byte[] buf, int dataStart, byte[] header) { + if (isJpegInlineHeader(header)) { + int eoi = findJpegEndOfImage(buf, dataStart); + if (eoi >= 0) { + // After FFD9 there should be a single whitespace byte and then "EI". + int p = eoi + 2; + while (p < buf.length && isPdfWhitespace(buf[p])) { + p++; + } + if (p + 1 < buf.length && buf[p] == 'E' && buf[p + 1] == 'I') { + return eoi + 2; + } + // EI marker not found right after EOI; still treat EOI as end of data. + return eoi + 2; + } + } + int eiEnd = findEndInlineImage(buf, dataStart); + return eiEnd < 0 ? -1 : eiEnd - 3; + } + + /** + * Cheaply inspects an already-extracted inline-image header for a DCT filter entry + * without going through the full token parser. Used by the framing detector before + * we commit to a decode strategy. + */ + private static boolean isJpegInlineHeader(byte[] header) { + String s = new String(header, StandardCharsets.ISO_8859_1); + return s.contains("/DCT") || s.contains("/DCTDecode") || s.contains("/JPXDecode"); + } + + /** + * Returns the index of the {@code FF} byte of the JPEG end-of-image marker + * ({@code FFD9}) at or after {@code from}, or -1 if not found. + */ + private static int findJpegEndOfImage(byte[] buf, int from) { + for (int i = from; i < buf.length - 1; i++) { + if (buf[i] == (byte) 0xFF && buf[i + 1] == (byte) 0xD9) { + return i; + } + } + return -1; + } + /** * Returns the index just past the closing {@code EI} of an inline image starting after {@code BI}. 
*/ @@ -779,6 +1073,11 @@ private void concatCtm(float a, float b, float c, float d, float e, float f) { * sub-renderer (Form or Image). Unknown subtypes are silently ignored. */ private void doXObject(String name) { + BufferedImage inline = inlineImages.get(name); + if (inline != null) { + drawUnitSquareImage(inline); + return; + } if (resources == null) { return; } @@ -828,7 +1127,7 @@ private void renderForm(PRStream form) { float w = floatAt(bbox, 2) - x; float h = floatAt(bbox, 3) - y; if (w > 0 && h > 0) { - g2.clip(new java.awt.geom.Rectangle2D.Float(x, y, w, h)); + g2.clip(new Rectangle2D.Float(x, y, w, h)); } } PdfDictionary formResources = form.getAsDict(PdfName.RESOURCES); @@ -856,6 +1155,15 @@ private void renderImage(PRStream image) { if (img == null) { return; } + drawUnitSquareImage(img); + } + + /** + * Draws a decoded image into the standard PDF image area: the (0,0)-(1,1) unit + * square in user space, with the CTM supplying placement/size. Honors the current + * fill alpha; saves and restores the {@link Graphics2D} transform. + */ + private void drawUnitSquareImage(BufferedImage img) { AffineTransform saved = g2.getTransform(); try { // PDF images map (0,0)-(1,1) in user space to the full image, with Y running up. diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java index 369861633..034a40d9a 100644 --- a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java @@ -254,34 +254,73 @@ void rendersFormXObjectViaNestedContentStream() throws Exception { } @Test - void inlineImagesDoNotBreakPageRendering() throws Exception { - // Build a content stream by hand that contains a tiny inline image - // followed by a normal vector draw. 
The inline image isn't expected - // to render, but the rest of the page must still parse and draw. + void inlineImageRendersAtCtmLocation() throws Exception { + // Build a content stream by hand: a 2x2 DeviceGray inline image whose pixels are + // [black, white; white, black], scaled and positioned by a cm before BI so that + // it covers a recognizable area of the page. The renderer must promote the inline + // image to a synthetic XObject and actually draw it under the CTM. byte[] pdf = buildPdf(cb -> { - // BI/ID/EI inline image: 2x2, 8bpc, DeviceGray, 4 raw bytes. String inline = "q\n" + + "120 0 0 120 60 60 cm\n" + "BI /W 2 /H 2 /CS /G /BPC 8 ID\n" - + new String(new byte[]{(byte) 0xFF, 0x00, 0x00, (byte) 0xFF}, + + new String(new byte[]{0x00, (byte) 0xFF, (byte) 0xFF, 0x00}, java.nio.charset.StandardCharsets.ISO_8859_1) + "\nEI\n" + "Q\n" + // Trailing red rect proves the rest of the page also still renders. + "1 0 0 rg\n" - + "50 50 80 80 re f\n"; + + "200 200 30 30 re f\n"; cb.setLiteral(inline); }); try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { - // Even though BI/ID/EI is in the stream, getContentOperators sees the - // BI block as raw tokens — but renderer must not crash and the trailing - // red rectangle must still be drawn. 
BufferedImage img = r.renderPage(1, 150f); - saveForInspection(img, "inline-image-stripped.png"); + saveForInspection(img, "inline-image.png"); + + int dark = countPixelsMatching(img, (red, green, blue) -> + red < 60 && green < 60 && blue < 60); + assertThat(dark) + .as("inline image must paint its black checker squares onto the page") + .isGreaterThan(100); + int redish = countPixelsMatching(img, (red, green, blue) -> red > 200 && green < 80 && blue < 80); assertThat(redish) - .as("after stripping inline image, the trailing red rectangle must still render") - .isGreaterThan(50); + .as("content after the inline image must still render") + .isGreaterThan(10); + } + } + + @Test + void jpegInlineImageDecodes() throws Exception { + // Same as the previous test but the inline image is a JPEG written via the + // PdfContentByte.addImage(image, true) helper, which produces a properly + // framed BI/ID/EI block (binary-safe length tracking, abbreviated filter name). + BufferedImage src = new BufferedImage(16, 16, BufferedImage.TYPE_INT_RGB); + java.awt.Graphics2D gs = src.createGraphics(); + try { + gs.setColor(new Color(0, 200, 0)); + gs.fillRect(0, 0, 16, 16); + } finally { + gs.dispose(); + } + ByteArrayOutputStream jpegOut = new ByteArrayOutputStream(); + ImageIO.write(src, "jpg", jpegOut); + org.openpdf.text.Image pdfImage = org.openpdf.text.Image.getInstance(jpegOut.toByteArray()); + + byte[] pdf = buildPdf(cb -> { + cb.addImage(pdfImage, 120f, 0f, 0f, 120f, 60f, 60f, /* inline */ true); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "inline-image-jpeg.png"); + + int greenish = countPixelsMatching(img, (red, green, blue) -> + green > 150 && red < 100 && blue < 100); + assertThat(greenish) + .as("JPEG inline image must decode and paint its green region") + .isGreaterThan(100); } } From dd9a470c74c9f9f805ca7df7cfefee76a9bc3ef6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 
18 May 2026 10:21:09 +0000 Subject: [PATCH 08/11] Render Indexed-color image XObjects Highest-ROI item left on the renderer roadmap: every PNG-to-PDF conversion produces images with `[/Indexed /DeviceRGB hival lookup]` color spaces, and the renderer was silently skipping them (decodeRawRaster falls through to null for non-Device colorspaces). This commit adds the decode path. - decodeImage now recognizes `[/Indexed base hival lookup]` (with CS_INDEXED constant) and routes to a new decodeIndexedImage. - decodeIndexedImage reads 8-bit indices from the (already Flate-decoded) stream, expands each pixel through the lookup table into the base color space's component bytes, then reuses the existing buildGrayImage / buildRgbImage / buildCmykImage helpers. The base color space's component count is determined via the existing imageComponents(). - readIndexedLookup handles both forms the spec allows: a PdfString containing the palette bytes, or a PRStream whose decoded content is the palette. - Sub-byte bit depths (1/2/4-bit indices) are explicitly rejected for now -- 8-bit is the dominant case for PNG-derived images. Test: - rendersIndexedColorImageXObject builds a 32x32 BufferedImage with an IndexColorModel (top half = magenta, bottom = cyan), embeds it via Image.getInstance(BufferedImage), and asserts both palette colors appear in the rendered page. openpdf-core's Image.getInstance preserves the IndexColorModel as `[/Indexed /DeviceRGB ...]`, so this exercises the new decode path end-to-end. README updated: Indexed moved from "limitations" to the supported Image XObject formats; only sub-byte-packed indexed images remain called out as unsupported. Module test suite: 87 tests, 0 failures. 
--- openpdf-renderer/README.md | 11 ++- .../core/OpenPdfCorePageRenderer.java | 88 ++++++++++++++++++- .../OpenPdfCorePageRendererOperatorsTest.java | 45 ++++++++++ 3 files changed, 139 insertions(+), 5 deletions(-) diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index 38fa0f5f8..7c7c6f900 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -119,7 +119,9 @@ XObject coverage: - Image XObjects decode via `ImageIO` for JPEG (`DCTDecode`) and JPEG 2000 (`JPXDecode`, where the runtime supports it), and via a manual raster builder for uncompressed / Flate-decoded 8-bit DeviceGray, DeviceRGB and - DeviceCMYK streams (CMYK approximated to sRGB on the fly). + DeviceCMYK streams (CMYK approximated to sRGB on the fly). 8-bit Indexed + color images are expanded through their palette into the base color space + (DeviceGray / DeviceRGB / DeviceCMYK). Text rendering: for each `Tf`-selected font, the renderer pulls the embedded font program (`FontFile2`/`FontFile3`/`FontFile`) out of the @@ -163,8 +165,11 @@ The legacy in-tree parser still wins on real-world PDFs that exercise: - **Pattern and shading paint** (`pattern`, `sh`). Ignored. - **Soft masks (`SMask`) and transparency groups.** Ignored; image alpha honors `ca` only, not per-pixel masks. -- **Indexed / Separation / DeviceN color spaces** for images and paths. - Ignored; falls back to filling with the color-space default. +- **Separation / DeviceN color spaces** for images and paths. Ignored; falls + back to filling with the color-space default. (Indexed images are now + supported.) +- **Sub-byte bit depths** (1/2/4-bit indexed images, 1-bit image masks). + Currently only 8-bit indices are decoded. - **Encrypted PDFs.** Out of scope for this module (see "Encryption: removed" below). 
diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index 27564577f..8a8092a9c 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -86,8 +86,9 @@ *
  • XObjects ({@code Do}): Form XObjects (recursive content streams with * their own {@code BBox} / {@code Matrix}) and Image XObjects * (JPEG via {@code DCTDecode}, JPEG2000 via {@code JPXDecode} when - * supported by {@code ImageIO}, and uncompressed / Flate-decoded - * 8-bit DeviceGray / DeviceRGB / DeviceCMYK images).
  • + * supported by {@code ImageIO}, uncompressed / Flate-decoded 8-bit + * DeviceGray / DeviceRGB / DeviceCMYK images, and 8-bit Indexed + * images expanded through the palette into their base color space). * * *

    Inline images ({@code BI}/{@code ID}/{@code EI}) are promoted out of the @@ -143,6 +144,7 @@ final class OpenPdfCorePageRenderer { private static final PdfName CS_CAL_GRAY = new PdfName("CalGray"); private static final PdfName CS_CAL_RGB = new PdfName("CalRGB"); private static final PdfName CS_LAB = new PdfName("Lab"); + private static final PdfName CS_INDEXED = new PdfName("Indexed"); private static final PdfName CS_N = new PdfName("N"); private static final Set DEVICE_GRAY_NAMES = Set.of( PdfName.DEVICEGRAY, new PdfName("G"), CS_CAL_GRAY); @@ -1202,9 +1204,91 @@ private BufferedImage decodeImage(PRStream stream) { if (hasFilter(filterObj, PdfName.DCTDECODE) || hasFilter(filterObj, PdfName.JPXDECODE)) { return decodeViaImageIO(stream); } + PdfArray indexedCs = asIndexedColorSpace(stream.get(PdfName.COLORSPACE)); + if (indexedCs != null) { + return decodeIndexedImage(stream, width, height, indexedCs); + } return decodeRawRaster(stream, width, height); } + /** + * Returns the {@code [/Indexed base hival lookup]} array if {@code csObj} (possibly + * indirect) is an indexed color space; {@code null} otherwise. + */ + private static PdfArray asIndexedColorSpace(PdfObject csObj) { + PdfObject direct = csObj instanceof PRIndirectReference ind + ? PdfReader.getPdfObject(ind) : csObj; + if (!(direct instanceof PdfArray arr) || arr.size() < 4) { + return null; + } + PdfObject head = arr.getDirectObject(0); + return CS_INDEXED.equals(head) ? arr : null; + } + + /** + * Decodes an Indexed image XObject: each pixel is a 1-byte palette index, expanded + * via the lookup table into pixel values in the base color space. Supports 8-bit + * indices and DeviceGray / DeviceRGB / DeviceCMYK base color spaces (the overwhelming + * majority of indexed images produced by PNG-to-PDF conversion). 
+ */ + private BufferedImage decodeIndexedImage(PRStream stream, int width, int height, PdfArray indexedCs) { + try { + PdfNumber bpcN = stream.getAsNumber(PdfName.BITSPERCOMPONENT); + int bpc = bpcN == null ? 8 : bpcN.intValue(); + if (bpc != 8) { + // Bit-packed indices (1/2/4-bit) are legal but rare; not yet supported. + return null; + } + int baseComponents = imageComponents(indexedCs.getDirectObject(1)); + if (baseComponents == 0) { + return null; + } + byte[] lookup = readIndexedLookup(indexedCs.getDirectObject(3)); + if (lookup == null || lookup.length == 0) { + return null; + } + byte[] indices = PdfReader.getStreamBytes(stream); + int pixels = width * height; + if (indices.length < pixels) { + return null; + } + byte[] expanded = new byte[pixels * baseComponents]; + int maxLookupIdx = lookup.length - baseComponents; + for (int p = 0; p < pixels; p++) { + int paletteOffset = Math.min((indices[p] & 0xFF) * baseComponents, Math.max(maxLookupIdx, 0)); + System.arraycopy(lookup, paletteOffset, expanded, p * baseComponents, baseComponents); + } + switch (baseComponents) { + case 1: + return buildGrayImage(expanded, width, height); + case 3: + return buildRgbImage(expanded, width, height); + case 4: + return buildCmykImage(expanded, width, height); + default: + return null; + } + } catch (IOException | RuntimeException e) { + LOG.log(Level.FINE, "Skipping indexed image due to: {0}", e); + return null; + } + } + + /** + * Reads the lookup table of an Indexed color space, which may appear as either a + * {@code PdfString} (containing the raw palette bytes) or a {@code PRStream} (whose + * decoded content is the palette). 
+ */ + private static byte[] readIndexedLookup(PdfObject lookupObj) throws IOException { + if (lookupObj instanceof PdfString s) { + return s.getBytes(); + } + if (lookupObj instanceof PRStream stream) { + return PdfReader.getStreamBytes(stream); + } + return null; + } + private static boolean hasFilter(PdfObject filterObj, PdfName name) { if (filterObj == null) { return false; diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java index 034a40d9a..cfd71d107 100644 --- a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java @@ -194,6 +194,51 @@ void rendersMarkedContentWithoutFailing() throws Exception { } } + @Test + void rendersIndexedColorImageXObject() throws Exception { + // Build an IndexColorModel-backed BufferedImage and embed it via + // Image.getInstance(BufferedImage). openpdf-core preserves the palette and + // writes the image as `[/Indexed /DeviceRGB hival ]`, exercising + // the Indexed-color-space decode path in OpenPdfCorePageRenderer. + java.awt.image.IndexColorModel icm = new java.awt.image.IndexColorModel( + 8, 2, + new byte[]{(byte) 0xFF, 0x00}, // R + new byte[]{0x00, (byte) 0xFF}, // G + new byte[]{(byte) 0xFF, (byte) 0xFF}); // B -> palette: index 0 = magenta, 1 = cyan + BufferedImage indexed = new BufferedImage(32, 32, BufferedImage.TYPE_BYTE_INDEXED, icm); + java.awt.image.WritableRaster raster = indexed.getRaster(); + for (int y = 0; y < 32; y++) { + for (int x = 0; x < 32; x++) { + // Top half = magenta (index 0), bottom half = cyan (index 1). + raster.setSample(x, y, 0, y < 16 ? 
0 : 1); + } + } + org.openpdf.text.Image pdfImage = org.openpdf.text.Image.getInstance(indexed, null); + + byte[] pdf = buildPdf(cb -> { + cb.addImage(pdfImage, 160f, 0f, 0f, 160f, 30f, 80f); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 150f); + saveForInspection(img, "indexed-image.png"); + + // Top half of the image region should contain magenta-ish pixels; + // bottom half cyan-ish. We just need at least some of each to prove + // the palette was decoded and the indices were looked up correctly. + int magenta = countPixelsMatching(img, (red, green, blue) -> + red > 200 && green < 80 && blue > 200); + int cyan = countPixelsMatching(img, (red, green, blue) -> + red < 80 && green > 200 && blue > 200); + assertThat(magenta) + .as("indexed image must produce magenta pixels for palette index 0") + .isGreaterThan(100); + assertThat(cyan) + .as("indexed image must produce cyan pixels for palette index 1") + .isGreaterThan(100); + } + } + @Test void rendersJpegImageXObject() throws Exception { // Generate a 32x16 solid-red JPEG, embed it as an Image XObject in a PDF, From b3b4dd45cd2dd6068f5cc179539b113c6877d572 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 10:24:14 +0000 Subject: [PATCH 09/11] Lower decodeIndexedImage NPath complexity Codacy flagged decodeIndexedImage with NPath 385 (threshold 200). Splits the method along its natural seams without changing behavior: - decodeIndexedImage now just wraps the try/catch around decodeIndexedImageOrThrow. - decodeIndexedImageOrThrow handles validation + orchestration. - readBitsPerComponent extracts the /BitsPerComponent read. - expandIndexedPalette is the per-pixel arraycopy loop. - buildImageForBaseComponents is the switch on component count. No behavior change; module test suite still 87/87 green. 
--- .../core/OpenPdfCorePageRenderer.java | 92 ++++++++++++------- 1 file changed, 57 insertions(+), 35 deletions(-) diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index 8a8092a9c..1cc5e0b50 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -1233,47 +1233,69 @@ private static PdfArray asIndexedColorSpace(PdfObject csObj) { */ private BufferedImage decodeIndexedImage(PRStream stream, int width, int height, PdfArray indexedCs) { try { - PdfNumber bpcN = stream.getAsNumber(PdfName.BITSPERCOMPONENT); - int bpc = bpcN == null ? 8 : bpcN.intValue(); - if (bpc != 8) { - // Bit-packed indices (1/2/4-bit) are legal but rare; not yet supported. - return null; - } - int baseComponents = imageComponents(indexedCs.getDirectObject(1)); - if (baseComponents == 0) { - return null; - } - byte[] lookup = readIndexedLookup(indexedCs.getDirectObject(3)); - if (lookup == null || lookup.length == 0) { - return null; - } - byte[] indices = PdfReader.getStreamBytes(stream); - int pixels = width * height; - if (indices.length < pixels) { - return null; - } - byte[] expanded = new byte[pixels * baseComponents]; - int maxLookupIdx = lookup.length - baseComponents; - for (int p = 0; p < pixels; p++) { - int paletteOffset = Math.min((indices[p] & 0xFF) * baseComponents, Math.max(maxLookupIdx, 0)); - System.arraycopy(lookup, paletteOffset, expanded, p * baseComponents, baseComponents); - } - switch (baseComponents) { - case 1: - return buildGrayImage(expanded, width, height); - case 3: - return buildRgbImage(expanded, width, height); - case 4: - return buildCmykImage(expanded, width, height); - default: - return null; - } + return decodeIndexedImageOrThrow(stream, width, height, indexedCs); } catch (IOException | 
RuntimeException e) { LOG.log(Level.FINE, "Skipping indexed image due to: {0}", e); return null; } } + private BufferedImage decodeIndexedImageOrThrow(PRStream stream, int width, int height, + PdfArray indexedCs) throws IOException { + // Bit-packed indices (1/2/4-bit) are legal but rare; not yet supported. + int bpc = readBitsPerComponent(stream); + if (bpc != 8) { + return null; + } + int baseComponents = imageComponents(indexedCs.getDirectObject(1)); + byte[] lookup = readIndexedLookup(indexedCs.getDirectObject(3)); + if (baseComponents == 0 || lookup == null || lookup.length == 0) { + return null; + } + byte[] indices = PdfReader.getStreamBytes(stream); + int pixels = width * height; + if (indices.length < pixels) { + return null; + } + byte[] expanded = expandIndexedPalette(indices, lookup, pixels, baseComponents); + return buildImageForBaseComponents(expanded, width, height, baseComponents); + } + + private static int readBitsPerComponent(PRStream stream) { + PdfNumber bpcN = stream.getAsNumber(PdfName.BITSPERCOMPONENT); + return bpcN == null ? 8 : bpcN.intValue(); + } + + /** + * Expands one index per pixel into {@code baseComponents} palette bytes per pixel by + * looking up each index in {@code lookup}. Indices whose palette entry would extend + * past the end of the lookup table are clamped to the last valid entry rather than + * throwing — malformed PDFs do happen. 
+ */ + private static byte[] expandIndexedPalette(byte[] indices, byte[] lookup, int pixels, int baseComponents) { + byte[] expanded = new byte[pixels * baseComponents]; + int maxLookupIdx = Math.max(lookup.length - baseComponents, 0); + for (int p = 0; p < pixels; p++) { + int paletteOffset = Math.min((indices[p] & 0xFF) * baseComponents, maxLookupIdx); + System.arraycopy(lookup, paletteOffset, expanded, p * baseComponents, baseComponents); + } + return expanded; + } + + private static BufferedImage buildImageForBaseComponents(byte[] data, int width, int height, + int baseComponents) { + switch (baseComponents) { + case 1: + return buildGrayImage(data, width, height); + case 3: + return buildRgbImage(data, width, height); + case 4: + return buildCmykImage(data, width, height); + default: + return null; + } + } + /** * Reads the lookup table of an Indexed color space, which may appear as either a * {@code PdfString} (containing the raw palette bytes) or a {@code PRStream} (whose From 6004134d720832ff77ebd94dbd6f33b1ba8ab937 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 12:22:21 +0000 Subject: [PATCH 10/11] Improve table rendering in OpenPdfCorePageRenderer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PDFs that draw tables (PdfPTable, hand-rolled re/m/l/S grids, ...) lean hard on three pieces of stroke handling that this renderer was getting wrong or skipping: - Zero-width hairline strokes (PDF §8.4.3.2). `w 0` means "the thinnest line the device can render", i.e. one device pixel. The previous `Math.max(lineWidth, 0.001f)` collapsed those hairlines to invisibility once the page CTM scaled them. Now strokePath() computes an effective width of `1 / max(|sx|, |sy|)` from the current transform so a `0 w` stroke renders as a one-device-pixel line at any DPI. - ExtGState line styling beyond LW/ML/LC/LJ. 
The dash array `/D` and the stroke-adjust flag `/SA` are now read out of gs dictionaries; `/D` feeds the existing dash-pattern path, `/SA` is tracked through q/Q. - Crisp axis-aligned borders. KEY_STROKE_CONTROL is now set to VALUE_STROKE_NORMALIZE so 0.5pt borders snap to integer device pixels instead of smearing into two rows of antialiased grey. Adds two regression tests: a full PdfPTable render (background fills, red 2pt header border, body-row text) and a `0 w` hairline render that asserts the stroke is actually visible after CTM scaling. https://claude.ai/code/session_01Bobvbg8Ccp2g9S5DRFsnNb --- openpdf-renderer/README.md | 13 +- .../core/OpenPdfCorePageRenderer.java | 72 ++++++++++- .../OpenPdfCorePageRendererOperatorsTest.java | 112 ++++++++++++++++++ 3 files changed, 191 insertions(+), 6 deletions(-) diff --git a/openpdf-renderer/README.md b/openpdf-renderer/README.md index 7c7c6f900..322a38c0f 100644 --- a/openpdf-renderer/README.md +++ b/openpdf-renderer/README.md @@ -100,8 +100,8 @@ PDF content-stream operators — sufficient for typical text + vector PDFs: | Category | Operators | |---|---| -| Graphics state | `q`, `Q`, `cm`, `gs` (alpha `CA`/`ca`, line styling) | -| Line style | `w`, `J`, `j`, `M`, `d`, `i` | +| Graphics state | `q`, `Q`, `cm`, `gs` (alpha `CA`/`ca`, line styling `LW`/`ML`/`LC`/`LJ`/`D`, stroke-adjust `SA`) | +| Line style | `w` (including the PDF §8.4.3.2 zero-width hairline rule), `J`, `j`, `M`, `d`, `i` | | Path construction | `m`, `l`, `c`, `v`, `y`, `re`, `h` | | Path painting | `S`, `s`, `f`, `F`, `f*`, `B`, `B*`, `b`, `b*`, `n` | | Clipping | `W`, `W*` | @@ -132,6 +132,15 @@ renderer falls back to a generic Java2D family picked by PostScript-name heuristics — glyph widths from the PDF font are still respected, but shapes are only approximate. 
+Tables: `OpenPdfCorePageRenderer` honors the PDF §8.4.3.2 zero-width hairline +rule (`0 w` strokes are rendered as one device pixel rather than collapsing to +nothing under the page CTM), reads dash patterns and the stroke-adjust flag +from ExtGState (`D`, `SA`), and enables Java2D `KEY_STROKE_CONTROL = +VALUE_STROKE_NORMALIZE` so that 0.5pt table borders snap to integer device +pixels instead of smearing across two rows of antialiased pixels. Full +`PdfPTable` output (cell-background fills, colored borders, header rows and +cell text) is exercised by the renderer's test suite. + Inline images (`BI`/`ID`/`EI`) are now rendered: a preprocess pass promotes each inline image into a synthetic Image XObject (with JPEG framing detected by the JPEG `FFD9` end-of-image marker when the filter is `DCTDecode` to diff --git a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java index 1cc5e0b50..5cfb30ef1 100644 --- a/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java +++ b/openpdf-renderer/src/main/java/org/openpdf/renderer/core/OpenPdfCorePageRenderer.java @@ -66,9 +66,12 @@ *

    Supported operator subset

    *
      *
    • Graphics state: {@code q}, {@code Q}, {@code cm}, - * {@code gs} (alpha {@code CA}/{@code ca} only)
    • - *
    • Line style: {@code w}, {@code J}, {@code j}, {@code M}, {@code d}, - * {@code i} (flatness, no-op)
    • + * {@code gs} (alpha {@code CA}/{@code ca}, line styling + * {@code LW}/{@code ML}/{@code LC}/{@code LJ}/{@code D}, stroke-adjust + * {@code SA}) + *
    • Line style: {@code w} (zero-width strokes follow the PDF §8.4.3.2 + * "one device pixel" hairline rule), {@code J}, {@code j}, {@code M}, + * {@code d}, {@code i} (flatness, no-op)
    • *
    • Path construction: {@code m}, {@code l}, {@code c}, {@code v}, {@code y}, * {@code re}, {@code h}
    • *
    • Path painting: {@code S}, {@code s}, {@code f}, {@code F}, {@code f*}, @@ -131,6 +134,8 @@ final class OpenPdfCorePageRenderer { private static final PdfName EXTGS_ML = new PdfName("ML"); private static final PdfName EXTGS_LC = new PdfName("LC"); private static final PdfName EXTGS_LJ = new PdfName("LJ"); + private static final PdfName EXTGS_D = new PdfName("D"); + private static final PdfName EXTGS_SA = new PdfName("SA"); // FontDescriptor entries that may hold an embedded font program, in preference order: // FontFile2 (TrueType) is by far the most common in modern PDFs; FontFile3 holds CFF / @@ -232,6 +237,12 @@ static void render(PdfReader reader, int pageNumber, Graphics2D g2, RenderingHints.VALUE_TEXT_ANTIALIAS_ON); g2.setRenderingHint(RenderingHints.KEY_FRACTIONALMETRICS, RenderingHints.VALUE_FRACTIONALMETRICS_ON); + // Snap path coordinates to integer device pixels before stroking. Critical for + // crisp table borders and other thin axis-aligned hairlines: without it, a + // 0.5pt border landing at a fractional pixel boundary smears across two rows + // of antialiased pixels and looks fuzzy / grey. + g2.setRenderingHint(RenderingHints.KEY_STROKE_CONTROL, + RenderingHints.VALUE_STROKE_NORMALIZE); // Map PDF user space (origin bottom-left, Y up) to image pixels (origin top-left, Y down), // applying page rotation and DPI scaling. @@ -957,12 +968,42 @@ private void dispatch(String op, List operands) throws IOException { private void strokePath() { g2.setColor(state.strokeColor); g2.setStroke(new BasicStroke( - Math.max(state.lineWidth, 0.001f), + effectiveLineWidth(), state.lineCap, state.lineJoin, state.miterLimit, state.dashPattern, state.dashPhase)); g2.draw(currentPath); } + /** + * Resolves the stroke width to feed to {@link BasicStroke}, honoring the PDF + * spec rule that {@code w 0} means "one device pixel wide" (§8.4.3.2). 
A + * literal {@code 0f} cannot be used as an ordinary user-space width, so + * non-positive widths are replaced by a CTM-derived hairline; positive + * widths, however thin, are passed through unchanged. + * + *

      The minimum width is computed from the current CTM so that the + * resulting stroke is approximately one device pixel regardless of the + * page DPI — without this, table borders rendered with {@code 0 w} + * would disappear at any reasonable rendering resolution.

      + */ + private float effectiveLineWidth() { + if (state.lineWidth > 0f) { + return state.lineWidth; + } + AffineTransform ctm = g2.getTransform(); + // Use the larger of the X/Y scale factors so that a 1-device-pixel width + // is preserved under non-uniform scaling. The scaling we care about is + // |det|^(1/2) for general transforms, but for the common rectilinear + // case the max of the diagonal magnitudes is the cheap, correct answer. + double sx = Math.hypot(ctm.getScaleX(), ctm.getShearY()); + double sy = Math.hypot(ctm.getShearX(), ctm.getScaleY()); + double scale = Math.max(sx, sy); + if (scale <= 0 || Double.isNaN(scale) || Double.isInfinite(scale)) { + return 1f; + } + return (float) (1.0 / scale); + } + private void fillPath(int windingRule) { Path2D.Float p = (Path2D.Float) currentPath.clone(); p.setWindingRule(windingRule); @@ -1061,6 +1102,24 @@ private void applyExtGStateLineStyle(PdfDictionary dict) { if (lj != null) { state.lineJoin = lj.intValue(); } + // /D in an ExtGState dictionary is a two-element array: [dashArray dashPhase]. + // Mirrors the inline `d` operator path. + PdfArray dashEntry = dict.getAsArray(EXTGS_D); + if (dashEntry != null && dashEntry.size() == 2) { + PdfObject pattern = dashEntry.getPdfObject(0); + PdfObject phase = dashEntry.getPdfObject(1); + if (pattern instanceof PdfArray patternArray && phase instanceof PdfNumber phaseNum) { + applyDashPattern(patternArray, phaseNum.floatValue()); + } + } + // /SA is the stroke-adjustment flag (PDF spec §10.6.5). We don't act on it + // beyond recording it: Java2D's KEY_STROKE_CONTROL = VALUE_STROKE_NORMALIZE + // hint is set globally for the page, which is what stroke adjustment does in + // practice (snap thin axis-aligned strokes to integer device pixels). 
+ PdfObject sa = dict.get(EXTGS_SA); + if (sa != null) { + state.strokeAdjust = sa.toString().equalsIgnoreCase("true"); + } } private void concatCtm(float a, float b, float c, float d, float e, float f) { @@ -1822,6 +1881,10 @@ private static final class GState { float miterLimit = 10.0f; float[] dashPattern; float dashPhase; + // PDF §10.6.5 stroke-adjustment flag from ExtGState /SA. Tracked for fidelity + // through q/Q nesting; the actual pixel-snap behavior is provided globally via + // KEY_STROKE_CONTROL = VALUE_STROKE_NORMALIZE. + boolean strokeAdjust; boolean hasPendingClip; int pendingClipRule = Path2D.WIND_NON_ZERO; @@ -1850,6 +1913,7 @@ private static final class GState { this.miterLimit = other.miterLimit; this.dashPattern = other.dashPattern == null ? null : other.dashPattern.clone(); this.dashPhase = other.dashPhase; + this.strokeAdjust = other.strokeAdjust; this.font = other.font; this.fontSize = other.fontSize; this.charSpacing = other.charSpacing; diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java index cfd71d107..812485160 100644 --- a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java @@ -437,4 +437,116 @@ void rendersTextRiseAsVerticalOffset() throws Exception { assertThat(darkPixels).isGreaterThan(20); } } + + /** + * Renders a full {@code PdfPTable} (background fills, colored borders, header row, + * cell text) and checks that the renderer produces all three of: cell-background + * fills, border strokes in their declared color, and cell text. This is a + * regression guard for table rendering as a whole — many small operators + * (re/f/S/m/l/Tj plus q/Q/cm/w nesting) have to cooperate to get a usable table. 
+ */ + @Test + void rendersPdfPTableWithBordersFillsAndText() throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (Document doc = new Document(new Rectangle(PageSize.A6))) { + PdfWriter.getInstance(doc, baos); + doc.open(); + org.openpdf.text.pdf.PdfPTable table = new org.openpdf.text.pdf.PdfPTable(3); + table.setTotalWidth(240f); + table.setLockedWidth(true); + + // Header row: blue background, white text, thicker red border. + for (String header : new String[]{"Col A", "Col B", "Col C"}) { + org.openpdf.text.pdf.PdfPCell hc = new org.openpdf.text.pdf.PdfPCell( + new org.openpdf.text.Phrase(header, + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 10f, + org.openpdf.text.Font.BOLD, Color.WHITE))); + hc.setBackgroundColor(new Color(0, 0, 200)); + hc.setBorderColor(Color.RED); + hc.setBorderWidth(2f); + hc.setPadding(4f); + table.addCell(hc); + } + // Body rows: default thin black borders, body text. + for (int row = 0; row < 2; row++) { + for (int col = 0; col < 3; col++) { + org.openpdf.text.pdf.PdfPCell bc = new org.openpdf.text.pdf.PdfPCell( + new org.openpdf.text.Phrase("r" + row + "c" + col)); + bc.setPadding(4f); + table.addCell(bc); + } + } + doc.add(table); + } + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(baos.toByteArray())) { + BufferedImage img = r.renderPage(1, 200f); + saveForInspection(img, "pdfptable.png"); + + int blueHeaderFill = countPixelsMatching(img, (red, green, blue) -> + blue > 150 && red < 80 && green < 80); + assertThat(blueHeaderFill) + .as("header-row blue background fill must be visible") + .isGreaterThan(200); + + int redBorder = countPixelsMatching(img, (red, green, blue) -> + red > 180 && green < 80 && blue < 80); + assertThat(redBorder) + .as("header-row red 2pt border strokes must be visible") + .isGreaterThan(20); + + // Body cells sit just below the header. 
Look for dark glyph pixels in the + // band of the image that contains the two body rows (header row pixels + // are white-on-blue, so the dark pixels there are body text + borders). + // We deliberately skip the header band to verify body-row text rendered. + int bandTop = (int) (img.getHeight() * 0.10); + int bandBottom = (int) (img.getHeight() * 0.30); + int bodyDark = 0; + for (int y = bandTop; y < bandBottom; y += 2) { + for (int x = 0; x < img.getWidth(); x += 2) { + int argb = img.getRGB(x, y); + int rch = (argb >> 16) & 0xFF; + int gch = (argb >> 8) & 0xFF; + int bch = argb & 0xFF; + if (rch < 80 && gch < 80 && bch < 80) { + bodyDark++; + } + } + } + assertThat(bodyDark) + .as("body-row text must produce dark glyph pixels in the body band") + .isGreaterThan(20); + } + } + + /** + * PDF §8.4.3.2: a stroke width of 0 means "the thinnest line the device can + * render", i.e. one device pixel. Naively passing the user-space width to + * {@link java.awt.BasicStroke} would collapse to nothing once the page CTM + * scales it. This test draws a {@code 0 w} line and verifies the stroke is + * actually visible — which is the common case for PDFs that draw table grids + * with hairlines. 
+ */ + @Test + void rendersZeroWidthStrokeAsDevicePixelHairline() throws Exception { + byte[] pdf = buildPdf(cb -> { + cb.setRGBColorStrokeF(0f, 0f, 0f); + cb.setLineWidth(0f); // hairline + for (int i = 0; i < 5; i++) { + cb.moveTo(20f, 40f + 15f * i); + cb.lineTo(220f, 40f + 15f * i); + } + cb.stroke(); + }); + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(pdf)) { + BufferedImage img = r.renderPage(1, 200f); + saveForInspection(img, "hairline-stroke.png"); + int dark = countPixelsMatching(img, (red, green, blue) -> + red < 100 && green < 100 && blue < 100); + assertThat(dark) + .as("PDF zero-width strokes must render as visible hairlines, not vanish") + .isGreaterThan(20); + } + } } From 938a0b1effaeafca71b29ed0ab91d2a3cee1bf3e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 12:34:58 +0000 Subject: [PATCH 11/11] Add richer table-text rendering test The existing PdfPTable test only exercised single-word cell values ("Col A", "r0c0"). This adds a regression test that pushes harder on the text-in-table path: multi-line wrapped descriptions, a Phrase composed of multiple Chunks with different fonts and colors (regular, bold, italic, RED), varied horizontal alignments, a colored colspan cell with vertical centering, and a larger header font. The four assertions cover the parts that are easy to silently break: - white-on-blue header glyphs (header row text under cell background), - a red Chunk inside an otherwise-black Phrase (per-Chunk fill color), - a blue colspan-cell Phrase (text under multi-column layout), - a multi-line wrapped cell producing several distinct glyph rows. 
https://claude.ai/code/session_01Bobvbg8Ccp2g9S5DRFsnNb --- .../OpenPdfCorePageRendererOperatorsTest.java | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java index 812485160..f63520123 100644 --- a/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java +++ b/openpdf-renderer/src/test/java/org/openpdf/renderer/core/OpenPdfCorePageRendererOperatorsTest.java @@ -549,4 +549,173 @@ void rendersZeroWidthStrokeAsDevicePixelHairline() throws Exception { .isGreaterThan(20); } } + + /** + * Exercises the harder slice of "text inside a table cell": multi-line wrapped + * text in a wide cell, a {@code Phrase} composed of multiple {@link + * org.openpdf.text.Chunk}s with different fonts and colors, varied horizontal + * alignments (left / center / right), and a larger header font. The renderer + * has to handle the resulting {@code Tj} stream — one short showText per + * line, with {@code Td}/{@code Tm} moves between lines — while sitting + * under the {@code q ... re W n} clipping that {@code PdfPTable} wraps every + * cell in. + */ + @Test + void rendersTableWithRichCellText() throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (Document doc = new Document(new Rectangle(PageSize.A5))) { + PdfWriter.getInstance(doc, baos); + doc.open(); + + org.openpdf.text.pdf.PdfPTable table = new org.openpdf.text.pdf.PdfPTable(3); + table.setTotalWidth(360f); + table.setLockedWidth(true); + table.setWidths(new float[]{1f, 2f, 1f}); + + // Larger, bold header row. 
+ org.openpdf.text.Font headerFont = new org.openpdf.text.Font( + org.openpdf.text.Font.HELVETICA, 14f, org.openpdf.text.Font.BOLD, Color.WHITE); + for (String header : new String[]{"#", "Description", "Qty"}) { + org.openpdf.text.pdf.PdfPCell hc = new org.openpdf.text.pdf.PdfPCell( + new org.openpdf.text.Phrase(header, headerFont)); + hc.setBackgroundColor(new Color(40, 40, 120)); + hc.setHorizontalAlignment(org.openpdf.text.Element.ALIGN_CENTER); + hc.setPadding(6f); + table.addCell(hc); + } + + // Row 1: left-aligned number, multi-line wrapped description, right-aligned qty. + table.addCell(numberCell("1", org.openpdf.text.Element.ALIGN_LEFT)); + table.addCell(new org.openpdf.text.pdf.PdfPCell(new org.openpdf.text.Phrase( + "First line of a long description that should wrap onto a second " + + "and probably a third line inside its table cell.", + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 10f)))); + table.addCell(numberCell("12", org.openpdf.text.Element.ALIGN_RIGHT)); + + // Row 2: a cell with mixed Chunks (regular, bold, italic, colored). + table.addCell(numberCell("2", org.openpdf.text.Element.ALIGN_LEFT)); + org.openpdf.text.Phrase mixed = new org.openpdf.text.Phrase(); + mixed.add(new org.openpdf.text.Chunk("Regular ", + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 11f))); + mixed.add(new org.openpdf.text.Chunk("bold ", + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 11f, + org.openpdf.text.Font.BOLD))); + mixed.add(new org.openpdf.text.Chunk("italic ", + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 11f, + org.openpdf.text.Font.ITALIC))); + mixed.add(new org.openpdf.text.Chunk("RED", + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 11f, + org.openpdf.text.Font.BOLD, Color.RED))); + table.addCell(new org.openpdf.text.pdf.PdfPCell(mixed)); + table.addCell(numberCell("345", org.openpdf.text.Element.ALIGN_RIGHT)); + + // Row 3: a centered, vertically centered cell spanning two columns. 
+ org.openpdf.text.pdf.PdfPCell spanCell = new org.openpdf.text.pdf.PdfPCell( + new org.openpdf.text.Phrase("Centered span", + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 12f, + org.openpdf.text.Font.NORMAL, Color.BLUE))); + spanCell.setColspan(2); + spanCell.setHorizontalAlignment(org.openpdf.text.Element.ALIGN_CENTER); + spanCell.setVerticalAlignment(org.openpdf.text.Element.ALIGN_MIDDLE); + spanCell.setFixedHeight(40f); + table.addCell(spanCell); + table.addCell(numberCell("6789", org.openpdf.text.Element.ALIGN_RIGHT)); + + doc.add(table); + } + + try (OpenPdfCoreRenderer r = new OpenPdfCoreRenderer(baos.toByteArray())) { + BufferedImage img = r.renderPage(1, 200f); + saveForInspection(img, "pdfptable-rich-text.png"); + + // (1) White header glyphs land on the dark-blue header background. + int whiteOnBlueHeader = countWhiteOnBlueHeaderPixels(img); + assertThat(whiteOnBlueHeader) + .as("header row must show white glyphs on a dark-blue background") + .isGreaterThan(20); + + // (2) Red glyph chunk renders inside its row. + int red = countPixelsMatching(img, (rch, gch, bch) -> + rch > 180 && gch < 80 && bch < 80); + assertThat(red) + .as("Chunk with explicit red font color must produce red text pixels") + .isGreaterThan(20); + + // (3) Blue span-cell text renders. + int blue = countPixelsMatching(img, (rch, gch, bch) -> + bch > 180 && rch < 80 && gch < 80); + assertThat(blue) + .as("colspan cell with blue Phrase font must produce blue text pixels") + .isGreaterThan(20); + + // (4) The long wrapped description produces several distinct lines of glyphs. 
+ int textRowsWithGlyphs = countDarkGlyphTextRows(img); + assertThat(textRowsWithGlyphs) + .as("wrapped multi-line cell text must produce multiple rows of glyphs") + .isGreaterThanOrEqualTo(3); + } + } + + private static org.openpdf.text.pdf.PdfPCell numberCell(String text, int alignment) { + org.openpdf.text.pdf.PdfPCell c = new org.openpdf.text.pdf.PdfPCell( + new org.openpdf.text.Phrase(text, + new org.openpdf.text.Font(org.openpdf.text.Font.HELVETICA, 11f))); + c.setHorizontalAlignment(alignment); + c.setPadding(6f); + return c; + } + + private static int countWhiteOnBlueHeaderPixels(BufferedImage img) { + int matches = 0; + for (int y = 0; y < img.getHeight() / 4; y++) { + for (int x = 0; x < img.getWidth(); x++) { + int argb = img.getRGB(x, y); + int r = (argb >> 16) & 0xFF; + int g = (argb >> 8) & 0xFF; + int b = argb & 0xFF; + if (r > 220 && g > 220 && b > 220) { + int neighborX = Math.min(img.getWidth() - 1, x + 4); + int neighbor = img.getRGB(neighborX, y); + int nr = (neighbor >> 16) & 0xFF; + int ng = (neighbor >> 8) & 0xFF; + int nb = neighbor & 0xFF; + if (nb > 60 && nr < 100 && ng < 100) { + matches++; + } + } + } + } + return matches; + } + + /** + * Counts how many horizontal scanlines contain a non-trivial number of dark + * glyph-like pixels. A cell with three lines of text should produce ~three + * such bands; a cell with one line should produce ~one. + */ + private static int countDarkGlyphTextRows(BufferedImage img) { + int rows = 0; + boolean inRow = false; + for (int y = 0; y < img.getHeight(); y++) { + int dark = 0; + for (int x = 0; x < img.getWidth(); x++) { + int argb = img.getRGB(x, y); + int r = (argb >> 16) & 0xFF; + int g = (argb >> 8) & 0xFF; + int b = argb & 0xFF; + if (r < 80 && g < 80 && b < 80) { + dark++; + } + } + // 6 = a few glyph strokes; below that is just border-line bleed. 
+ boolean isGlyphRow = dark > 6 && dark < img.getWidth() / 2; + if (isGlyphRow && !inRow) { + rows++; + inRow = true; + } else if (!isGlyphRow) { + inRow = false; + } + } + return rows; + } }