Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,29 @@ def _non_negative_int(value: str) -> int:
_INVALID_UNICODE_RE = re.compile(r"[\ud800-\udfff\x00]")


def _extract_failed_pages_from_errors(errors: list[str]) -> list[int]:
"""Extract failed page numbers from error messages.

Docling error messages follow the pattern "Page N: <error>" (e.g.,
"Page 26: std::bad_alloc"). Even when docling includes failed pages
in the pages dict as empty entries, the error messages reliably
indicate which pages actually failed.

Args:
errors: List of error message strings.

Returns:
Sorted list of 1-indexed page numbers that failed.
"""
failed = set()
page_pattern = re.compile(r"^Page\s+(\d+):")
for msg in errors:
m = page_pattern.match(msg)
if m:
failed.add(int(m.group(1)))
return sorted(failed)


def build_conversion_response(
status_value: str,
json_content: dict,
Expand All @@ -106,10 +129,12 @@ def build_conversion_response(
) -> dict:
"""Build a structured conversion response with status and failed page info.

When Docling encounters errors (e.g., Invalid code point in PDF font encoding),
it skips the affected pages and returns PARTIAL_SUCCESS. This function detects
which pages failed by comparing the requested page range against pages present
in the output.
When Docling encounters errors (e.g., std::bad_alloc in PDF preprocessing),
it may still include failed pages as empty entries in the pages dict.
This function combines two strategies to detect failed pages:
1. Parse page numbers from error messages ("Page N: <error>")
2. Detect pages missing from the output pages dict (gap detection)
Both results are merged (union) since each catches a different failure mode.

Args:
status_value: Docling ConversionStatus value as string (e.g., "success", "partial_success").
Expand All @@ -126,7 +151,14 @@ def build_conversion_response(
failed_pages: list[int] = []

if status_value == "partial_success":
# Detect failed pages by finding gaps in the pages dict
# Strategy 1: Extract failed pages from error messages (reliable —
# docling may include failed pages as empty entries in the pages dict,
# making gap detection ineffective)
error_failed = set(_extract_failed_pages_from_errors(errors))

# Strategy 2: Detect pages missing from the pages dict (catches
# failures that don't produce "Page N:" error messages)
gap_failed: set[int] = set()
pages_dict = json_content.get("pages", {})
present_pages = set()
for k in pages_dict.keys():
Expand All @@ -140,15 +172,17 @@ def build_conversion_response(
elif total_pages is not None:
expected_pages = set(range(1, total_pages + 1))
elif present_pages:
# Fallback: infer range from min to max of present pages
logger.warning(
"No page range or total_pages available; boundary page failures cannot be detected"
)
expected_pages = set(range(min(present_pages), max(present_pages) + 1))
else:
expected_pages = set()

failed_pages = sorted(expected_pages - present_pages)
gap_failed = expected_pages - present_pages

# Union: each strategy catches a different failure mode
failed_pages = sorted(error_failed | gap_failed)

response: dict[str, Any] = {
"status": status_value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
- error messages from Docling
"""

from opendataloader_pdf.hybrid_server import build_conversion_response
from opendataloader_pdf.hybrid_server import (
_extract_failed_pages_from_errors,
build_conversion_response,
)


class TestBuildConversionResponse:
Expand Down Expand Up @@ -163,7 +166,7 @@ def test_failure_status_no_failed_pages_detection(self):
assert response["failed_pages"] == []

def test_partial_success_missing_pages_key(self):
"""json_content without 'pages' key should produce empty failed_pages."""
"""json_content without 'pages' key should mark all requested pages as failed."""
response = build_conversion_response(
status_value="partial_success",
json_content={"body": {"text": "hello"}},
Expand All @@ -173,3 +176,109 @@ def test_partial_success_missing_pages_key(self):
)
assert response["status"] == "partial_success"
assert response["failed_pages"] == [1, 2, 3]


class TestExtractFailedPagesFromErrors:
    """Tests for error message-based failed page detection."""

    def test_std_bad_alloc_errors(self):
        """Page numbers should be extracted from 'Page N: std::bad_alloc' messages."""
        messages = [
            "Page 26: std::bad_alloc",
            "Page 27: std::bad_alloc",
            "Page 28: std::bad_alloc",
        ]
        result = _extract_failed_pages_from_errors(messages)
        assert result == [26, 27, 28]

    def test_mixed_error_formats(self):
        """Only 'Page N:' prefixed messages should be matched."""
        messages = [
            "Page 5: Invalid code point",
            "Unknown page: pipeline terminated early",
            "Page 10: std::bad_alloc",
        ]
        result = _extract_failed_pages_from_errors(messages)
        assert result == [5, 10]

    def test_no_page_errors(self):
        """Non-page errors should return empty list."""
        messages = [
            "Unknown page: pipeline terminated early",
            "General error occurred",
        ]
        result = _extract_failed_pages_from_errors(messages)
        assert result == []


class TestBuildConversionResponseErrorParsing:
    """Tests for failed page detection via error message parsing.

    When docling includes failed pages as empty entries in the pages dict,
    gap detection fails. Error message parsing handles this case.
    """

    def test_failed_pages_with_empty_entries_in_pages_dict(self):
        """Failed pages present as empty entries should still be detected via errors."""
        pages = {str(n): {} for n in range(1, 6)}
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": pages},
            processing_time=2.0,
            errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"],
            requested_pages=(1, 5),
        )
        assert response["failed_pages"] == [4, 5]

    def test_boundary_pages_detected_via_errors(self):
        """Boundary page failures should be detected even without page range."""
        pages = {str(n): {} for n in range(1, 6)}
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": pages},
            processing_time=2.0,
            errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"],
            requested_pages=None,
            total_pages=None,
        )
        assert response["failed_pages"] == [4, 5]

    def test_both_strategies_combined(self):
        """Union of error-parsed and gap-detected pages should be reported.

        Page 2 is missing from dict (gap), pages 4-5 have error messages
        but are present as empty entries. All three must appear.
        """
        # Page 2 deliberately absent to exercise gap detection.
        pages = {str(n): {} for n in (1, 3, 4, 5)}
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": pages},
            processing_time=2.0,
            errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"],
            requested_pages=(1, 5),
        )
        assert response["failed_pages"] == [2, 4, 5]

    def test_duplicate_page_in_errors(self):
        """Duplicate page numbers in error messages should be deduplicated."""
        messages = [
            "Page 3: std::bad_alloc",
            "Page 3: Invalid code point",
        ]
        assert _extract_failed_pages_from_errors(messages) == [3]

    def test_no_page_pattern_errors_falls_back_to_gap(self):
        """When errors lack 'Page N:' pattern, gap detection should still work."""
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": {"1": {}, "3": {}}},
            processing_time=1.0,
            errors=["Unknown page: pipeline terminated early"],
            requested_pages=(1, 3),
        )
        assert response["failed_pages"] == [2]

    def test_empty_errors_with_partial_success(self):
        """Partial success with no error messages should still detect gaps."""
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": {"1": {}, "3": {}}},
            processing_time=1.0,
            errors=[],
            requested_pages=(1, 3),
        )
        assert response["failed_pages"] == [2]