diff --git a/python/opendataloader-pdf/src/opendataloader_pdf/hybrid_server.py b/python/opendataloader-pdf/src/opendataloader_pdf/hybrid_server.py index c989e33c9..6909ac6a3 100644 --- a/python/opendataloader-pdf/src/opendataloader_pdf/hybrid_server.py +++ b/python/opendataloader-pdf/src/opendataloader_pdf/hybrid_server.py @@ -100,6 +100,29 @@ def _non_negative_int(value: str) -> int: _INVALID_UNICODE_RE = re.compile(r"[\ud800-\udfff\x00]") +def _extract_failed_pages_from_errors(errors: list[str]) -> list[int]: + """Extract failed page numbers from error messages. + + Docling error messages follow the pattern "Page N: " (e.g., + "Page 26: std::bad_alloc"). Even when docling includes failed pages + in the pages dict as empty entries, the error messages reliably + indicate which pages actually failed. + + Args: + errors: List of error message strings. + + Returns: + Sorted list of 1-indexed page numbers that failed. + """ + failed = set() + page_pattern = re.compile(r"^Page\s+(\d+):") + for msg in errors: + m = page_pattern.match(msg) + if m: + failed.add(int(m.group(1))) + return sorted(failed) + + def extract_timings(result: Any) -> dict[str, Any]: """Extract per-step pipeline timings from a Docling ConversionResult. @@ -135,10 +158,12 @@ def build_conversion_response( ) -> dict: """Build a structured conversion response with status and failed page info. - When Docling encounters errors (e.g., Invalid code point in PDF font encoding), - it skips the affected pages and returns PARTIAL_SUCCESS. This function detects - which pages failed by comparing the requested page range against pages present - in the output. + When Docling encounters errors (e.g., std::bad_alloc in PDF preprocessing), + it may still include failed pages as empty entries in the pages dict. + This function combines two strategies to detect failed pages: + 1. Parse page numbers from error messages ("Page N: ") + 2. Detect pages missing from the output pages dict (gap detection) + Both results are merged (union) since each catches a different failure mode. Args: status_value: Docling ConversionStatus value as string (e.g., "success", "partial_success"). @@ -155,7 +180,14 @@ def build_conversion_response( failed_pages: list[int] = [] if status_value == "partial_success": - # Detect failed pages by finding gaps in the pages dict + # Strategy 1: Extract failed pages from error messages (reliable — + # docling may include failed pages as empty entries in the pages dict, + # making gap detection ineffective) + error_failed = set(_extract_failed_pages_from_errors(errors)) + + # Strategy 2: Detect pages missing from the pages dict (catches + # failures that don't produce "Page N:" error messages) + gap_failed: set[int] = set() pages_dict = json_content.get("pages", {}) present_pages = set() for k in pages_dict.keys(): @@ -169,7 +201,6 @@ def build_conversion_response( elif total_pages is not None: expected_pages = set(range(1, total_pages + 1)) elif present_pages: - # Fallback: infer range from min to max of present pages logger.warning( "No page range or total_pages available; boundary page failures cannot be detected" ) @@ -177,7 +208,10 @@ def build_conversion_response( else: expected_pages = set() - failed_pages = sorted(expected_pages - present_pages) + gap_failed = expected_pages - present_pages + + # Union: each strategy catches a different failure mode + failed_pages = sorted(error_failed | gap_failed) response: dict[str, Any] = { "status": status_value, diff --git a/python/opendataloader-pdf/tests/test_hybrid_server_partial_success.py b/python/opendataloader-pdf/tests/test_hybrid_server_partial_success.py index 7cbc04ef2..e7905e535 100644 --- a/python/opendataloader-pdf/tests/test_hybrid_server_partial_success.py +++ b/python/opendataloader-pdf/tests/test_hybrid_server_partial_success.py @@ -7,7 +7,10 @@ - error messages from Docling """ -from opendataloader_pdf.hybrid_server import build_conversion_response +from opendataloader_pdf.hybrid_server import ( + _extract_failed_pages_from_errors, + build_conversion_response, +) class TestBuildConversionResponse: @@ -163,7 +166,7 @@ def test_failure_status_no_failed_pages_detection(self): assert response["failed_pages"] == [] def test_partial_success_missing_pages_key(self): - """json_content without 'pages' key should produce empty failed_pages.""" + """json_content without 'pages' key should mark all requested pages as failed.""" response = build_conversion_response( status_value="partial_success", json_content={"body": {"text": "hello"}}, @@ -173,3 +176,120 @@ def test_partial_success_missing_pages_key(self): ) assert response["status"] == "partial_success" assert response["failed_pages"] == [1, 2, 3] + + +class TestExtractFailedPagesFromErrors: + """Tests for error message-based failed page detection.""" + + def test_std_bad_alloc_errors(self): + """Page numbers should be extracted from 'Page N: std::bad_alloc' messages.""" + errors = [ + "Page 26: std::bad_alloc", + "Page 27: std::bad_alloc", + "Page 28: std::bad_alloc", + ] + assert _extract_failed_pages_from_errors(errors) == [26, 27, 28] + + def test_mixed_error_formats(self): + """Only 'Page N:' prefixed messages should be matched.""" + errors = [ + "Page 5: Invalid code point", + "Unknown page: pipeline terminated early", + "Page 10: std::bad_alloc", + ] + assert _extract_failed_pages_from_errors(errors) == [5, 10] + + def test_no_page_errors(self): + """Non-page errors should return empty list.""" + errors = [ + "Unknown page: pipeline terminated early", + "General error occurred", + ] + assert _extract_failed_pages_from_errors(errors) == [] + + +class TestBuildConversionResponseErrorParsing: + """Tests for failed page detection via error message parsing. + + When docling includes failed pages as empty entries in the pages dict, + gap detection fails. Error message parsing handles this case. + """ + + def test_failed_pages_with_empty_entries_in_pages_dict(self): + """Failed pages present as empty entries should still be detected via errors.""" + response = build_conversion_response( + status_value="partial_success", + json_content={"pages": {"1": {}, "2": {}, "3": {}, "4": {}, "5": {}}}, + processing_time=2.0, + errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"], + requested_pages=(1, 5), + ) + assert response["failed_pages"] == [4, 5] + + def test_boundary_pages_detected_via_errors(self): + """Boundary page failures should be detected even without page range.""" + response = build_conversion_response( + status_value="partial_success", + json_content={"pages": {"1": {}, "2": {}, "3": {}, "4": {}, "5": {}}}, + processing_time=2.0, + errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"], + requested_pages=None, + total_pages=None, + ) + assert response["failed_pages"] == [4, 5] + + def test_both_strategies_combined(self): + """Union of error-parsed and gap-detected pages should be reported. + + Page 2 is missing from dict (gap), pages 4-5 have error messages + but are present as empty entries. All three must appear. + """ + response = build_conversion_response( + status_value="partial_success", + json_content={"pages": {"1": {}, "3": {}, "4": {}, "5": {}}}, + processing_time=2.0, + errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"], + requested_pages=(1, 5), + ) + assert response["failed_pages"] == [2, 4, 5] + + def test_overlap_between_gap_and_error_is_deduplicated(self): + """Same page from gap and error parsing should appear once.""" + response = build_conversion_response( + status_value="partial_success", + json_content={"pages": {"1": {}, "3": {}}}, + processing_time=1.0, + errors=["Page 2: std::bad_alloc"], + requested_pages=(1, 3), + ) + assert response["failed_pages"] == [2] + + def test_duplicate_page_in_errors(self): + """Duplicate page numbers in error messages should be deduplicated.""" + errors = [ + "Page 3: std::bad_alloc", + "Page 3: Invalid code point", + ] + assert _extract_failed_pages_from_errors(errors) == [3] + + def test_no_page_pattern_errors_falls_back_to_gap(self): + """When errors lack 'Page N:' pattern, gap detection should still work.""" + response = build_conversion_response( + status_value="partial_success", + json_content={"pages": {"1": {}, "3": {}}}, + processing_time=1.0, + errors=["Unknown page: pipeline terminated early"], + requested_pages=(1, 3), + ) + assert response["failed_pages"] == [2] + + def test_empty_errors_with_partial_success(self): + """Partial success with no error messages should still detect gaps.""" + response = build_conversion_response( + status_value="partial_success", + json_content={"pages": {"1": {}, "3": {}}}, + processing_time=1.0, + errors=[], + requested_pages=(1, 3), + ) + assert response["failed_pages"] == [2]