Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,29 @@ def _non_negative_int(value: str) -> int:
_INVALID_UNICODE_RE = re.compile(r"[\ud800-\udfff\x00]")


def _extract_failed_pages_from_errors(errors: list[str]) -> list[int]:
"""Extract failed page numbers from error messages.

Docling error messages follow the pattern "Page N: <error>" (e.g.,
"Page 26: std::bad_alloc"). Even when docling includes failed pages
in the pages dict as empty entries, the error messages reliably
indicate which pages actually failed.

Args:
errors: List of error message strings.

Returns:
Sorted list of 1-indexed page numbers that failed.
"""
failed = set()
page_pattern = re.compile(r"^Page\s+(\d+):")
for msg in errors:
m = page_pattern.match(msg)
if m:
failed.add(int(m.group(1)))
return sorted(failed)


def build_conversion_response(
status_value: str,
json_content: dict,
Expand All @@ -106,10 +129,12 @@ def build_conversion_response(
) -> dict:
"""Build a structured conversion response with status and failed page info.

When Docling encounters errors (e.g., Invalid code point in PDF font encoding),
it skips the affected pages and returns PARTIAL_SUCCESS. This function detects
which pages failed by comparing the requested page range against pages present
in the output.
When Docling encounters errors (e.g., std::bad_alloc in PDF preprocessing),
it may still include failed pages as empty entries in the pages dict.
This function combines two strategies to detect failed pages:
1. Parse page numbers from error messages ("Page N: <error>")
2. Detect pages missing from the output pages dict (gap detection)
Both results are merged (union) since each catches a different failure mode.

Args:
status_value: Docling ConversionStatus value as string (e.g., "success", "partial_success").
Expand All @@ -126,7 +151,14 @@ def build_conversion_response(
failed_pages: list[int] = []

if status_value == "partial_success":
# Detect failed pages by finding gaps in the pages dict
# Strategy 1: Extract failed pages from error messages (reliable —
# docling may include failed pages as empty entries in the pages dict,
# making gap detection ineffective)
error_failed = set(_extract_failed_pages_from_errors(errors))

# Strategy 2: Detect pages missing from the pages dict (catches
# failures that don't produce "Page N:" error messages)
gap_failed: set[int] = set()
pages_dict = json_content.get("pages", {})
present_pages = set()
for k in pages_dict.keys():
Expand All @@ -140,15 +172,17 @@ def build_conversion_response(
elif total_pages is not None:
expected_pages = set(range(1, total_pages + 1))
elif present_pages:
# Fallback: infer range from min to max of present pages
logger.warning(
"No page range or total_pages available; boundary page failures cannot be detected"
)
expected_pages = set(range(min(present_pages), max(present_pages) + 1))
else:
expected_pages = set()

failed_pages = sorted(expected_pages - present_pages)
gap_failed = expected_pages - present_pages

# Union: each strategy catches a different failure mode
failed_pages = sorted(error_failed | gap_failed)

response: dict[str, Any] = {
"status": status_value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
- error messages from Docling
"""

from opendataloader_pdf.hybrid_server import build_conversion_response
from opendataloader_pdf.hybrid_server import (
_extract_failed_pages_from_errors,
build_conversion_response,
)


class TestBuildConversionResponse:
Expand Down Expand Up @@ -163,7 +166,7 @@ def test_failure_status_no_failed_pages_detection(self):
assert response["failed_pages"] == []

def test_partial_success_missing_pages_key(self):
"""json_content without 'pages' key should produce empty failed_pages."""
"""json_content without 'pages' key should mark all requested pages as failed."""
response = build_conversion_response(
status_value="partial_success",
json_content={"body": {"text": "hello"}},
Expand All @@ -173,3 +176,109 @@ def test_partial_success_missing_pages_key(self):
)
assert response["status"] == "partial_success"
assert response["failed_pages"] == [1, 2, 3]


class TestExtractFailedPagesFromErrors:
    """Tests for error message-based failed page detection."""

    def test_std_bad_alloc_errors(self):
        """Page numbers should be extracted from 'Page N: std::bad_alloc' messages."""
        messages = [
            "Page 26: std::bad_alloc",
            "Page 27: std::bad_alloc",
            "Page 28: std::bad_alloc",
        ]
        result = _extract_failed_pages_from_errors(messages)
        assert result == [26, 27, 28]

    def test_mixed_error_formats(self):
        """Only 'Page N:' prefixed messages should be matched."""
        messages = [
            "Page 5: Invalid code point",
            "Unknown page: pipeline terminated early",
            "Page 10: std::bad_alloc",
        ]
        result = _extract_failed_pages_from_errors(messages)
        assert result == [5, 10]

    def test_no_page_errors(self):
        """Non-page errors should return empty list."""
        messages = [
            "Unknown page: pipeline terminated early",
            "General error occurred",
        ]
        result = _extract_failed_pages_from_errors(messages)
        assert result == []


class TestBuildConversionResponseErrorParsing:
    """Tests for failed page detection via error message parsing.

    When docling includes failed pages as empty entries in the pages dict,
    gap detection fails. Error message parsing handles this case.
    """

    def test_failed_pages_with_empty_entries_in_pages_dict(self):
        """Failed pages present as empty entries should still be detected via errors."""
        pages = {str(n): {} for n in range(1, 6)}
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": pages},
            processing_time=2.0,
            errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"],
            requested_pages=(1, 5),
        )
        assert response["failed_pages"] == [4, 5]

    def test_boundary_pages_detected_via_errors(self):
        """Boundary page failures should be detected even without page range."""
        pages = {str(n): {} for n in range(1, 6)}
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": pages},
            processing_time=2.0,
            errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"],
            requested_pages=None,
            total_pages=None,
        )
        assert response["failed_pages"] == [4, 5]

    def test_both_strategies_combined(self):
        """Union of error-parsed and gap-detected pages should be reported.

        Page 2 is missing from dict (gap), pages 4-5 have error messages
        but are present as empty entries. All three must appear.
        """
        # Page 2 deliberately absent to exercise gap detection.
        pages = {str(n): {} for n in (1, 3, 4, 5)}
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": pages},
            processing_time=2.0,
            errors=["Page 4: std::bad_alloc", "Page 5: std::bad_alloc"],
            requested_pages=(1, 5),
        )
        assert response["failed_pages"] == [2, 4, 5]

    def test_duplicate_page_in_errors(self):
        """Duplicate page numbers in error messages should be deduplicated."""
        messages = [
            "Page 3: std::bad_alloc",
            "Page 3: Invalid code point",
        ]
        assert _extract_failed_pages_from_errors(messages) == [3]

    def test_no_page_pattern_errors_falls_back_to_gap(self):
        """When errors lack 'Page N:' pattern, gap detection should still work."""
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": {"1": {}, "3": {}}},
            processing_time=1.0,
            errors=["Unknown page: pipeline terminated early"],
            requested_pages=(1, 3),
        )
        assert response["failed_pages"] == [2]

    def test_empty_errors_with_partial_success(self):
        """Partial success with no error messages should still detect gaps."""
        response = build_conversion_response(
            status_value="partial_success",
            json_content={"pages": {"1": {}, "3": {}}},
            processing_time=1.0,
            errors=[],
            requested_pages=(1, 3),
        )
        assert response["failed_pages"] == [2]