From b8b4de42964162661b24b377b711280bb716dbd5 Mon Sep 17 00:00:00 2001
From: laplace young
Date: Mon, 13 Apr 2026 10:54:46 +0800
Subject: [PATCH] fix(pdf): avoid collapsed whitespace fallback for plain pages

---
 .../markitdown/converters/_pdf_converter.py   | 48 ++++++++++++++++---
 packages/markitdown/tests/test_pdf_memory.py  | 29 ++++++++++-
 2 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index ffbcbd990..813919cb2 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -9,6 +9,7 @@
 
 # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
 PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
+COLLAPSED_WHITESPACE_TOKEN_PATTERN = re.compile(r"[A-Za-z]{20,}")
 
 
 def _merge_partial_numbering_lines(text: str) -> str:
@@ -57,6 +58,33 @@ def _merge_partial_numbering_lines(text: str) -> str:
     return "\n".join(result_lines)
 
 
+def _looks_whitespace_collapsed(text: str) -> bool:
+    """Detect pathological extraction where words are concatenated without spaces."""
+    if not text:
+        return False
+
+    # Ignore short texts where long contiguous tokens can be legitimate.
+    if len(text) < 200:
+        return False
+
+    long_tokens = COLLAPSED_WHITESPACE_TOKEN_PATTERN.findall(text)
+    long_token_count = len(long_tokens)
+    max_long_token_length = max((len(token) for token in long_tokens), default=0)
+
+    # In collapsed output, spaces are nearly absent even for long documents.
+    space_ratio = text.count(" ") / len(text)
+    if space_ratio >= 0.02:
+        return False
+
+    return long_token_count >= 5 or max_long_token_length >= 80
+
+
+def _extract_plain_text_from_page(page: Any) -> str:
+    """Extract plain page text with conservative x-tolerance to preserve spacing."""
+    text = page.extract_text(x_tolerance=1, y_tolerance=3)
+    return text or ""
+
+
 # Load dependencies
 _dependency_exc_info = None
 try:
@@ -247,8 +275,8 @@ def _extract_form_content_from_words(page: Any) -> str | None:
 
     # Adaptive max: allow more columns for wider pages
     # Standard letter is 612pt wide, so scale accordingly
-    adaptive_max_columns = int(20 * (page_width / 612))
-    adaptive_max_columns = max(15, adaptive_max_columns)  # At least 15
+    adaptive_max_columns = int(12 * (page_width / 612))
+    adaptive_max_columns = max(10, adaptive_max_columns)  # At least 10
 
     if len(global_columns) > adaptive_max_columns:
         return None
@@ -547,7 +575,6 @@ def convert(
         # keep memory usage constant regardless of page count.
         markdown_chunks: list[str] = []
         form_page_count = 0
-        plain_page_indices: list[int] = []
 
         with pdfplumber.open(pdf_bytes) as pdf:
             for page_idx, page in enumerate(pdf.pages):
@@ -558,8 +585,7 @@
                     if page_content.strip():
                         markdown_chunks.append(page_content)
                 else:
-                    plain_page_indices.append(page_idx)
-                    text = page.extract_text()
+                    text = _extract_plain_text_from_page(page)
                     if text and text.strip():
                         markdown_chunks.append(text.strip())
 
@@ -569,7 +595,17 @@
         # the whole document (better text spacing for prose).
         if form_page_count == 0:
             pdf_bytes.seek(0)
-            markdown = pdfminer.high_level.extract_text(pdf_bytes)
+            pdfminer_markdown = pdfminer.high_level.extract_text(pdf_bytes)
+            if _looks_whitespace_collapsed(pdfminer_markdown):
+                pdfplumber_markdown = "\n\n".join(markdown_chunks).strip()
+                if pdfplumber_markdown and not _looks_whitespace_collapsed(
+                    pdfplumber_markdown
+                ):
+                    markdown = pdfplumber_markdown
+                else:
+                    markdown = pdfminer_markdown
+            else:
+                markdown = pdfminer_markdown
         else:
             markdown = "\n\n".join(markdown_chunks).strip()
diff --git a/packages/markitdown/tests/test_pdf_memory.py b/packages/markitdown/tests/test_pdf_memory.py
index 1731dd63e..cf181bdc3 100644
--- a/packages/markitdown/tests/test_pdf_memory.py
+++ b/packages/markitdown/tests/test_pdf_memory.py
@@ -63,7 +63,8 @@ def _make_plain_page():
             "bottom": 20,
         },
     ]
-    page.extract_text.return_value = "This is a long paragraph of plain text."
+    plain_text = "This is a long paragraph of plain text."
+    page.extract_text.side_effect = lambda *args, **kwargs: plain_text
     return page
 
 
@@ -147,6 +148,32 @@ def test_plain_text_pdf_falls_back_to_pdfminer(self):
         )
         assert result.text_content is not None
 
+    def test_plain_text_pdf_uses_pdfplumber_if_pdfminer_whitespace_collapses(self):
+        """Fallback to pdfplumber output when pdfminer concatenates words."""
+        pages = [_make_plain_page() for _ in range(3)]
+        collapsed_output = "DataContaminationandEvaluation" * 20
+
+        with patch(
+            "markitdown.converters._pdf_converter.pdfplumber"
+        ) as mock_pdfplumber, patch(
+            "markitdown.converters._pdf_converter.pdfminer"
+        ) as mock_pdfminer:
+            mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
+            mock_pdfminer.high_level.extract_text.return_value = collapsed_output
+
+            md = MarkItDown()
+            buf = io.BytesIO(b"fake pdf content")
+            from markitdown import StreamInfo
+
+            result = md.convert_stream(
+                buf,
+                stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
+            )
+
+            assert mock_pdfminer.high_level.extract_text.called
+            assert "This is a long paragraph of plain text." in result.text_content
+            assert "DataContaminationandEvaluation" not in result.text_content
+
     def test_plain_text_pdf_still_closes_all_pages(self):
         """Even for plain-text PDFs, page.close() must be called on every page."""
         num_pages = 30