Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 42 additions & 6 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
COLLAPSED_WHITESPACE_TOKEN_PATTERN = re.compile(r"[A-Za-z]{20,}")


def _merge_partial_numbering_lines(text: str) -> str:
Expand Down Expand Up @@ -57,6 +58,33 @@ def _merge_partial_numbering_lines(text: str) -> str:
return "\n".join(result_lines)


def _looks_whitespace_collapsed(text: str) -> bool:
    """Detect pathological extraction where words are concatenated without spaces.

    Some PDF extractors occasionally drop inter-word spacing, producing output
    like ``"DataContaminationandEvaluation..."``.  Heuristic: a long text that
    is nearly space-free and contains several very long alphabetic runs is
    treated as collapsed.

    Args:
        text: Extracted text to inspect (may be empty).

    Returns:
        True if the text looks whitespace-collapsed, False otherwise.
    """
    if not text:
        return False

    # Ignore short texts where long contiguous tokens can be legitimate.
    if len(text) < 200:
        return False

    # In collapsed output, spaces are nearly absent even for long documents.
    # This ratio test is O(n) with no regex machinery, so run it first and
    # skip the more expensive token scan for normally-spaced text.
    space_ratio = text.count(" ") / len(text)
    if space_ratio >= 0.02:
        return False

    long_tokens = COLLAPSED_WHITESPACE_TOKEN_PATTERN.findall(text)
    if len(long_tokens) >= 5:
        return True
    # A single extreme run (>= 80 letters) is also a reliable signal.
    max_long_token_length = max((len(token) for token in long_tokens), default=0)
    return max_long_token_length >= 80


def _extract_plain_text_from_page(page: Any) -> str:
"""Extract plain page text with conservative x-tolerance to preserve spacing."""
text = page.extract_text(x_tolerance=1, y_tolerance=3)
return text or ""


# Load dependencies
_dependency_exc_info = None
try:
Expand Down Expand Up @@ -247,8 +275,8 @@ def _extract_form_content_from_words(page: Any) -> str | None:

# Adaptive max: allow more columns for wider pages
# Standard letter is 612pt wide, so scale accordingly
adaptive_max_columns = int(20 * (page_width / 612))
adaptive_max_columns = max(15, adaptive_max_columns) # At least 15
adaptive_max_columns = int(12 * (page_width / 612))
adaptive_max_columns = max(10, adaptive_max_columns) # At least 10

if len(global_columns) > adaptive_max_columns:
return None
Expand Down Expand Up @@ -547,7 +575,6 @@ def convert(
# keep memory usage constant regardless of page count.
markdown_chunks: list[str] = []
form_page_count = 0
plain_page_indices: list[int] = []

with pdfplumber.open(pdf_bytes) as pdf:
for page_idx, page in enumerate(pdf.pages):
Expand All @@ -558,8 +585,7 @@ def convert(
if page_content.strip():
markdown_chunks.append(page_content)
else:
plain_page_indices.append(page_idx)
text = page.extract_text()
text = _extract_plain_text_from_page(page)
if text and text.strip():
markdown_chunks.append(text.strip())

Expand All @@ -569,7 +595,17 @@ def convert(
# the whole document (better text spacing for prose).
if form_page_count == 0:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
pdfminer_markdown = pdfminer.high_level.extract_text(pdf_bytes)
if _looks_whitespace_collapsed(pdfminer_markdown):
pdfplumber_markdown = "\n\n".join(markdown_chunks).strip()
if pdfplumber_markdown and not _looks_whitespace_collapsed(
pdfplumber_markdown
):
markdown = pdfplumber_markdown
else:
markdown = pdfminer_markdown
else:
markdown = pdfminer_markdown
else:
markdown = "\n\n".join(markdown_chunks).strip()

Expand Down
29 changes: 28 additions & 1 deletion packages/markitdown/tests/test_pdf_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ def _make_plain_page():
"bottom": 20,
},
]
page.extract_text.return_value = "This is a long paragraph of plain text."
plain_text = "This is a long paragraph of plain text."
page.extract_text.side_effect = lambda *args, **kwargs: plain_text
return page


Expand Down Expand Up @@ -147,6 +148,32 @@ def test_plain_text_pdf_falls_back_to_pdfminer(self):
)
assert result.text_content is not None

def test_plain_text_pdf_uses_pdfplumber_if_pdfminer_whitespace_collapses(self):
    """Fallback to pdfplumber output when pdfminer concatenates words."""
    collapsed_output = "DataContaminationandEvaluation" * 20
    pages = [_make_plain_page() for _ in range(3)]

    plumber_patch = patch("markitdown.converters._pdf_converter.pdfplumber")
    miner_patch = patch("markitdown.converters._pdf_converter.pdfminer")
    with plumber_patch as mock_pdfplumber, miner_patch as mock_pdfminer:
        mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
        mock_pdfminer.high_level.extract_text.return_value = collapsed_output

        from markitdown import StreamInfo

        result = MarkItDown().convert_stream(
            io.BytesIO(b"fake pdf content"),
            stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
        )

        # pdfminer must still have been consulted first; its collapsed output
        # is then rejected in favor of the per-page pdfplumber text.
        assert mock_pdfminer.high_level.extract_text.called
        assert "This is a long paragraph of plain text." in result.text_content
        assert "DataContaminationandEvaluation" not in result.text_content

def test_plain_text_pdf_still_closes_all_pages(self):
"""Even for plain-text PDFs, page.close() must be called on every page."""
num_pages = 30
Expand Down