Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions app/routes/document_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,23 @@ def _make_unique_temp_path(user_id: str, filename: str) -> Optional[str]:


async def load_file_content(
filename: str, content_type: str, file_path: str, executor
filename: str,
content_type: str,
file_path: str,
executor,
raw_text: bool = False,
) -> tuple:
"""Load file content using appropriate loader."""
"""Load file content using appropriate loader.

Pass ``raw_text=True`` when the caller wants verbatim file contents (e.g.
the ``/text`` endpoint) so text-formatted files are not semantically
parsed.
"""
loader = None
try:
loader, known_type, file_ext = get_loader(filename, content_type, file_path)
loader, known_type, file_ext = get_loader(
filename, content_type, file_path, raw_text=raw_text
)
loop = asyncio.get_running_loop()
data = await loop.run_in_executor(executor, lambda: list(loader.lazy_load()))
return data, known_type, file_ext
Expand Down Expand Up @@ -1085,6 +1096,7 @@ async def extract_text_from_file(
file.content_type,
validated_temp_file_path,
request.app.state.thread_pool,
raw_text=True,
)

# Extract text content from loaded documents
Expand Down
26 changes: 24 additions & 2 deletions app/utils/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,33 @@ def cleanup_temp_encoding_file(loader) -> None:
logger.warning(f"Failed to remove temporary UTF-8 file: {e}")


def get_loader(filename: str, file_content_type: str, filepath: str):
"""Get the appropriate document loader based on file type and\or content type."""
def get_loader(
filename: str,
file_content_type: str,
filepath: str,
raw_text: bool = False,
):
"""Get the appropriate document loader based on file type and\or content type.

When ``raw_text`` is True, text-formatted files (e.g. Markdown) are loaded
verbatim with :class:`TextLoader` so their original formatting is
preserved. This is intended for the ``/text`` endpoint, where the caller
wants the raw file contents. The embedding path should keep the default
(``raw_text=False``) so semantic loaders continue to strip formatting for
better vector search quality.
"""
file_ext = filename.split(".")[-1].lower()
known_type = True

is_markdown = file_ext == "md" or file_content_type in [
"text/markdown",
"text/x-markdown",
"application/markdown",
"application/x-markdown",
]
if raw_text and is_markdown:
return TextLoader(filepath, autodetect_encoding=True), True, file_ext
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Restore binary loader precedence with raw_text enabled

The new early return in get_loader runs before all format-specific branches, so on the /text path (raw_text=True) any upload labeled with a markdown MIME type is forced into TextLoader even when the file extension is a known binary type like .pdf or .docx. This is a regression from prior behavior where those extensions were handled by their dedicated loaders first, and it can cause failed or nonsensical extraction when clients send conflicting multipart content types.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks — good catch. Pushed a442f0f to address this:

  • Dropped the early return; raw_text is now handled inside the existing markdown elif branch so the earlier PDF/CSV/RST/XML/PPT branches retain their precedence.
  • Tightened the markdown elif itself: it still accepts .md by extension, but only accepts a markdown Content-Type when the extension is not a known binary format (new _BINARY_FILE_EXTENSIONS set). This also closes a pre-existing edge where .docx / .xlsx / .epub uploaded with text/markdown would fall through the md branch even without raw_text.
  • Added a parameterised regression test covering doc.pdf, report.docx, book.epub, data.xlsx, and slides.pptx each with Content-Type: text/markdown on the raw_text=True path — each now routes to its dedicated binary loader (22/22 pytest passing).


# File Content Type reference:
# ref.: https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types
if file_ext == "pdf" or file_content_type == "application/pdf":
Expand Down
84 changes: 83 additions & 1 deletion tests/utils/test_document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
from collections.abc import Iterator
from unittest.mock import MagicMock, patch

import pytest

from app.utils.document_loader import get_loader, clean_text, process_documents
from langchain_community.document_loaders import (
TextLoader,
UnstructuredMarkdownLoader,
)
from langchain_core.documents import Document


Expand Down Expand Up @@ -153,7 +159,6 @@ def fallback_gen():
def test_safe_pdf_loader_non_filter_error_propagates():
"""KeyError that isn't /Filter should propagate, not silently fallback."""
from app.utils.document_loader import SafePyPDFLoader
import pytest

def bad_gen():
raise KeyError("SomeOtherKey")
Expand All @@ -167,3 +172,80 @@ def bad_gen():

with pytest.raises(KeyError, match="SomeOtherKey"):
list(loader.lazy_load())


MARKDOWN_SAMPLE = (
"# Heading\n\n"
"**bold** and *italic* text with a [link](https://example.com).\n\n"
"- item 1\n"
"- item 2\n\n"
"> a blockquote\n"
)


def test_get_loader_markdown_embed_uses_unstructured(tmp_path):
"""Default (embedding) path must keep UnstructuredMarkdownLoader for .md."""
file_path = tmp_path / "notes.md"
file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")

loader, known_type, file_ext = get_loader(
"notes.md", "text/markdown", str(file_path)
)

assert isinstance(loader, UnstructuredMarkdownLoader)
assert known_type is True
assert file_ext == "md"


@pytest.mark.parametrize(
"content_type",
[
"text/markdown",
"text/x-markdown",
"application/markdown",
"application/x-markdown",
],
)
def test_get_loader_markdown_raw_text_uses_text_loader(tmp_path, content_type):
"""/text path (raw_text=True) must load .md verbatim so formatting survives."""
file_path = tmp_path / "notes.md"
file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")

loader, known_type, file_ext = get_loader(
"notes.md", content_type, str(file_path), raw_text=True
)

assert isinstance(loader, TextLoader)
assert known_type is True
assert file_ext == "md"

docs = loader.load()
assert len(docs) == 1
assert docs[0].page_content == MARKDOWN_SAMPLE


def test_get_loader_markdown_raw_text_by_extension_only(tmp_path):
"""Extension-based detection must still kick in when content type is generic."""
file_path = tmp_path / "README.md"
file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")

loader, _, _ = get_loader(
"README.md", "application/octet-stream", str(file_path), raw_text=True
)

assert isinstance(loader, TextLoader)


def test_get_loader_raw_text_leaves_pdf_alone(tmp_path):
"""raw_text must not disturb binary formats — PDF still uses the PDF loader."""
from app.utils.document_loader import SafePyPDFLoader

file_path = tmp_path / "doc.pdf"
file_path.write_text("not a real pdf")

loader, _, file_ext = get_loader(
"doc.pdf", "application/pdf", str(file_path), raw_text=True
)

assert isinstance(loader, SafePyPDFLoader)
assert file_ext == "pdf"
Loading