Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions app/routes/document_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,23 @@ def _make_unique_temp_path(user_id: str, filename: str) -> Optional[str]:


async def load_file_content(
filename: str, content_type: str, file_path: str, executor
filename: str,
content_type: str,
file_path: str,
executor,
raw_text: bool = False,
) -> tuple:
"""Load file content using appropriate loader."""
"""Load file content using appropriate loader.

Pass ``raw_text=True`` when the caller wants verbatim file contents (e.g.
the ``/text`` endpoint) so text-formatted files are not semantically
parsed.
"""
loader = None
try:
loader, known_type, file_ext = get_loader(filename, content_type, file_path)
loader, known_type, file_ext = get_loader(
filename, content_type, file_path, raw_text=raw_text
)
loop = asyncio.get_running_loop()
data = await loop.run_in_executor(executor, lambda: list(loader.lazy_load()))
return data, known_type, file_ext
Expand Down Expand Up @@ -1085,6 +1096,7 @@ async def extract_text_from_file(
file.content_type,
validated_temp_file_path,
request.app.state.thread_pool,
raw_text=True,
)

# Extract text content from loaded documents
Expand Down
46 changes: 37 additions & 9 deletions app/utils/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@
)


# Extensions that identify binary file formats handled by dedicated loaders.
# Used to prevent a conflicting multipart Content-Type (e.g. ``text/markdown``)
# from hijacking these files into a text loader.
_BINARY_FILE_EXTENSIONS = frozenset(
{"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "epub"}
)


def detect_file_encoding(filepath: str) -> str:
"""
Detect the encoding of a file using BOM markers and chardet for broader support.
Expand Down Expand Up @@ -68,8 +76,21 @@ def cleanup_temp_encoding_file(loader) -> None:
logger.warning(f"Failed to remove temporary UTF-8 file: {e}")


def get_loader(filename: str, file_content_type: str, filepath: str):
"""Get the appropriate document loader based on file type and\or content type."""
def get_loader(
filename: str,
file_content_type: str,
filepath: str,
raw_text: bool = False,
):
"""Get the appropriate document loader based on file type and\or content type.

When ``raw_text`` is True, text-formatted files (e.g. Markdown) are loaded
verbatim with :class:`TextLoader` so their original formatting is
preserved. This is intended for the ``/text`` endpoint, where the caller
wants the raw file contents. The embedding path should keep the default
(``raw_text=False``) so semantic loaders continue to strip formatting for
better vector search quality.
"""
file_ext = filename.split(".")[-1].lower()
known_type = True

Expand Down Expand Up @@ -121,13 +142,20 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
]:
loader = UnstructuredPowerPointLoader(filepath)
elif file_ext == "md" or file_content_type in [
"text/markdown",
"text/x-markdown",
"application/markdown",
"application/x-markdown",
]:
loader = UnstructuredMarkdownLoader(filepath)
elif file_ext == "md" or (
file_content_type
in [
"text/markdown",
"text/x-markdown",
"application/markdown",
"application/x-markdown",
]
and file_ext not in _BINARY_FILE_EXTENSIONS
):
if raw_text:
loader = TextLoader(filepath, autodetect_encoding=True)
else:
loader = UnstructuredMarkdownLoader(filepath)
elif file_ext == "epub" or file_content_type == "application/epub+zip":
loader = UnstructuredEPubLoader(filepath)
elif file_ext in ["doc", "docx"] or file_content_type in [
Expand Down
113 changes: 112 additions & 1 deletion tests/utils/test_document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
from collections.abc import Iterator
from unittest.mock import MagicMock, patch

import pytest

from app.utils.document_loader import get_loader, clean_text, process_documents
from langchain_community.document_loaders import (
TextLoader,
UnstructuredMarkdownLoader,
)
from langchain_core.documents import Document


Expand Down Expand Up @@ -153,7 +159,6 @@ def fallback_gen():
def test_safe_pdf_loader_non_filter_error_propagates():
"""KeyError that isn't /Filter should propagate, not silently fallback."""
from app.utils.document_loader import SafePyPDFLoader
import pytest

def bad_gen():
raise KeyError("SomeOtherKey")
Expand All @@ -167,3 +172,109 @@ def bad_gen():

with pytest.raises(KeyError, match="SomeOtherKey"):
list(loader.lazy_load())


MARKDOWN_SAMPLE = (
"# Heading\n\n"
"**bold** and *italic* text with a [link](https://example.com).\n\n"
"- item 1\n"
"- item 2\n\n"
"> a blockquote\n"
)


def test_get_loader_markdown_embed_uses_unstructured(tmp_path):
"""Default (embedding) path must keep UnstructuredMarkdownLoader for .md."""
file_path = tmp_path / "notes.md"
file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")

loader, known_type, file_ext = get_loader(
"notes.md", "text/markdown", str(file_path)
)

assert isinstance(loader, UnstructuredMarkdownLoader)
assert known_type is True
assert file_ext == "md"


@pytest.mark.parametrize(
"content_type",
[
"text/markdown",
"text/x-markdown",
"application/markdown",
"application/x-markdown",
],
)
def test_get_loader_markdown_raw_text_uses_text_loader(tmp_path, content_type):
"""/text path (raw_text=True) must load .md verbatim so formatting survives."""
file_path = tmp_path / "notes.md"
file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")

loader, known_type, file_ext = get_loader(
"notes.md", content_type, str(file_path), raw_text=True
)

assert isinstance(loader, TextLoader)
assert known_type is True
assert file_ext == "md"

docs = loader.load()
assert len(docs) == 1
assert docs[0].page_content == MARKDOWN_SAMPLE


def test_get_loader_markdown_raw_text_by_extension_only(tmp_path):
"""Extension-based detection must still kick in when content type is generic."""
file_path = tmp_path / "README.md"
file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")

loader, _, _ = get_loader(
"README.md", "application/octet-stream", str(file_path), raw_text=True
)

assert isinstance(loader, TextLoader)


def test_get_loader_raw_text_leaves_pdf_alone(tmp_path):
"""raw_text must not disturb binary formats — PDF still uses the PDF loader."""
from app.utils.document_loader import SafePyPDFLoader

file_path = tmp_path / "doc.pdf"
file_path.write_text("not a real pdf")

loader, _, file_ext = get_loader(
"doc.pdf", "application/pdf", str(file_path), raw_text=True
)

assert isinstance(loader, SafePyPDFLoader)
assert file_ext == "pdf"


@pytest.mark.parametrize(
"filename, expected_loader_name",
[
("doc.pdf", "SafePyPDFLoader"),
("report.docx", "Docx2txtLoader"),
("book.epub", "UnstructuredEPubLoader"),
("data.xlsx", "UnstructuredExcelLoader"),
("slides.pptx", "UnstructuredPowerPointLoader"),
],
)
def test_get_loader_raw_text_respects_binary_extensions_over_markdown_mime(
tmp_path, filename, expected_loader_name
):
"""A markdown Content-Type must not override a known binary extension.

Some clients send conflicting multipart content types. For an upload named
`doc.pdf` with Content-Type `text/markdown`, the PDF loader still has to
win — otherwise a binary file is read as UTF-8 text.
"""
file_path = tmp_path / filename
file_path.write_text("placeholder binary content")

loader, _, _ = get_loader(
filename, "text/markdown", str(file_path), raw_text=True
)

assert type(loader).__name__ == expected_loader_name
Loading