danny-avila · danny-avila · Apr 19, 2026 · Apr 18, 2026 · Apr 18, 2026
diff --git a/app/routes/document_routes.py b/app/routes/document_routes.py
@@ -139,12 +139,23 @@ def _make_unique_temp_path(user_id: str, filename: str) -> Optional[str]:
 
 
 async def load_file_content(
-    filename: str, content_type: str, file_path: str, executor
+    filename: str,
+    content_type: str,
+    file_path: str,
+    executor,
+    raw_text: bool = False,
 ) -> tuple:
-    """Load file content using appropriate loader."""
+    """Load file content using appropriate loader.
+
+    Pass ``raw_text=True`` when the caller wants verbatim file contents (e.g.
+    the ``/text`` endpoint) so text-formatted files are not semantically
+    parsed.
+    """
     loader = None
     try:
-        loader, known_type, file_ext = get_loader(filename, content_type, file_path)
+        loader, known_type, file_ext = get_loader(
+            filename, content_type, file_path, raw_text=raw_text
+        )
         loop = asyncio.get_running_loop()
         data = await loop.run_in_executor(executor, lambda: list(loader.lazy_load()))
         return data, known_type, file_ext
@@ -1085,6 +1096,7 @@ async def extract_text_from_file(
             file.content_type,
             validated_temp_file_path,
             request.app.state.thread_pool,
+            raw_text=True,
         )
 
         # Extract text content from loaded documents

diff --git a/app/utils/document_loader.py b/app/utils/document_loader.py
@@ -24,6 +24,14 @@
 )
 
 
+# Extensions that identify binary file formats handled by dedicated loaders.
+# Used to prevent a conflicting multipart Content-Type (e.g. ``text/markdown``)
+# from hijacking these files into a text loader.
+_BINARY_FILE_EXTENSIONS = frozenset(
+    {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "epub"}
+)
+
+
 def detect_file_encoding(filepath: str) -> str:
     """
     Detect the encoding of a file using BOM markers and chardet for broader support.
@@ -68,8 +76,21 @@ def cleanup_temp_encoding_file(loader) -> None:
             logger.warning(f"Failed to remove temporary UTF-8 file: {e}")
 
 
-def get_loader(filename: str, file_content_type: str, filepath: str):
-    """Get the appropriate document loader based on file type and\or content type."""
+def get_loader(
+    filename: str,
+    file_content_type: str,
+    filepath: str,
+    raw_text: bool = False,
+):
+    """Get the appropriate document loader based on file type and/or content type.
+
+    When ``raw_text`` is True, text-formatted files (e.g. Markdown) are loaded
+    verbatim with :class:`TextLoader` so their original formatting is
+    preserved. This is intended for the ``/text`` endpoint, where the caller
+    wants the raw file contents. The embedding path should keep the default
+    (``raw_text=False``) so semantic loaders continue to strip formatting for
+    better vector search quality.
+    """
     file_ext = filename.split(".")[-1].lower()
     known_type = True
 
@@ -121,13 +142,20 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     ]:
         loader = UnstructuredPowerPointLoader(filepath)
-    elif file_ext == "md" or file_content_type in [
-        "text/markdown",
-        "text/x-markdown",
-        "application/markdown",
-        "application/x-markdown",
-    ]:
-        loader = UnstructuredMarkdownLoader(filepath)
+    elif file_ext == "md" or (
+        file_content_type
+        in [
+            "text/markdown",
+            "text/x-markdown",
+            "application/markdown",
+            "application/x-markdown",
+        ]
+        and file_ext not in _BINARY_FILE_EXTENSIONS
+    ):
+        if raw_text:
+            loader = TextLoader(filepath, autodetect_encoding=True)
+        else:
+            loader = UnstructuredMarkdownLoader(filepath)
     elif file_ext == "epub" or file_content_type == "application/epub+zip":
         loader = UnstructuredEPubLoader(filepath)
     elif file_ext in ["doc", "docx"] or file_content_type in [

diff --git a/tests/utils/test_document_loader.py b/tests/utils/test_document_loader.py
@@ -2,7 +2,13 @@
 from collections.abc import Iterator
 from unittest.mock import MagicMock, patch
 
+import pytest
+
 from app.utils.document_loader import get_loader, clean_text, process_documents
+from langchain_community.document_loaders import (
+    TextLoader,
+    UnstructuredMarkdownLoader,
+)
 from langchain_core.documents import Document
 
 
@@ -153,7 +159,6 @@ def fallback_gen():
 def test_safe_pdf_loader_non_filter_error_propagates():
     """KeyError that isn't /Filter should propagate, not silently fallback."""
     from app.utils.document_loader import SafePyPDFLoader
-    import pytest
 
     def bad_gen():
         raise KeyError("SomeOtherKey")
@@ -167,3 +172,109 @@ def bad_gen():
 
         with pytest.raises(KeyError, match="SomeOtherKey"):
             list(loader.lazy_load())
+
+
+MARKDOWN_SAMPLE = (
+    "# Heading\n\n"
+    "**bold** and *italic* text with a [link](https://example.com).\n\n"
+    "- item 1\n"
+    "- item 2\n\n"
+    "> a blockquote\n"
+)
+
+
+def test_get_loader_markdown_embed_uses_unstructured(tmp_path):
+    """Default (embedding) path must keep UnstructuredMarkdownLoader for .md."""
+    file_path = tmp_path / "notes.md"
+    file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")
+
+    loader, known_type, file_ext = get_loader(
+        "notes.md", "text/markdown", str(file_path)
+    )
+
+    assert isinstance(loader, UnstructuredMarkdownLoader)
+    assert known_type is True
+    assert file_ext == "md"
+
+
+@pytest.mark.parametrize(
+    "content_type",
+    [
+        "text/markdown",
+        "text/x-markdown",
+        "application/markdown",
+        "application/x-markdown",
+    ],
+)
+def test_get_loader_markdown_raw_text_uses_text_loader(tmp_path, content_type):
+    """/text path (raw_text=True) must load .md verbatim so formatting survives."""
+    file_path = tmp_path / "notes.md"
+    file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")
+
+    loader, known_type, file_ext = get_loader(
+        "notes.md", content_type, str(file_path), raw_text=True
+    )
+
+    assert isinstance(loader, TextLoader)
+    assert known_type is True
+    assert file_ext == "md"
+
+    docs = loader.load()
+    assert len(docs) == 1
+    assert docs[0].page_content == MARKDOWN_SAMPLE
+
+
+def test_get_loader_markdown_raw_text_by_extension_only(tmp_path):
+    """Extension-based detection must still kick in when content type is generic."""
+    file_path = tmp_path / "README.md"
+    file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")
+
+    loader, _, _ = get_loader(
+        "README.md", "application/octet-stream", str(file_path), raw_text=True
+    )
+
+    assert isinstance(loader, TextLoader)
+
+
+def test_get_loader_raw_text_leaves_pdf_alone(tmp_path):
+    """raw_text must not disturb binary formats — PDF still uses the PDF loader."""
+    from app.utils.document_loader import SafePyPDFLoader
+
+    file_path = tmp_path / "doc.pdf"
+    file_path.write_text("not a real pdf")
+
+    loader, _, file_ext = get_loader(
+        "doc.pdf", "application/pdf", str(file_path), raw_text=True
+    )
+
+    assert isinstance(loader, SafePyPDFLoader)
+    assert file_ext == "pdf"
+
+
+@pytest.mark.parametrize(
+    "filename, expected_loader_name",
+    [
+        ("doc.pdf", "SafePyPDFLoader"),
+        ("report.docx", "Docx2txtLoader"),
+        ("book.epub", "UnstructuredEPubLoader"),
+        ("data.xlsx", "UnstructuredExcelLoader"),
+        ("slides.pptx", "UnstructuredPowerPointLoader"),
+    ],
+)
+def test_get_loader_raw_text_respects_binary_extensions_over_markdown_mime(
+    tmp_path, filename, expected_loader_name
+):
+    """A markdown Content-Type must not override a known binary extension.
+
+    Some clients send conflicting multipart content types. For an upload named
+    `doc.pdf` with Content-Type `text/markdown`, the PDF loader still has to
+    win — otherwise a binary file is read as UTF-8 text.
+    """
+    file_path = tmp_path / filename
+    file_path.write_text("placeholder binary content")
+
+    loader, _, _ = get_loader(
+        filename, "text/markdown", str(file_path), raw_text=True
+    )
+
+    assert type(loader).__name__ == expected_loader_name