danny-avila · danny-avila · Apr 19, 2026 · Apr 18, 2026 · Apr 18, 2026 · chatgpt-codex-connector
diff --git a/app/routes/document_routes.py b/app/routes/document_routes.py
@@ -139,12 +139,23 @@ def _make_unique_temp_path(user_id: str, filename: str) -> Optional[str]:
 
 
 async def load_file_content(
-    filename: str, content_type: str, file_path: str, executor
+    filename: str,
+    content_type: str,
+    file_path: str,
+    executor,
+    raw_text: bool = False,
 ) -> tuple:
-    """Load file content using appropriate loader."""
+    """Load file content using appropriate loader.
+
+    Pass ``raw_text=True`` when the caller wants verbatim file contents (e.g.
+    the ``/text`` endpoint) so text-formatted files are not semantically
+    parsed.
+    """
     loader = None
     try:
-        loader, known_type, file_ext = get_loader(filename, content_type, file_path)
+        loader, known_type, file_ext = get_loader(
+            filename, content_type, file_path, raw_text=raw_text
+        )
         loop = asyncio.get_running_loop()
         data = await loop.run_in_executor(executor, lambda: list(loader.lazy_load()))
         return data, known_type, file_ext
@@ -1085,6 +1096,7 @@ async def extract_text_from_file(
             file.content_type,
             validated_temp_file_path,
             request.app.state.thread_pool,
+            raw_text=True,
         )
 
         # Extract text content from loaded documents

diff --git a/app/utils/document_loader.py b/app/utils/document_loader.py
@@ -68,11 +68,33 @@ def cleanup_temp_encoding_file(loader) -> None:
             logger.warning(f"Failed to remove temporary UTF-8 file: {e}")
 
 
-def get_loader(filename: str, file_content_type: str, filepath: str):
-    """Get the appropriate document loader based on file type and\or content type."""
+def get_loader(
+    filename: str,
+    file_content_type: str,
+    filepath: str,
+    raw_text: bool = False,
+):
+    """Get the appropriate document loader based on file type and\or content type.
+
+    When ``raw_text`` is True, text-formatted files (e.g. Markdown) are loaded
+    verbatim with :class:`TextLoader` so their original formatting is
+    preserved. This is intended for the ``/text`` endpoint, where the caller
+    wants the raw file contents. The embedding path should keep the default
+    (``raw_text=False``) so semantic loaders continue to strip formatting for
+    better vector search quality.
+    """
     file_ext = filename.split(".")[-1].lower()
     known_type = True
 
+    is_markdown = file_ext == "md" or file_content_type in [
+        "text/markdown",
+        "text/x-markdown",
+        "application/markdown",
+        "application/x-markdown",
+    ]
+    if raw_text and is_markdown:
+        return TextLoader(filepath, autodetect_encoding=True), True, file_ext
+
     # File Content Type reference:
     # ref.: https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types
     if file_ext == "pdf" or file_content_type == "application/pdf":

diff --git a/tests/utils/test_document_loader.py b/tests/utils/test_document_loader.py
@@ -2,7 +2,13 @@
 from collections.abc import Iterator
 from unittest.mock import MagicMock, patch
 
+import pytest
+
 from app.utils.document_loader import get_loader, clean_text, process_documents
+from langchain_community.document_loaders import (
+    TextLoader,
+    UnstructuredMarkdownLoader,
+)
 from langchain_core.documents import Document
 
 
@@ -153,7 +159,6 @@ def fallback_gen():
 def test_safe_pdf_loader_non_filter_error_propagates():
     """KeyError that isn't /Filter should propagate, not silently fallback."""
     from app.utils.document_loader import SafePyPDFLoader
-    import pytest
 
     def bad_gen():
         raise KeyError("SomeOtherKey")
@@ -167,3 +172,80 @@ def bad_gen():
 
         with pytest.raises(KeyError, match="SomeOtherKey"):
             list(loader.lazy_load())
+
+
+MARKDOWN_SAMPLE = (
+    "# Heading\n\n"
+    "**bold** and *italic* text with a [link](https://example.com).\n\n"
+    "- item 1\n"
+    "- item 2\n\n"
+    "> a blockquote\n"
+)
+
+
+def test_get_loader_markdown_embed_uses_unstructured(tmp_path):
+    """Default (embedding) path must keep UnstructuredMarkdownLoader for .md."""
+    file_path = tmp_path / "notes.md"
+    file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")
+
+    loader, known_type, file_ext = get_loader(
+        "notes.md", "text/markdown", str(file_path)
+    )
+
+    assert isinstance(loader, UnstructuredMarkdownLoader)
+    assert known_type is True
+    assert file_ext == "md"
+
+
+@pytest.mark.parametrize(
+    "content_type",
+    [
+        "text/markdown",
+        "text/x-markdown",
+        "application/markdown",
+        "application/x-markdown",
+    ],
+)
+def test_get_loader_markdown_raw_text_uses_text_loader(tmp_path, content_type):
+    """/text path (raw_text=True) must load .md verbatim so formatting survives."""
+    file_path = tmp_path / "notes.md"
+    file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")
+
+    loader, known_type, file_ext = get_loader(
+        "notes.md", content_type, str(file_path), raw_text=True
+    )
+
+    assert isinstance(loader, TextLoader)
+    assert known_type is True
+    assert file_ext == "md"
+
+    docs = loader.load()
+    assert len(docs) == 1
+    assert docs[0].page_content == MARKDOWN_SAMPLE
+
+
+def test_get_loader_markdown_raw_text_by_extension_only(tmp_path):
+    """Extension-based detection must still kick in when content type is generic."""
+    file_path = tmp_path / "README.md"
+    file_path.write_text(MARKDOWN_SAMPLE, encoding="utf-8")
+
+    loader, _, _ = get_loader(
+        "README.md", "application/octet-stream", str(file_path), raw_text=True
+    )
+
+    assert isinstance(loader, TextLoader)
+
+
+def test_get_loader_raw_text_leaves_pdf_alone(tmp_path):
+    """raw_text must not disturb binary formats — PDF still uses the PDF loader."""
+    from app.utils.document_loader import SafePyPDFLoader
+
+    file_path = tmp_path / "doc.pdf"
+    file_path.write_text("not a real pdf")
+
+    loader, _, file_ext = get_loader(
+        "doc.pdf", "application/pdf", str(file_path), raw_text=True
+    )
+
+    assert isinstance(loader, SafePyPDFLoader)
+    assert file_ext == "pdf"