📝 fix: Preserve Raw Markdown Formatting on Upload as Text (danny-avila#12734)

danny-avila · krgokul · commit 1225a3f5504e · 2026-04-20T13:09:36.000+05:30
* 🐛 fix: Preserve Raw Markdown on `Upload as Text`

When `RAG_API_URL` is configured, `.md` uploads were sent to the RAG API
`/text` endpoint, which routes Markdown through `UnstructuredMarkdownLoader`
and strips formatting (`#`, `**`, lists, blockquotes). Users expect `Upload
as Text` to preserve raw content: identical bytes in a `.txt` file
round-tripped verbatim, while the `.md` upload came back stripped.

Short-circuit the RAG API call for Markdown files (by MIME type or `.md` /
`.markdown` extension) and read the file verbatim via `parseTextNative`.
Non-Markdown paths are unaffected, and the embedding path (`/embed`) keeps
its existing loader so vector search quality is unchanged.

* 🐛 fix: normalize markdown MIME and accept `text/md`

Addressing review feedback on the `Upload as Text` short-circuit:

- Accept `text/md` in the markdown MIME set (LibreChat treats it as a
  valid markdown type elsewhere, e.g. the artifact-rendering prompt).
- Normalize the incoming MIME type (lowercase + strip parameters) before
  the set lookup so parameterized values like
  `text/markdown; charset=utf-8` and uppercase `TEXT/MARKDOWN` still
  short-circuit. Extensionless uploads relying only on the `Content-Type`
  header would otherwise fall through to the RAG `/text` endpoint and
  lose their markdown formatting.

Extend `text.spec.ts` parametrized cases with `text/md`, parameterized
MIME, uppercase, and whitespace-padded variants.

* 🧹 chore: Address Code Review Follow-ups on `Upload as Text` fix

Addressing comprehensive review feedback:

- Debug log now includes filename and MIME type so operators can
  identify which upload triggered the short-circuit without having
  to correlate other logs.
- Expand markdown extension detection beyond `.md` / `.markdown` to
  cover `.mdown`, `.mkdn`, `.mkd`, `.mdwn` (case-insensitive regex).
- Tighten `normalizeMimeType` parameter type from `string | undefined`
  to `string` to match the `mimetype` field of `Express.Multer.File`.
  The falsy check still guards against empty strings at runtime.
- Extend parametrized tests with the most common real-world shapes:
  `text/plain` + `.md` (the MIME most browsers/servers assign),
  the new rare extensions, and empty MIME + `.md` (pure extension
  fallback path).
- Add a positive assertion that `readFileAsString` was called with the
  expected arguments on every short-circuit case, so tests fail loudly
  if the native-parse path ever regresses.

* 🧪 test: Cover `.mdwn` regex branch in Markdown short-circuit

Every other alternation in `MARKDOWN_EXTENSIONS_RE` had at least one
test case (`md`, `markdown`, `mdown`, `mkdn`, `mkd`), but `mdwn` did
not, so a typo in that branch would have gone undetected.
diff --git a/packages/api/src/files/text.spec.ts b/packages/api/src/files/text.spec.ts
@@ -300,5 +300,73 @@ describe('text', () => {
         source: FileSources.text,
       });
     });
+
+    it.each([ // Table of uploads that must be read natively instead of being sent to the RAG API.
+      { mimetype: 'text/markdown', originalname: 'notes.md' }, // Markdown MIME types (most cases also carry a `.md` name).
+      { mimetype: 'text/x-markdown', originalname: 'notes.md' },
+      { mimetype: 'text/md', originalname: 'notes' }, // No extension here: the MIME type is the only signal.
+      { mimetype: 'application/markdown', originalname: 'notes.md' },
+      { mimetype: 'application/x-markdown', originalname: 'notes.md' },
+      { mimetype: 'text/plain', originalname: 'notes.md' }, // Non-markdown MIME: detection falls to the extension.
+      { mimetype: 'application/octet-stream', originalname: 'README.md' }, // Generic MIME; one case per extension alternative follows.
+      { mimetype: 'application/octet-stream', originalname: 'GUIDE.MARKDOWN' }, // Upper-case extension exercises the case-insensitive match.
+      { mimetype: 'application/octet-stream', originalname: 'post.mdown' },
+      { mimetype: 'application/octet-stream', originalname: 'post.mkdn' },
+      { mimetype: 'application/octet-stream', originalname: 'post.mkd' },
+      { mimetype: 'application/octet-stream', originalname: 'docs.mdwn' },
+      { mimetype: 'text/markdown; charset=utf-8', originalname: 'notes' }, // Extensionless names: MIME must survive parameter stripping,
+      { mimetype: 'TEXT/MARKDOWN', originalname: 'notes' }, // case folding,
+      { mimetype: '  text/markdown ; charset=UTF-8  ', originalname: 'notes' }, // and whitespace trimming.
+      { mimetype: '', originalname: 'notes.md' }, // Empty MIME: pure extension fallback.
+    ])(
+      'should short-circuit to native parsing for markdown file (%o)',
+      async ({ mimetype, originalname }) => {
+        process.env.RAG_API_URL = 'http://rag-api.test'; // Set a RAG URL so the test targets the markdown short-circuit added in this change.
+        const mockText = '# Heading\n\n**bold** text'; // Content containing the markdown syntax that must come back untouched.
+        const mockBytes = Buffer.byteLength(mockText, 'utf8');
+
+        mockedReadFileAsString.mockResolvedValue({
+          content: mockText,
+          bytes: mockBytes,
+        });
+
+        const result = await parseText({
+          req: mockReq,
+          file: { ...mockFile, mimetype, originalname }, // Override only the two fields the markdown check inspects.
+          file_id: mockFileId,
+        });
+
+        expect(mockedAxios.get).not.toHaveBeenCalled(); // No axios request of either kind may be issued.
+        expect(mockedAxios.post).not.toHaveBeenCalled();
+        expect(mockedReadFileAsString).toHaveBeenCalledWith('/tmp/test.txt', { // Presumably mockFile's path and size (fixture defined outside this hunk) — confirm.
+          fileSize: 100,
+        });
+        expect(result).toEqual({ // The mocked file content is returned verbatim.
+          text: mockText,
+          bytes: mockBytes,
+          source: FileSources.text,
+        });
+      },
+    );
+
+    it('should still call the RAG API for non-markdown text files', async () => { // Regression guard: the markdown short-circuit must not capture other files.
+      process.env.RAG_API_URL = 'http://rag-api.test';
+      const mockText = 'plain text content';
+
+      mockedAxios.get.mockResolvedValue({ status: 200, statusText: 'OK' }); // Presumably a pre-flight check made by parseText (not visible in this hunk) — confirm.
+      mockedAxios.post.mockResolvedValue({ data: { text: mockText } });
+
+      await parseText({
+        req: mockReq,
+        file: mockFile, // Unmodified fixture; assumed to be a non-markdown file (defined outside this hunk).
+        file_id: mockFileId,
+      });
+
+      expect(mockedAxios.post).toHaveBeenCalledWith( // The upload must still be posted to the `/text` endpoint.
+        'http://rag-api.test/text',
+        expect.any(Object),
+        expect.objectContaining({ timeout: 300000 }),
+      );
+    });
   });
 });
diff --git a/packages/api/src/files/text.ts b/packages/api/src/files/text.ts
@@ -7,6 +7,32 @@ import type { ServerRequest } from '~/types';
 import { logAxiosError, readFileAsString } from '~/utils';
 import { generateShortLivedToken } from '~/crypto/jwt';
 
+const MARKDOWN_MIME_TYPES = new Set([ // MIME types treated as Markdown; entries are bare and lowercase, so inputs are normalized before lookup.
+  'text/markdown',
+  'text/x-markdown',
+  'text/md',
+  'application/markdown',
+  'application/x-markdown',
+]);
+
+const MARKDOWN_EXTENSIONS_RE = /\.(md|markdown|mdown|mkdn|mkd|mdwn)$/i; // Matches a Markdown extension at the very end of a filename, ignoring case.
+
+function normalizeMimeType(mimetype: string): string { // Reduces a MIME string to its bare, trimmed, lowercase type (parameters removed).
+  if (!mimetype) { // Any falsy input (e.g. an empty string) normalizes to ''.
+    return '';
+  }
+  const semi = mimetype.indexOf(';'); // Everything from the first ';' onward (e.g. `; charset=utf-8`) is dropped.
+  const base = semi === -1 ? mimetype : mimetype.slice(0, semi);
+  return base.trim().toLowerCase(); // Trim padding and fold case so the result can be compared by exact match.
+}
+
+function isMarkdownFile(file: Express.Multer.File): boolean { // True when the upload's MIME type or filename extension identifies it as Markdown.
+  if (MARKDOWN_MIME_TYPES.has(normalizeMimeType(file.mimetype))) { // First signal: normalized MIME type is in the Markdown set.
+    return true;
+  }
+  return MARKDOWN_EXTENSIONS_RE.test(file.originalname ?? ''); // Fallback signal: extension of the original filename; a missing name is treated as ''.
+}
+
 /**
  * Attempts to parse text using RAG API, falls back to native text parsing
  * @param params - The parameters object
@@ -29,6 +55,13 @@ export async function parseText({
     return parseTextNative(file);
   }
 
+  if (isMarkdownFile(file)) {
+    logger.debug(
+      `[parseText] Markdown file detected (${file.originalname}, ${file.mimetype}), using native parsing to preserve raw formatting`,
+    );
+    return parseTextNative(file);
+  }
+
   const userId = req.user?.id;
   if (!userId) {
     logger.debug('[parseText] No user ID provided, falling back to native text parsing');