diff --git a/packages/api/src/files/text.spec.ts b/packages/api/src/files/text.spec.ts
index d9739c6958d3..f04330f14e47 100644
--- a/packages/api/src/files/text.spec.ts
+++ b/packages/api/src/files/text.spec.ts
@@ -300,5 +300,73 @@ describe('text', () => {
         source: FileSources.text,
       });
     });
+
+    it.each([
+      { mimetype: 'text/markdown', originalname: 'notes.md' },
+      { mimetype: 'text/x-markdown', originalname: 'notes.md' },
+      { mimetype: 'text/md', originalname: 'notes' },
+      { mimetype: 'application/markdown', originalname: 'notes.md' },
+      { mimetype: 'application/x-markdown', originalname: 'notes.md' },
+      { mimetype: 'text/plain', originalname: 'notes.md' },
+      { mimetype: 'application/octet-stream', originalname: 'README.md' },
+      { mimetype: 'application/octet-stream', originalname: 'GUIDE.MARKDOWN' },
+      { mimetype: 'application/octet-stream', originalname: 'post.mdown' },
+      { mimetype: 'application/octet-stream', originalname: 'post.mkdn' },
+      { mimetype: 'application/octet-stream', originalname: 'post.mkd' },
+      { mimetype: 'application/octet-stream', originalname: 'docs.mdwn' },
+      { mimetype: 'text/markdown; charset=utf-8', originalname: 'notes' },
+      { mimetype: 'TEXT/MARKDOWN', originalname: 'notes' },
+      { mimetype: ' text/markdown ; charset=UTF-8 ', originalname: 'notes' },
+      { mimetype: '', originalname: 'notes.md' },
+    ])(
+      'should short-circuit to native parsing for markdown file (%o)',
+      async ({ mimetype, originalname }) => {
+        process.env.RAG_API_URL = 'http://rag-api.test';
+        const mockText = '# Heading\n\n**bold** text';
+        const mockBytes = Buffer.byteLength(mockText, 'utf8');
+
+        mockedReadFileAsString.mockResolvedValue({
+          content: mockText,
+          bytes: mockBytes,
+        });
+
+        const result = await parseText({
+          req: mockReq,
+          file: { ...mockFile, mimetype, originalname },
+          file_id: mockFileId,
+        });
+
+        expect(mockedAxios.get).not.toHaveBeenCalled();
+        expect(mockedAxios.post).not.toHaveBeenCalled();
+        expect(mockedReadFileAsString).toHaveBeenCalledWith('/tmp/test.txt', {
+          fileSize: 100,
+        });
+        expect(result).toEqual({
+          text: mockText,
+          bytes: mockBytes,
+          source: FileSources.text,
+        });
+      },
+    );
+
+    it('should still call the RAG API for non-markdown text files', async () => {
+      process.env.RAG_API_URL = 'http://rag-api.test';
+      const mockText = 'plain text content';
+
+      mockedAxios.get.mockResolvedValue({ status: 200, statusText: 'OK' });
+      mockedAxios.post.mockResolvedValue({ data: { text: mockText } });
+
+      await parseText({
+        req: mockReq,
+        file: mockFile,
+        file_id: mockFileId,
+      });
+
+      expect(mockedAxios.post).toHaveBeenCalledWith(
+        'http://rag-api.test/text',
+        expect.any(Object),
+        expect.objectContaining({ timeout: 300000 }),
+      );
+    });
   });
 });
diff --git a/packages/api/src/files/text.ts b/packages/api/src/files/text.ts
index 590ce43d1e4c..eecf6642587c 100644
--- a/packages/api/src/files/text.ts
+++ b/packages/api/src/files/text.ts
@@ -7,6 +7,48 @@ import type { ServerRequest } from '~/types';
 import { logAxiosError, readFileAsString } from '~/utils';
 import { generateShortLivedToken } from '~/crypto/jwt';
 
+/** MIME types, in lowercase and without parameters, that identify Markdown content. */
+const MARKDOWN_MIME_TYPES = new Set([
+  'text/markdown',
+  'text/x-markdown',
+  'text/md',
+  'application/markdown',
+  'application/x-markdown',
+]);
+
+/** Matches a trailing Markdown file extension, ignoring case. */
+const MARKDOWN_EXTENSIONS_RE = /\.(md|markdown|mdown|mkdn|mkd|mdwn)$/i;
+
+/**
+ * Reduces a MIME type string to a bare, lowercased form.
+ * Everything from the first `;` onward (e.g. `; charset=utf-8`) is dropped and
+ * surrounding whitespace is trimmed.
+ * @param mimetype - The raw MIME type string
+ * @returns The normalized MIME type, or an empty string when the input is empty
+ */
+function normalizeMimeType(mimetype: string): string {
+  if (!mimetype) {
+    return '';
+  }
+  const semi = mimetype.indexOf(';');
+  const base = semi === -1 ? mimetype : mimetype.slice(0, semi);
+  return base.trim().toLowerCase();
+}
+
+/**
+ * Determines whether a file should be treated as Markdown.
+ * The normalized MIME type is checked first; when it is not a known Markdown
+ * type, the extension of the original filename is tested instead.
+ * @param file - The uploaded file
+ * @returns `true` when the MIME type or the filename extension indicates Markdown
+ */
+function isMarkdownFile(file: Express.Multer.File): boolean {
+  if (MARKDOWN_MIME_TYPES.has(normalizeMimeType(file.mimetype))) {
+    return true;
+  }
+  return MARKDOWN_EXTENSIONS_RE.test(file.originalname ?? '');
+}
+
 /**
  * Attempts to parse text using RAG API, falls back to native text parsing
  * @param params - The parameters object
@@ -29,6 +71,13 @@ export async function parseText({
     return parseTextNative(file);
   }
 
+  if (isMarkdownFile(file)) {
+    logger.debug(
+      `[parseText] Markdown file detected (${file.originalname}, ${file.mimetype}), using native parsing to preserve raw formatting`,
+    );
+    return parseTextNative(file);
+  }
+
   const userId = req.user?.id;
   if (!userId) {
     logger.debug('[parseText] No user ID provided, falling back to native text parsing');