diff --git a/api/package.json b/api/package.json index 1c40ddb33771..918546ce4724 100644 --- a/api/package.json +++ b/api/package.json @@ -80,6 +80,7 @@ "klona": "^2.0.6", "librechat-data-provider": "*", "lodash": "^4.17.23", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "meilisearch": "^0.38.0", "memorystore": "^1.6.7", @@ -102,6 +103,7 @@ "passport-jwt": "^4.0.1", "passport-ldapauth": "^3.0.1", "passport-local": "^1.0.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "sharp": "^0.33.5", "tiktoken": "^1.0.15", @@ -110,6 +112,7 @@ "undici": "^7.18.2", "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", "zod": "^3.22.4" }, "devDependencies": { diff --git a/api/server/services/Files/Documents/__tests__/documents.spec.js b/api/server/services/Files/Documents/__tests__/documents.spec.js new file mode 100644 index 000000000000..b34d61595cb6 --- /dev/null +++ b/api/server/services/Files/Documents/__tests__/documents.spec.js @@ -0,0 +1,62 @@ +const path = require('path'); +const { parseDocument } = require('~/server/services/Files/Documents/crud'); + +describe('Document Parser', () => { + test('parseDocument() parses text from docx', async () => { + const file = { + filename: 'sample.docx', + path: path.join(__dirname, 'sample.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + }; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 29, + filename: 'sample.docx', + filepath: 'document_parser', + images: [], + text: 'This is a sample DOCX file.\n\n', + }); + }); + + test('parseDocument() parses text from xlsx', async () => { + const file = { + filename: 'sample.xlsx', + path: path.join(__dirname, 'sample.xlsx'), + mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + }; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 66, + filename: 'sample.xlsx', + filepath: 'document_parser', + images: [], + text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n', + }); + }); + + test('parseDocument() throws error for unhandled document type', async () => { + const file = { + filename: 'nonexistent.file', + path: path.join(__dirname, 'nonexistent.file'), + mimetype: 'application/invalid', + }; + + await expect(parseDocument({ file })).rejects.toThrow( + 'Unsupported file type in document parser: application/invalid', + ); + }); + + test('parseDocument() throws error for empty document', async () => { + const file = { + filename: 'empty.docx', + path: path.join(__dirname, 'empty.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + }; + + await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); + }); +}); diff --git a/api/server/services/Files/Documents/__tests__/empty.docx b/api/server/services/Files/Documents/__tests__/empty.docx new file mode 100644 index 000000000000..c089246167a1 Binary files /dev/null and b/api/server/services/Files/Documents/__tests__/empty.docx differ diff --git a/api/server/services/Files/Documents/__tests__/sample.docx b/api/server/services/Files/Documents/__tests__/sample.docx new file mode 100644 index 000000000000..c7e1c02b6549 Binary files /dev/null and b/api/server/services/Files/Documents/__tests__/sample.docx differ diff --git a/api/server/services/Files/Documents/__tests__/sample.xlsx b/api/server/services/Files/Documents/__tests__/sample.xlsx new file mode 100644 index 000000000000..2abb6961d15e Binary files /dev/null and b/api/server/services/Files/Documents/__tests__/sample.xlsx differ diff --git a/api/server/services/Files/Documents/crud.js b/api/server/services/Files/Documents/crud.js new file mode 100644 index 000000000000..14239c8394d7 --- /dev/null +++ b/api/server/services/Files/Documents/crud.js @@ -0,0 +1,99 @@ +const fs = require('fs'); +const { FileSources } = require('librechat-data-provider'); +const mammoth = require('mammoth'); +const XLSX = require('xlsx'); + +/** + * Parses an uploaded document and extracts its text content and metadata. + * + * Throws an Error if it fails to parse or no text is found. + * + * @param {Express.Multer.File} file - The uploaded file to parse. + * @returns {Promise} A readable stream of the file. + */ +async function parseDocument({ file }) { + let text; + switch (file.mimetype) { + case 'application/pdf': + text = await pdfToText(file); + break; + case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + text = await wordDocToText(file); + break; + case 'application/vnd.ms-excel': + case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': + text = excelSheetToText(file); + break; + default: + throw new Error(`Unsupported file type in document parser: ${file.mimetype}`); + } + + if (!text?.trim()) { + throw Error('No text found in document'); + } + + return { + filename: file.filename, + bytes: Buffer.byteLength(text, 'utf8'), + filepath: FileSources.document_parser, + text, + images: [], + }; +} + +/** + * Parses PDF, returns text inside. + * + * @param {Express.Multer.File} file - The file. + * @returns {Promise} the text contents of the PDF. + */ +async function pdfToText(file) { + // Imported inline so that Jest can test other routes without failing due to loading ESM + const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs'); + + const data = new Uint8Array(await fs.promises.readFile(file.path)); + const pdf = await getDocument({ data }).promise; + + // Extract text from all pages + let fullText = ''; + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + const pageText = textContent.items.map((item) => item.str).join(' '); + fullText += pageText + '\n'; + } + + return fullText; +} + +/** + * Parses Word document, returns text inside. + * + * @param {Express.Multer.File} file - The file. + * @returns {Promise} the text contents of the Word document. + */ +async function wordDocToText(file) { + const rawText = await mammoth.extractRawText({ path: file.path }); + return rawText.value; +} + +/** + * Parses Excel sheet, returns text inside. + * + * @param {Express.Multer.File} file - The file. + * @returns {string} the text contents of the XLS/XLSX. + */ +function excelSheetToText(file) { + const workbook = XLSX.readFile(file.path); + + let text = ''; + workbook.SheetNames.forEach((sheetName) => { + const worksheet = workbook.Sheets[sheetName]; + const worksheetAsCsvString = XLSX.utils.sheet_to_csv(worksheet); + text += `${sheetName}:\n${worksheetAsCsvString}\n`; + }); + + return text; +} + +module.exports = { parseDocument }; diff --git a/api/server/services/Files/strategies.js b/api/server/services/Files/strategies.js index 2ad526194b78..0f669824a035 100644 --- a/api/server/services/Files/strategies.js +++ b/api/server/services/Files/strategies.js @@ -51,6 +51,7 @@ const { const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI'); const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code'); const { uploadVectors, deleteVectors } = require('./VectorDB'); +const { parseDocument } = require('~/server/services/Files/Documents/crud'); /** * Firebase Storage Strategy Functions @@ -246,6 +247,26 @@ const vertexMistralOCRStrategy = () => ({ handleFileUpload: uploadGoogleVertexMistralOCR, }); +const documentParserStrategy = () => ({ + /** @type {typeof saveFileFromURL | null} */ + saveURL: null, + /** @type {typeof getLocalFileURL | null} */ + getFileURL: null, + /** @type {typeof saveLocalBuffer | null} */ + saveBuffer: null, + /** @type {typeof processLocalAvatar | null} */ + processAvatar: null, + /** @type {typeof uploadLocalImage | null} */ + handleImageUpload: null, + /** @type {typeof prepareImagesLocal | null} */ + prepareImagePayload: null, + /** @type {typeof deleteLocalFile | null} */ + deleteFile: null, + /** @type {typeof getLocalFileStream | null} */ + getDownloadStream: null, + handleFileUpload: parseDocument, +}); + // Strategy Selector const getStrategyFunctions = (fileSource) => { if (fileSource === FileSources.firebase) { @@ -270,6 +291,8 @@ const getStrategyFunctions = (fileSource) => { return azureMistralOCRStrategy(); } else if (fileSource === FileSources.vertexai_mistral_ocr) { return vertexMistralOCRStrategy(); + } else if (fileSource === FileSources.document_parser) { + return documentParserStrategy(); } else if (fileSource === FileSources.text) { return localStrategy(); // Text files use local strategy } else { diff --git a/package-lock.json b/package-lock.json index 62af363289be..40bf424c274e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -95,6 +95,7 @@ "klona": "^2.0.6", "librechat-data-provider": "*", "lodash": "^4.17.23", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "meilisearch": "^0.38.0", "memorystore": "^1.6.7", @@ -117,6 +118,7 @@ "passport-jwt": "^4.0.1", "passport-ldapauth": "^3.0.1", "passport-local": "^1.0.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "sharp": "^0.33.5", "tiktoken": "^1.0.15", @@ -125,6 +127,7 @@ "undici": "^7.18.2", "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", "zod": "^3.22.4" }, "devDependencies": { @@ -11380,6 +11383,256 @@ "sparse-bitfield": "^3.0.3" } }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.88.tgz", + "integrity": "sha512-/p08f93LEbsL5mDZFQ3DBxcPv/I4QG9EDYRRq1WNlCOXVfAHBTHMSVMwxlqG/AtnSfUr9+vgfN7MKiyDo0+Weg==", + "license": "MIT", + "optional": true, + "workspaces": [ + "e2e/*" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.88", + "@napi-rs/canvas-darwin-arm64": "0.1.88", + "@napi-rs/canvas-darwin-x64": "0.1.88", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.88", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.88", + "@napi-rs/canvas-linux-arm64-musl": "0.1.88", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.88", + "@napi-rs/canvas-linux-x64-gnu": "0.1.88", + "@napi-rs/canvas-linux-x64-musl": "0.1.88", + "@napi-rs/canvas-win32-arm64-msvc": "0.1.88", + "@napi-rs/canvas-win32-x64-msvc": "0.1.88" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.88.tgz", + "integrity": "sha512-KEaClPnZuVxJ8smUWjV1wWFkByBO/D+vy4lN+Dm5DFH514oqwukxKGeck9xcKJhaWJGjfruGmYGiwRe//+/zQQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.88.tgz", + "integrity": "sha512-Xgywz0dDxOKSgx3eZnK85WgGMmGrQEW7ZLA/E7raZdlEE+xXCozobgqz2ZvYigpB6DJFYkqnwHjqCOTSDGlFdg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.88.tgz", + "integrity": "sha512-Yz4wSCIQOUgNucgk+8NFtQxQxZV5NO8VKRl9ePKE6XoNyNVC8JDqtvhh3b3TPqKK8W5p2EQpAr1rjjm0mfBxdg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.88.tgz", + "integrity": "sha512-9gQM2SlTo76hYhxHi2XxWTAqpTOb+JtxMPEIr+H5nAhHhyEtNmTSDRtz93SP7mGd2G3Ojf2oF5tP9OdgtgXyKg==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.88.tgz", + "integrity": "sha512-7qgaOBMXuVRk9Fzztzr3BchQKXDxGbY+nwsovD3I/Sx81e+sX0ReEDYHTItNb0Je4NHbAl7D0MKyd4SvUc04sg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.88.tgz", + "integrity": "sha512-kYyNrUsHLkoGHBc77u4Unh067GrfiCUMbGHC2+OTxbeWfZkPt2o32UOQkhnSswKd9Fko/wSqqGkY956bIUzruA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.88.tgz", + "integrity": "sha512-HVuH7QgzB0yavYdNZDRyAsn/ejoXB0hn8twwFnOqUbCCdkV+REna7RXjSR7+PdfW0qMQ2YYWsLvVBT5iL/mGpw==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.88.tgz", + "integrity": "sha512-hvcvKIcPEQrvvJtJnwD35B3qk6umFJ8dFIr8bSymfrSMem0EQsfn1ztys8ETIFndTwdNWJKWluvxztA41ivsEw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.88.tgz", + "integrity": "sha512-eSMpGYY2xnZSQ6UxYJ6plDboxq4KeJ4zT5HaVkUnbObNN6DlbJe0Mclh3wifAmquXfrlgTZt6zhHsUgz++AK6g==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-arm64-msvc": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.88.tgz", + "integrity": "sha512-qcIFfEgHrchyYqRrxsCeTQgpJZ/GqHiqPcU/Fvw/ARVlQeDX1VyFH+X+0gCR2tca6UJrq96vnW+5o7buCq+erA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.88.tgz", + "integrity": "sha512-ROVqbfS4QyZxYkqmaIBBpbz/BQvAR+05FXM5PAtTYVc0uyY8Y4BHJSMdGAaMf6TdIVRsQsiq+FG/dH9XhvWCFQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, "node_modules/@napi-rs/wasm-runtime": { "version": "0.2.12", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", @@ -21601,6 +21854,12 @@ "node": ">=8" } }, + "node_modules/bluebird": { + "version": "3.4.7", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", + "integrity": "sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==", + "license": "MIT" + }, "node_modules/bn.js": { "version": "5.2.2", "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-5.2.2.tgz", @@ -22854,7 +23113,6 @@ "version": "1.0.3", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", - "dev": true, "license": "MIT" }, "node_modules/cors": { @@ -24239,6 +24497,12 @@ "dev": true, "license": "MIT" }, + "node_modules/dingbat-to-unicode": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", + "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==", + "license": "BSD-2-Clause" + }, "node_modules/dlv": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", @@ -24375,6 +24639,15 @@ "resolved": "https://registry.npmjs.org/downloadjs/-/downloadjs-1.4.7.tgz", "integrity": "sha512-LN1gO7+u9xjU5oEScGFKvXhYf7Y/empUIIEAGBs1LzUq/rg5duiDrkuH5A2lQGd5jfMOb9X9usDa2oVXwJ0U/Q==" }, + "node_modules/duck": { + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", + "integrity": "sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==", + "license": "BSD", + "dependencies": { + "underscore": "^1.13.1" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -27588,6 +27861,12 @@ "integrity": "sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==", "dev": true }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "license": "MIT" + }, "node_modules/import-cwd": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/import-cwd/-/import-cwd-3.0.0.tgz", @@ -30052,6 +30331,45 @@ "node": ">=4.0" } }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, + "node_modules/jszip/node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "license": "MIT" + }, + "node_modules/jszip/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/jszip/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, "node_modules/jwa": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", @@ -30354,6 +30672,15 @@ "resolved": "packages/data-provider", "link": true }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lilconfig": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", @@ -30997,6 +31324,17 @@ "loose-envify": "cli.js" } }, + "node_modules/lop": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/lop/-/lop-0.4.2.tgz", + "integrity": "sha512-RefILVDQ4DKoRZsJ4Pj22TxE3omDO47yFpkIBoDKzkqPRISs5U1cnAdg/5583YPkWPaLIYHOKRMQSvjFsO26cw==", + "license": "BSD-2-Clause", + "dependencies": { + "duck": "^0.1.12", + "option": "~0.2.1", + "underscore": "^1.13.1" + } + }, "node_modules/lowlight": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-2.9.0.tgz", @@ -31106,6 +31444,48 @@ "tmpl": "1.0.5" } }, + "node_modules/mammoth": { + "version": "1.11.0", + "resolved": "https://registry.npmjs.org/mammoth/-/mammoth-1.11.0.tgz", + "integrity": "sha512-BcEqqY/BOwIcI1iR5tqyVlqc3KIaMRa4egSoK83YAVrBf6+yqdAAbtUcFDCWX8Zef8/fgNZ6rl4VUv+vVX8ddQ==", + "license": "BSD-2-Clause", + "dependencies": { + "@xmldom/xmldom": "^0.8.6", + "argparse": "~1.0.3", + "base64-js": "^1.5.1", + "bluebird": "~3.4.0", + "dingbat-to-unicode": "^1.0.1", + "jszip": "^3.7.1", + "lop": "^0.4.2", + "path-is-absolute": "^1.0.0", + "underscore": "^1.13.1", + "xmlbuilder": "^10.0.0" + }, + "bin": { + "mammoth": "bin/mammoth" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/mammoth/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "license": "MIT", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/mammoth/node_modules/xmlbuilder": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-10.1.1.tgz", + "integrity": "sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==", + "license": "MIT", + "engines": { + "node": ">=4.0" + } + }, "node_modules/markdown-table": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", @@ -33522,6 +33902,12 @@ "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==", "dev": true }, + "node_modules/option": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", + "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==", + "license": "BSD-2-Clause" + }, "node_modules/optionator": { "version": "0.9.3", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz", @@ -33674,7 +34060,6 @@ "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", - "dev": true, "license": "(MIT AND Zlib)" }, "node_modules/parent-module": { @@ -33953,7 +34338,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "dev": true, "engines": { "node": ">=0.10.0" } @@ -34032,6 +34416,18 @@ "node": ">= 0.10" } }, + "node_modules/pdfjs-dist": { + "version": "5.4.530", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.530.tgz", + "integrity": "sha512-r1hWsSIGGmyYUAHR26zSXkxYWLXLMd6AwqcaFYG9YUZ0GBf5GvcjJSeo512tabM4GYFhxhl5pMCmPr7Q72Rq2Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=20.16.0 || >=22.3.0" + }, + "optionalDependencies": { + "@napi-rs/canvas": "^0.1.84" + } + }, "node_modules/peek-readable": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/peek-readable/-/peek-readable-5.0.0.tgz", @@ -35713,7 +36109,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", - "dev": true, "license": "MIT" }, "node_modules/promise.series": { @@ -38176,7 +38571,6 @@ "version": "1.0.5", "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", - "dev": true, "license": "MIT" }, "node_modules/setprototypeof": { @@ -38441,7 +38835,6 @@ "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==", - "dev": true, "license": "BSD-3-Clause" }, "node_modules/sse.js": { @@ -40249,6 +40642,12 @@ "integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==", "dev": true }, + "node_modules/underscore": { + "version": "1.13.7", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", + "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==", + "license": "MIT" + }, "node_modules/undici": { "version": "7.20.0", "resolved": "https://registry.npmjs.org/undici/-/undici-7.20.0.tgz", @@ -41904,6 +42303,18 @@ } } }, + "node_modules/xlsx": { + "version": "0.20.3", + "resolved": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", + "integrity": "sha512-oLDq3jw7AcLqKWH2AhCpVTZl8mf6X2YReP+Neh0SJUzV/BdZYjth94tG5toiMB1PPrYtxOCfaoUCkvtuH+3AJA==", + "license": "Apache-2.0", + "bin": { + "xlsx": "bin/xlsx.njs" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/xml": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz", diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 360cce69ba8d..843b8cdcdd70 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -820,6 +820,7 @@ export enum OCRStrategy { CUSTOM_OCR = 'custom_ocr', AZURE_MISTRAL_OCR = 'azure_mistral_ocr', VERTEXAI_MISTRAL_OCR = 'vertexai_mistral_ocr', + DOCUMENT_PARSER = 'document_parser', } export enum SearchCategories { diff --git a/packages/data-provider/src/types/files.ts b/packages/data-provider/src/types/files.ts index ec42520bc054..1eb8c200d6d9 100644 --- a/packages/data-provider/src/types/files.ts +++ b/packages/data-provider/src/types/files.ts @@ -13,6 +13,7 @@ export enum FileSources { azure_mistral_ocr = 'azure_mistral_ocr', vertexai_mistral_ocr = 'vertexai_mistral_ocr', text = 'text', + document_parser = 'document_parser', } export const checkOpenAIStorage = (source: string) =>