diff --git a/packages/api/src/agents/__tests__/initialize.test.ts b/packages/api/src/agents/__tests__/initialize.test.ts index 974d5f2b7fa7..c9a0fa878324 100644 --- a/packages/api/src/agents/__tests__/initialize.test.ts +++ b/packages/api/src/agents/__tests__/initialize.test.ts @@ -1231,3 +1231,130 @@ describe('initializeAgent — execute_code capability expansion', () => { ).rejects.toThrow(/google_tool_conflict/); }); }); + +describe('initializeAgent — code-generated file thread filter (regression)', () => { + /* Sibling-branched conversation regression. Pre-fix the priming chain + * filtered code-generated files by `messageId IN threadMessageIds`, + * which excluded files whose creator messageId lived on a sibling + * branch (preserved on the File record by `processCodeOutput` for + * provenance). The fix anchors `getCodeGeneratedFiles` on + * `threadFileIds` instead — file_ids referenced by the thread's + * `messages.files[]` arrays. This block locks the new contract at + * the integration boundary: assert the right call shape, not the + * underlying Mongo query (covered separately by + * `data-schemas/methods/file.spec`). */ + + beforeEach(() => { + jest.clearAllMocks(); + mockExtractLibreChatParams.mockReset(); + mockGetThreadData.mockReset(); + }); + + function setupExecuteCodeAgent() { + const { agent, req, res, loadTools, db } = createMocks({ + provider: Providers.OPENAI, + }); + agent.tools = ['execute_code']; + + /* `resendFiles: true` is the gate that opens the thread-file + * priming block in initialize.ts. Without it the whole + * codeGeneratedFiles fetch is skipped. */ + mockExtractLibreChatParams.mockReturnValue({ + resendFiles: true, + maxContextTokens: undefined, + modelOptions: { model: 'test-model' }, + }); + + return { agent, req, res, loadTools, db }; + } + + it('passes threadFileIds (not threadMessageIds) to getCodeGeneratedFiles', async () => { + const { agent, req, res, loadTools, db } = setupExecuteCodeAgent(); + + /* Simulate the branched scenario: parent message N is a sibling + * regeneration. `getThreadData` walks back from N and collects + * messageIds [N, root] plus fileIds referenced by N.files[]. */ + mockGetThreadData.mockReturnValue({ + messageIds: ['msgN', 'msgRoot'], + fileIds: ['file-pptx-skill', 'file-output-csv'], + }); + + const getCodeGeneratedFiles = jest.fn().mockResolvedValue([]); + const getUserCodeFiles = jest.fn().mockResolvedValue([]); + const getMessages = jest + .fn() + .mockResolvedValue([{ messageId: 'msgN', parentMessageId: 'msgRoot', files: [] }]); + + const dbWithThreadCalls: InitializeAgentDbMethods = { + ...db, + getMessages, + getCodeGeneratedFiles, + getUserCodeFiles, + }; + + await initializeAgent( + { + req, + res, + agent, + loadTools, + endpointOption: { endpoint: EModelEndpoint.agents }, + conversationId: 'conv-1', + parentMessageId: 'msgN', + allowedProviders: new Set([Providers.OPENAI]), + isInitialAgent: true, + codeEnvAvailable: true, + }, + dbWithThreadCalls, + ); + + expect(getCodeGeneratedFiles).toHaveBeenCalledTimes(1); + expect(getCodeGeneratedFiles).toHaveBeenCalledWith('conv-1', [ + 'file-pptx-skill', + 'file-output-csv', + ]); + /* Both functions now share the same primary anchor — symmetric + * design that closes the sibling-branch hole. */ + expect(getUserCodeFiles).toHaveBeenCalledWith(['file-pptx-skill', 'file-output-csv']); + }); + + it('skips the code-generated fetch entirely when threadFileIds is empty', async () => { + /* Empty `messages.files[]` across the thread — nothing to look up. + * The function returns early without hitting Mongo, mirroring the + * pre-fix behavior for empty-thread cases. */ + const { agent, req, res, loadTools, db } = setupExecuteCodeAgent(); + + mockGetThreadData.mockReturnValue({ + messageIds: ['msgN', 'msgRoot'], + fileIds: [], + }); + + const getCodeGeneratedFiles = jest.fn().mockResolvedValue([]); + const getUserCodeFiles = jest.fn().mockResolvedValue([]); + const getMessages = jest + .fn() + .mockResolvedValue([{ messageId: 'msgN', parentMessageId: 'msgRoot', files: [] }]); + + await initializeAgent( + { + req, + res, + agent, + loadTools, + endpointOption: { endpoint: EModelEndpoint.agents }, + conversationId: 'conv-1', + parentMessageId: 'msgN', + allowedProviders: new Set([Providers.OPENAI]), + isInitialAgent: true, + codeEnvAvailable: true, + }, + { ...db, getMessages, getCodeGeneratedFiles, getUserCodeFiles }, + ); + + expect(getCodeGeneratedFiles).toHaveBeenCalledWith('conv-1', []); + /* `getUserCodeFiles` is gated on a non-empty array at the call site, + * so it shouldn't be invoked at all. `getCodeGeneratedFiles`'s own + * empty-guard is exercised by data-schemas tests. */ + expect(getUserCodeFiles).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/api/src/agents/initialize.ts b/packages/api/src/agents/initialize.ts index 6e30904d41fb..c5ee08d7174f 100644 --- a/packages/api/src/agents/initialize.ts +++ b/packages/api/src/agents/initialize.ts @@ -233,8 +233,10 @@ export interface InitializeAgentDbMethods extends EndpointDbMethods { getToolFilesByIds: (fileIds: string[], toolSet: Set) => Promise; /** Get conversation file IDs */ getConvoFiles: (conversationId: string) => Promise; - /** Get code-generated files by conversation ID and optional message IDs */ - getCodeGeneratedFiles?: (conversationId: string, messageIds?: string[]) => Promise; + /** Get code-generated files by conversation ID and the file_ids + * referenced from messages in the current thread (collected via + * `messages.files[].file_id` during thread walk). */ + getCodeGeneratedFiles?: (conversationId: string, threadFileIds?: string[]) => Promise; /** Get user-uploaded execute_code files by file IDs (from message.files in thread) */ getUserCodeFiles?: (fileIds: string[]) => Promise; /** Get messages for a conversation (supports select for field projection) */ @@ -423,7 +425,6 @@ export async function initializeAgent( let userCodeFiles: IMongoFile[] = []; if (toolResourceSet.has(EToolResources.execute_code)) { - let threadMessageIds: string[] | undefined; let threadFileIds: string[] | undefined; if (parentMessageId && parentMessageId !== Constants.NO_PARENT && db.getMessages) { @@ -433,22 +434,29 @@ export async function initializeAgent( 'messageId parentMessageId files', ); if (messages && messages.length > 0) { - /** Single O(n) pass: build Map, traverse thread, collect both IDs */ - const threadData = getThreadData(messages, parentMessageId); - threadMessageIds = threadData.messageIds; - threadFileIds = threadData.fileIds; + /** Walk the parent chain and collect file_ids referenced by + * any message in the thread (`messages.files[].file_id`). + * Used as the primary anchor for both + * `getCodeGeneratedFiles` and `getUserCodeFiles` — + * message ids no longer needed at this layer. */ + threadFileIds = getThreadData(messages, parentMessageId).fileIds; } } - /** Code-generated files (context: execute_code) filtered by messageId */ + /** Code-generated and user-uploaded execute_code files share the + * same primary anchor: file_ids referenced by messages in the + * current thread. The two queries differ only by `context` + * (`execute_code` for generated outputs, others for uploads). + * Anchoring both on `threadFileIds` reaches files regardless of + * which sibling first generated them — see `getCodeGeneratedFiles` + * for the branched-conversation rationale. */ if (db.getCodeGeneratedFiles) { codeGeneratedFiles = (await db.getCodeGeneratedFiles( conversationId, - threadMessageIds, + threadFileIds, )) as IMongoFile[]; } - /** User-uploaded execute_code files (context: agents/message_attachment) from thread messages */ if (db.getUserCodeFiles && threadFileIds && threadFileIds.length > 0) { userCodeFiles = (await db.getUserCodeFiles(threadFileIds)) as IMongoFile[]; } diff --git a/packages/data-schemas/src/methods/file.spec.ts b/packages/data-schemas/src/methods/file.spec.ts index cb573c9df3de..9bb3895d27cc 100644 --- a/packages/data-schemas/src/methods/file.spec.ts +++ b/packages/data-schemas/src/methods/file.spec.ts @@ -318,6 +318,154 @@ describe('File Methods', () => { }); }); + describe('getCodeGeneratedFiles', () => { + /* The function filters by `file_id IN threadFileIds` — the file_ids + * referenced by messages in the current conversation thread — + * rather than by `messageId IN threadMessageIds`. The change + * matters when a code-output file is shared across sibling branches + * (regenerations); the File record's own `messageId` points at + * whichever sibling FIRST created it (preserved deliberately by + * processCodeOutput for provenance), but `threadFileIds` reaches + * any sibling that references the file via `messages.files[]`. */ + + it('finds a code-output file referenced by the current thread', async () => { + const userId = new mongoose.Types.ObjectId(); + const conversationId = uuidv4(); + const fileId = uuidv4(); + + await fileMethods.createFile({ + file_id: fileId, + user: userId, + conversationId, + messageId: 'msg-original-creator', + filename: 'output.csv', + filepath: '/uploads/output.csv', + type: 'text/csv', + bytes: 100, + context: FileContext.execute_code, + metadata: { + codeEnvRef: { + kind: 'user', + id: userId.toString(), + storage_session_id: 'sess', + file_id: fileId, + }, + }, + }); + + const files = await fileMethods.getCodeGeneratedFiles(conversationId, [fileId]); + expect(files).toHaveLength(1); + expect(files[0].file_id).toBe(fileId); + }); + + it('reaches a file whose creator messageId is on a sibling branch (regression)', async () => { + /* Branched-conversation case: sibling A creates the file (its + * messageId is preserved on the File record), sibling N + * recreates the same filename — claimCodeFile finds the existing + * record and the messageId stays at A. The current thread (parent + * = N) doesn't include A. Filtering by threadFileIds (which + * includes the file_id N's message references) reaches it. */ + const userId = new mongoose.Types.ObjectId(); + const conversationId = uuidv4(); + const fileId = uuidv4(); + + await fileMethods.createFile({ + file_id: fileId, + user: userId, + conversationId, + /* The file's messageId points at sibling A — NOT in the + * current thread [siblingN, root]. The old `messageId IN` + * filter would have excluded the file here. */ + messageId: 'siblingA', + filename: 'output.csv', + filepath: '/uploads/output.csv', + type: 'text/csv', + bytes: 100, + context: FileContext.execute_code, + metadata: { + codeEnvRef: { + kind: 'user', + id: userId.toString(), + storage_session_id: 'sess', + file_id: fileId, + }, + }, + }); + + const files = await fileMethods.getCodeGeneratedFiles(conversationId, [fileId]); + expect(files).toHaveLength(1); + expect(files[0].file_id).toBe(fileId); + }); + + it('returns empty when threadFileIds is missing or empty', async () => { + const conversationId = uuidv4(); + expect(await fileMethods.getCodeGeneratedFiles(conversationId)).toEqual([]); + expect(await fileMethods.getCodeGeneratedFiles(conversationId, [])).toEqual([]); + }); + + it('does not cross conversation boundaries even with matching file_id', async () => { + const userId = new mongoose.Types.ObjectId(); + const fileId = uuidv4(); + + await fileMethods.createFile({ + file_id: fileId, + user: userId, + conversationId: 'other-conv', + messageId: 'msg-creator', + filename: 'output.csv', + filepath: '/uploads/output.csv', + type: 'text/csv', + bytes: 100, + context: FileContext.execute_code, + metadata: { + codeEnvRef: { + kind: 'user', + id: userId.toString(), + storage_session_id: 'sess', + file_id: fileId, + }, + }, + }); + + const files = await fileMethods.getCodeGeneratedFiles('this-conv', [fileId]); + expect(files).toEqual([]); + }); + + it('excludes non-execute_code files even when file_id matches', async () => { + /* `tool_resources.execute_code.file_ids` is the source of + * threadFileIds, but `messages.files[]` includes files of + * every context. The `context: execute_code` filter prevents + * a user-uploaded chat file from being mistakenly fetched via + * this function (it'd go through getUserCodeFiles instead). */ + const userId = new mongoose.Types.ObjectId(); + const conversationId = uuidv4(); + const fileId = uuidv4(); + + await fileMethods.createFile({ + file_id: fileId, + user: userId, + conversationId, + messageId: 'msg-1', + filename: 'note.txt', + filepath: '/uploads/note.txt', + type: 'text/plain', + bytes: 100, + context: FileContext.message_attachment, + metadata: { + codeEnvRef: { + kind: 'user', + id: userId.toString(), + storage_session_id: 'sess', + file_id: fileId, + }, + }, + }); + + const files = await fileMethods.getCodeGeneratedFiles(conversationId, [fileId]); + expect(files).toEqual([]); + }); + }); + describe('updateFile', () => { it('should update file data and remove TTL', async () => { const fileId = uuidv4(); diff --git a/packages/data-schemas/src/methods/file.ts b/packages/data-schemas/src/methods/file.ts index 4261588a3380..e1b49b2c673f 100644 --- a/packages/data-schemas/src/methods/file.ts +++ b/packages/data-schemas/src/methods/file.ts @@ -96,30 +96,53 @@ export function createFileMethods(mongoose: typeof import('mongoose')) { /** * Retrieves files generated by code execution for a given conversation. - * These files are stored locally with fileIdentifier metadata for code env re-upload. + * These files are stored locally with `metadata.codeEnvRef` for code env re-upload. + * + * Filtered by `file_id IN threadFileIds` (the file_ids referenced from + * messages in the current thread's `files` arrays) rather than by + * `messageId IN threadMessageIds` (the File record's own creator id). + * This mirrors `getUserCodeFiles` and fixes the branched-conversation + * case where: + * 1. Sibling A creates `output.csv` via processCodeOutput → File + * record persists with `messageId = A`. + * 2. Sibling N (a regeneration of A's parent) recreates `output.csv`. + * `claimCodeFile` is keyed by `(filename, conversationId, context)` + * and finds A's record; `processCodeOutput` deliberately preserves + * A's `messageId` to keep file→original-creator provenance intact. + * 3. Turn N+1's user message has `parentMessageId = N` (sibling of A). + * `getThreadData` walks back from N: thread = [N, root]. A is + * NOT in that thread, so a `messageId IN [N, root]` filter + * excludes the file. + * The message at N references the file via `messages.files = [...]`, + * so `threadFileIds` (collected from that array during thread walk) + * DOES include the file_id. Filtering by that key reaches the right + * file regardless of which sibling first created it. * * @param conversationId - The conversation ID to search for - * @param messageIds - Array of messageIds to filter by (for linear thread filtering). - * While technically optional, this function returns empty if not provided. - * This is intentional: code-generated files must be filtered by thread to avoid - * including files from other branches of a conversation. - * @returns Files generated by code execution in the conversation, filtered by messageIds + * @param threadFileIds - Array of file_ids referenced by messages in + * the current thread (collected from `messages.files[].file_id`). + * While technically optional, this function returns empty if not + * provided — code-generated files must be filtered by thread to + * avoid pulling in files from other branches of a conversation. + * @returns Files generated by code execution in the conversation, + * filtered to those referenced by the current thread. */ async function getCodeGeneratedFiles( conversationId: string, - messageIds?: string[], + threadFileIds?: string[], ): Promise { if (!conversationId) { return []; } /** - * Return early if messageIds not provided - this is intentional behavior. - * Code-generated files must be filtered by thread messageIds to ensure we only - * return files relevant to the current conversation branch, not orphaned files - * from other branches or deleted messages. + * Return early if threadFileIds not provided — same rationale as + * the messageIds-based predecessor: code-generated files must be + * filtered by thread to ensure we only return files relevant to + * the current conversation branch, not orphaned files from other + * branches or deleted messages. */ - if (!messageIds || messageIds.length === 0) { + if (!threadFileIds || threadFileIds.length === 0) { return []; } @@ -127,7 +150,7 @@ export function createFileMethods(mongoose: typeof import('mongoose')) { const filter: FilterQuery = { conversationId, context: FileContext.execute_code, - messageId: { $exists: true, $in: messageIds }, + file_id: { $in: threadFileIds }, 'metadata.codeEnvRef': { $exists: true }, };