diff --git a/api/server/controllers/agents/client.js b/api/server/controllers/agents/client.js
index 47a10165e3bb..8302a5e65f7a 100644
--- a/api/server/controllers/agents/client.js
+++ b/api/server/controllers/agents/client.js
@@ -46,6 +46,7 @@ const {
   isAgentsEndpoint,
   isEphemeralAgentId,
   removeNullishValues,
+  validateVisionModel,
 } = require('librechat-data-provider');
 const { filterFilesByAgentAccess } = require('~/server/services/Files/permissions');
 const { encodeAndFormat } = require('~/server/services/Files/images/encode');
@@ -165,11 +166,37 @@ class AgentClient extends BaseClient {
    * @returns {Promise<Array<MongoFile>>}
    */
  async addImageURLs(message, attachments) {
+    const agent = this.options.agent;
+    if (!agent) {
+      return attachments;
+    }
+
+    // Determine vision capability: explicit agent.vision takes precedence,
+    // otherwise check if the model supports vision
+    let isVisionCapable = false;
+    if (agent.vision !== undefined) {
+      isVisionCapable = agent.vision === true;
+    } else {
+      const agentModel =
+        agent.model_parameters?.model ?? agent.model;
+      if (agentModel) {
+        const appConfig = this.options.req?.config;
+        isVisionCapable = validateVisionModel({
+          model: agentModel,
+          modelSpecs: appConfig?.modelSpecs,
+        });
+      }
+    }
+
+    if (!isVisionCapable) {
+      return attachments;
+    }
+
     const { files, image_urls } = await encodeAndFormat(
       this.options.req,
       attachments,
       {
-        provider: this.options.agent.provider,
+        provider: agent.provider,
         endpoint: this.options.endpoint,
       },
       VisionModes.agents,
@@ -241,6 +268,9 @@ class AgentClient extends BaseClient {
       );
     }

+    // Image content in messages is filtered by the LLM layer (_convertMessagesToOpenAIParams)
+    // when agent.vision is false; no need to strip image_urls here.
+
    /** @type {Record<string, number>} */
    const canonicalTokenCountMap = {};
    /** @type {Record<string, number>} */
@@ -840,8 +870,10 @@ class AgentClient extends BaseClient {
       customHandlers: this.options.eventHandlers,
       requestBody: config.configurable.requestBody,
       user: createSafeUser(this.options.req?.user),
-      summarizationConfig: appConfig?.summarization,
       tokenCounter,
+      modelSpecs: appConfig?.modelSpecs,
+      availableModels: appConfig?.availableModels,
+      summarizationConfig: appConfig?.summarization,
     });

     if (!run) {
diff --git a/api/server/services/AssistantService.js b/api/server/services/AssistantService.js
index a7018f715b8c..0ef91d3a997c 100644
--- a/api/server/services/AssistantService.js
+++ b/api/server/services/AssistantService.js
@@ -437,6 +437,27 @@ async function runAssistant({
   });

   const tool_outputs = await processRequiredActions(openai, actions);
+
+  // Add artifact content as user message to thread if artifacts were processed
+  if (openai.pendingArtifactContent?.length) {
+    const willAttachFileIds =
+      openai.pendingArtifactFileIds?.length &&
+      openai.pendingArtifactContent.some((item) => item?.type === ContentTypes.IMAGE_FILE);
+
+    const artifactMessage = {
+      role: 'user',
+      content: openai.pendingArtifactContent,
+    };
+    if (willAttachFileIds) {
+      artifactMessage.file_ids = openai.pendingArtifactFileIds;
+    }
+    await openai.beta.threads.messages.create(thread_id, artifactMessage);
+
+    // Clear after use
+    delete openai.pendingArtifactContent;
+    delete openai.pendingArtifactFileIds;
+  }
+
   const toolRun = await openai.beta.threads.runs.submitToolOutputs(run.id, {
     thread_id: run.thread_id,
     tool_outputs,
diff --git a/api/server/services/MCP.js b/api/server/services/MCP.js
index 5d97891c556a..ef29b6442d99 100644
--- a/api/server/services/MCP.js
+++ b/api/server/services/MCP.js
@@ -545,6 +545,7 @@ function createToolInstance({
     derivedSignal = config?.signal ? AbortSignal.any([config.signal]) : undefined;
     const mcpManager = getMCPManager(userId);
     const provider = (config?.metadata?.provider || _provider)?.toLowerCase();
+    const endpoint = config?.metadata?.endpoint;
     const { args: _args, stepId, ...toolCall } = config.toolCall ?? {};
     const flowId = `${serverName}:oauth_login:${config.metadata.thread_id}:${config.metadata.run_id}`;
@@ -574,6 +575,9 @@ function createToolInstance({
     const customUserVars =
       config?.configurable?.userMCPAuthMap?.[`${Constants.mcp_prefix}${serverName}`];

+    // mcpManager.callTool returns FormattedContentResult: [content, artifacts]
+    // This tuple format is already handled by formatToolContent in @librechat/api
+    // and is compatible with responseFormat: CONTENT_AND_ARTIFACT
     const result = await mcpManager.callTool({
       serverName,
       toolName,
diff --git a/api/server/services/ToolService.js b/api/server/services/ToolService.js
index ca75e7eb4f45..6c8dc77318a6 100644
--- a/api/server/services/ToolService.js
+++ b/api/server/services/ToolService.js
@@ -189,14 +189,20 @@ async function processRequiredActions(client, requiredActions) {
     let tool = ToolMap[currentAction.tool] ?? ActionToolMap[currentAction.tool];

     const handleToolOutput = async (output) => {
+      // For MCP tools, output is [content, artifact] array
+      // Store the full array in requiredActions[i].output for artifact processing
+      // For tool output to OpenAI, we'll extract just the content
       requiredActions[i].output = output;

+      // Extract content for tool call display (first element of array if array, otherwise output itself)
+      const outputContent = Array.isArray(output) && output.length >= 1 ? output[0] : output;
+
       /** @type {FunctionToolCall & PartMetadata} */
       const toolCall = {
         function: {
           name: currentAction.tool,
           arguments: JSON.stringify(currentAction.toolInput),
-          output,
+          output: outputContent,
         },
         id: currentAction.toolCallId,
         type: 'function',
@@ -207,7 +213,7 @@ async function processRequiredActions(client, requiredActions) {
       const toolCallIndex = client.mappedOrder.get(toolCall.id);

       if (imageGenTools.has(currentAction.tool)) {
-        const imageOutput = output;
+        const imageOutput = outputContent;
         toolCall.function.output = `${currentAction.tool} displayed an image. All generated images are already plainly visible, so don't repeat the descriptions in detail. Do not list download links as they are available in the UI already. The user may download the images by clicking on them, but do not mention anything about downloading to the user.`;

         // Streams the "Finished" state of the tool call in the UI
@@ -252,9 +258,13 @@ async function processRequiredActions(client, requiredActions) {
         // result: tool.result,
       });

+      // For MCP tools with artifacts, return the content string for OpenAI tool output
+      // The full array [content, artifact] is stored in requiredActions[i].output for artifact processing
+      const finalOutput = outputContent;
+
       return {
         tool_call_id: currentAction.toolCallId,
-        output,
+        output: finalOutput,
       };
     };
@@ -410,8 +420,55 @@ async function processRequiredActions(client, requiredActions) {
     }
   }

+  const tool_outputs = await Promise.all(promises);
+
+  // Process artifacts from MCP tools and prepare for next user message
+  const allArtifacts = [];
+  for (let i = 0; i < requiredActions.length; i++) {
+    const action = requiredActions[i];
+    // MCP tools return [content, artifact] format
+    // For OpenRouter (string format): [string, artifacts]
+    // For OpenAI-compatible (array format): [[contentArray], artifacts]
+    if (
+      action.output &&
+      Array.isArray(action.output) &&
+      action.output.length === 2 &&
+      action.output[1]?.content
+    ) {
+      allArtifacts.push({
+        artifacts: action.output[1],
+        toolName: action.tool,
+      });
+    }
+  }
+
+  if (allArtifacts.length > 0) {
+    const isVisionModel = getVisionCapability(client);
+    const artifactFileIds = [];
+    const artifactContent = [];
+
+    for (const { artifacts, toolName } of allArtifacts) {
+      const processed = await processArtifactsForAssistants({
+        artifacts,
+        isVisionModel,
+        req: client.req,
+        thread_id: requiredActions[0].thread_id,
+        conversationId:
+          (client.responseMessage ?? client.finalMessage)?.conversationId,
+      });
+
+      artifactFileIds.push(...processed.fileIds);
+      artifactContent.push(...processed.contentParts);
+    }
+
+    if (artifactContent.length > 0 || artifactFileIds.length > 0) {
+      client.pendingArtifactContent = artifactContent;
+      client.pendingArtifactFileIds = artifactFileIds;
+    }
+  }
+
   return {
-    tool_outputs: await Promise.all(promises),
+    tool_outputs,
   };
 }
diff --git a/client/src/common/agents-types.ts b/client/src/common/agents-types.ts
index c3ea06f89055..d42713a07d7e 100644
--- a/client/src/common/agents-types.ts
+++ b/client/src/common/agents-types.ts
@@ -23,6 +23,7 @@ export type TAgentCapabilities = {
   [AgentCapabilities.web_search]: boolean;
   [AgentCapabilities.file_search]: boolean;
   [AgentCapabilities.execute_code]: boolean;
+  [AgentCapabilities.vision]: boolean;
   [AgentCapabilities.end_after_tools]?: boolean;
   [AgentCapabilities.hide_sequential_outputs]?: boolean;
 };
diff --git a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx
index 62072e49e591..5d395ebb2647 100644
--- a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx
+++ b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx
@@ -37,6 +37,7 @@ import { useGetStartupConfig } from '~/data-provider';
 import { ephemeralAgentByConvoId } from '~/store';
 import { MenuItemProps } from '~/common';
 import { cn } from '~/utils';
+import { useVisionModel } from '~/hooks';

 type FileUploadType =
   | 'image'
@@ -80,6 +81,7 @@ const AttachFileMenu = ({
   const { agentsConfig } = useGetAgentsConfig();
   const { data: startupConfig } = useGetStartupConfig();
   const sharePointEnabled = startupConfig?.sharePointFilePickerEnabled;
+  const isVisionModel = useVisionModel();
   const [isSharePointDialogOpen, setIsSharePointDialogOpen] = useState(false);
@@ -89,10 +91,9 @@ const AttachFileMenu = ({
    *
    */
   const capabilities = useAgentCapabilities(agentsConfig?.capabilities ?? defaultAgentCapabilities);
-  const { fileSearchAllowedByAgent, codeAllowedByAgent, provider } = useAgentToolPermissions(
-    agentId,
-    ephemeralAgent,
-  );
+  const { fileSearchAllowedByAgent, codeAllowedByAgent, visionEnabledByAgent, provider } =
+    useAgentToolPermissions(agentId, ephemeralAgent);
+  const isVisionAvailable = isVisionModel || visionEnabledByAgent;

   const handleUploadClick = (fileType?: FileUploadType) => {
     if (!inputRef.current) {
@@ -135,32 +136,39 @@ const AttachFileMenu = ({
       isDocumentSupportedProvider(currentProvider) ||
       isAzureWithResponsesApi
     ) {
-      items.push({
-        label: localize('com_ui_upload_provider'),
-        onClick: () => {
-          setToolResource(undefined);
-          let fileType: Exclude = 'image_document';
-          if (currentProvider === Providers.GOOGLE || currentProvider === Providers.OPENROUTER) {
-            fileType = 'image_document_video_audio';
-          } else if (
-            currentProvider === Providers.BEDROCK ||
-            endpointType === EModelEndpoint.bedrock
-          ) {
-            fileType = 'image_document_extended';
-          }
-          onAction(fileType);
-        },
-        icon: ,
-      });
+      if (isVisionAvailable) {
+        items.push({
+          label: localize('com_ui_upload_provider'),
+          onClick: () => {
+            setToolResource(undefined);
+            let fileType: Exclude = 'image_document';
+            if (
+              currentProvider === Providers.GOOGLE ||
+              currentProvider === Providers.OPENROUTER
+            ) {
+              fileType = 'image_document_video_audio';
+            } else if (
+              currentProvider === Providers.BEDROCK ||
+              endpointType === EModelEndpoint.bedrock
+            ) {
+              fileType = 'image_document_extended';
+            }
+            onAction(fileType);
+          },
+          icon: ,
+        });
+      }
     } else {
-      items.push({
-        label: localize('com_ui_upload_image_input'),
-        onClick: () => {
-          setToolResource(undefined);
-          onAction('image');
-        },
-        icon: ,
-      });
+      if (isVisionAvailable) {
+        items.push({
+          label: localize('com_ui_upload_image_input'),
+          onClick: () => {
+            setToolResource(undefined);
+            onAction('image');
+          },
+          icon: ,
+        });
+      }
     }

     if (capabilities.contextEnabled) {
@@ -237,6 +245,7 @@ const AttachFileMenu = ({
     codeAllowedByAgent,
     fileSearchAllowedByAgent,
     setIsSharePointDialogOpen,
+    isVisionAvailable,
   ]);

   const menuTrigger = (
diff --git a/client/src/components/Chat/Input/Files/DragDropModal.tsx b/client/src/components/Chat/Input/Files/DragDropModal.tsx
index cb5109c866d2..5a5ee65bfa4b 100644
--- a/client/src/components/Chat/Input/Files/DragDropModal.tsx
+++ b/client/src/components/Chat/Input/Files/DragDropModal.tsx
@@ -25,6 +25,7 @@ import {
 } from '~/hooks';
 import { ephemeralAgentByConvoId } from '~/store';
 import { useDragDropContext } from '~/Providers';
+import { useVisionModel } from '~/hooks';

 interface DragDropModalProps {
   onOptionSelect: (option: EToolResources | undefined) => void;
@@ -50,10 +51,10 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD
   const capabilities = useAgentCapabilities(agentsConfig?.capabilities ?? defaultAgentCapabilities);
   const { conversationId, agentId, endpoint, endpointType, useResponsesApi } = useDragDropContext();
   const ephemeralAgent = useRecoilValue(ephemeralAgentByConvoId(conversationId ?? ''));
-  const { fileSearchAllowedByAgent, codeAllowedByAgent, provider } = useAgentToolPermissions(
-    agentId,
-    ephemeralAgent,
-  );
+  const { fileSearchAllowedByAgent, codeAllowedByAgent, visionEnabledByAgent, provider } =
+    useAgentToolPermissions(agentId, ephemeralAgent);
+  const isVisionModel = useVisionModel();
+  const isVisionAvailable = isVisionModel || visionEnabledByAgent;

   const options = useMemo(() => {
     const _options: FileOption[] = [];
@@ -103,15 +104,15 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD
         label: localize('com_ui_upload_provider'),
         value: undefined,
         icon: ,
-        condition: validFileTypes,
+        condition: validFileTypes && isVisionAvailable,
       });
     } else {
-      // Only show image upload option if all files are images and provider doesn't support documents
       _options.push({
         label: localize('com_ui_upload_image_input'),
         value: undefined,
         icon: ,
-        condition: files.every((file) => getFileType(file)?.startsWith('image/')),
+        condition:
+          files.every((file) => getFileType(file)?.startsWith('image/')) && isVisionAvailable,
       });
     }
     if (capabilities.fileSearchEnabled && fileSearchAllowedByAgent) {
@@ -147,6 +148,7 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD
     useResponsesApi,
     codeAllowedByAgent,
     fileSearchAllowedByAgent,
+    isVisionAvailable,
   ]);

   if (!isVisible) {
diff --git a/client/src/components/SidePanel/Agents/AgentConfig.tsx b/client/src/components/SidePanel/Agents/AgentConfig.tsx
index 5b1a0595c5f3..9ffdd8f790b4 100644
--- a/client/src/components/SidePanel/Agents/AgentConfig.tsx
+++ b/client/src/components/SidePanel/Agents/AgentConfig.tsx
@@ -1,7 +1,13 @@
-import React, { useState, useMemo, useCallback } from 'react';
+import React, { useState, useMemo, useCallback, useEffect } from 'react';
 import { useToastContext } from '@librechat/client';
 import { Controller, useWatch, useFormContext } from 'react-hook-form';
-import { EModelEndpoint, getEndpointField } from 'librechat-data-provider';
+import {
+  EModelEndpoint,
+  getEndpointField,
+  defaultAgentCapabilities,
+  validateVisionModel,
+  AgentCapabilities,
+} from 'librechat-data-provider';
 import type { AgentForm, IconComponentTypes } from '~/common';
 import {
   removeFocusOutlines,
@@ -18,7 +24,7 @@ import AgentCategorySelector from './AgentCategorySelector';
 import Action from '~/components/SidePanel/Builder/Action';
 import { useLocalize, useVisibleTools } from '~/hooks';
 import { Panel, isEphemeralAgent } from '~/common';
-import { useGetAgentFiles } from '~/data-provider';
+import { useGetAgentFiles, useGetStartupConfig } from '~/data-provider';
 import { icons } from '~/hooks/Endpoint/Icons';
 import Instructions from './Instructions';
 import AgentAvatar from './AgentAvatar';
@@ -29,6 +35,7 @@ import Artifacts from './Artifacts';
 import AgentTool from './AgentTool';
 import CodeForm from './Code/Form';
 import MCPTools from './MCPTools';
+import ImageVision from './ImageVision';

 const labelClass = 'mb-2 text-token-text-primary block text-sm font-medium';
 const inputClass = cn(
@@ -64,8 +71,12 @@ export default function AgentConfig() {
   const agent = useWatch({ control, name: 'agent' });
   const tools = useWatch({ control, name: 'tools' });
   const agent_id = useWatch({ control, name: 'id' });
+  const vision = useWatch({ control, name: AgentCapabilities.vision });
+  const modelParameters = useWatch({ control, name: 'model_parameters' });

   const { data: agentFiles = [] } = useGetAgentFiles(agent_id);
+  const { data: startupConfig } = useGetStartupConfig();
+  const { setValue, getValues } = methods;

   const mergedFileMap = useMemo(() => {
     const newFileMap = { ...fileMap };
@@ -85,7 +96,32 @@ export default function AgentConfig() {
     artifactsEnabled,
     webSearchEnabled,
     fileSearchEnabled,
-  } = useAgentCapabilities(agentsConfig?.capabilities);
+    visionEnabled,
+  } = useAgentCapabilities(agentsConfig?.capabilities ?? defaultAgentCapabilities);
+
+  // Auto-update vision when model changes if vision was not explicitly set
+  useEffect(() => {
+    // Only update if vision is undefined (not explicitly set)
+    if (vision !== undefined) {
+      return;
+    }
+
+    const agentModel = (modelParameters as { model?: string })?.model ?? model;
+    if (!agentModel) {
+      return;
+    }
+
+    const autoVision = validateVisionModel({
+      model: agentModel,
+      modelSpecs: startupConfig?.modelSpecs,
+      availableModels: startupConfig?.availableModels,
+    });

+    // Only update if the calculated value differs from current form value
+    if (getValues(AgentCapabilities.vision) !== autoVision) {
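+      // shouldDirty: false so this automatic update does not mark the form as user-edited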
+      setValue(AgentCapabilities.vision, autoVision, { shouldDirty: false });
+    }
+  }, [model, modelParameters, vision, startupConfig, setValue, getValues]);

   const context_files = useMemo(() => {
     if (typeof agent === 'string') {
@@ -288,7 +324,8 @@ export default function AgentConfig() {
                 fileSearchEnabled ||
                 artifactsEnabled ||
                 contextEnabled ||
-                webSearchEnabled) && (
+                webSearchEnabled ||
+                visionEnabled) && (
           )}

           {/* MCP Section */}
diff --git a/client/src/components/SidePanel/Agents/AgentPanel.tsx b/client/src/components/SidePanel/Agents/AgentPanel.tsx
index 441f5fd6e6a5..6eba7ec6d206 100644
--- a/client/src/components/SidePanel/Agents/AgentPanel.tsx
+++ b/client/src/components/SidePanel/Agents/AgentPanel.tsx
@@ -71,6 +71,7 @@ export function composeAgentUpdatePayload(data: AgentForm, agent_id?: string | n
     edges,
     end_after_tools,
     hide_sequential_outputs,
+    vision,
     recursion_limit,
     category,
     support_contact,
@@ -97,6 +98,7 @@ export function composeAgentUpdatePayload(data: AgentForm, agent_id?: string | n
     edges,
     end_after_tools,
     hide_sequential_outputs,
+    vision,
     recursion_limit,
     category,
     support_contact,
diff --git a/client/src/components/SidePanel/Agents/AgentSelect.tsx b/client/src/components/SidePanel/Agents/AgentSelect.tsx
index 8655780a9c7a..39ed1675c84e 100644
--- a/client/src/components/SidePanel/Agents/AgentSelect.tsx
+++ b/client/src/components/SidePanel/Agents/AgentSelect.tsx
@@ -2,13 +2,17 @@ import { EarthIcon } from 'lucide-react';
 import { ControlCombobox } from '@librechat/client';
 import { memo, useCallback, useEffect, useRef } from 'react';
 import { useFormContext, Controller } from 'react-hook-form';
-import { AgentCapabilities, defaultAgentFormValues } from 'librechat-data-provider';
+import {
+  AgentCapabilities,
+  defaultAgentFormValues,
+  validateVisionModel,
+} from 'librechat-data-provider';
 import type { UseMutationResult, QueryObserverResult } from '@tanstack/react-query';
 import type { Agent, AgentCreateParams } from 'librechat-data-provider';
 import type { TAgentCapabilities, AgentForm } from '~/common';
 import { cn, createProviderOption, processAgentOption, getDefaultAgentFormValues } from '~/utils';
 import { useLocalize, useAgentDefaultPermissionLevel } from '~/hooks';
-import { useListAgentsQuery } from '~/data-provider';
+import { useListAgentsQuery, useGetStartupConfig } from '~/data-provider';

 const keys = new Set(Object.keys(defaultAgentFormValues));

@@ -27,6 +31,7 @@ function AgentSelect({
   const lastSelectedAgent = useRef(null);
   const { control, reset } = useFormContext();
   const permissionLevel = useAgentDefaultPermissionLevel();
+  const { data: startupConfig } = useGetStartupConfig();

   const { data: agents = null } = useListAgentsQuery(
     { requiredPermission: permissionLevel },
@@ -54,10 +59,28 @@ function AgentSelect({
       icon: isGlobal ?  : null,
     };

+    // Get vision from top-level agent or from latest version if not present
+    // If not explicitly set, automatically determine from model
+    const explicitVision =
+      fullAgent.vision ?? fullAgent.versions?.[fullAgent.versions.length - 1]?.vision;
+    const agentModel =
+      (fullAgent.model_parameters as { model?: string })?.model ?? fullAgent.model;
+    const agentVision =
+      explicitVision !== undefined
+        ? explicitVision
+        : agentModel
+          ? validateVisionModel({
+              model: agentModel,
+              modelSpecs: startupConfig?.modelSpecs,
+              availableModels: startupConfig?.availableModels,
+            })
+          : false;
+
     const capabilities: TAgentCapabilities = {
       [AgentCapabilities.web_search]: false,
       [AgentCapabilities.file_search]: false,
       [AgentCapabilities.execute_code]: false,
+      [AgentCapabilities.vision]: agentVision,
       [AgentCapabilities.end_after_tools]: false,
       [AgentCapabilities.hide_sequential_outputs]: false,
     };
diff --git a/client/src/components/SidePanel/Agents/ImageVision.tsx b/client/src/components/SidePanel/Agents/ImageVision.tsx
index bc4e1178966b..9a629b068e87 100644
--- a/client/src/components/SidePanel/Agents/ImageVision.tsx
+++ b/client/src/components/SidePanel/Agents/ImageVision.tsx
@@ -1,40 +1,66 @@
-import { Checkbox } from '@librechat/client';
-import { Capabilities } from 'librechat-data-provider';
+import { AgentCapabilities } from 'librechat-data-provider';
 import { useFormContext, Controller } from 'react-hook-form';
+import {
+  Checkbox,
+  HoverCard,
+  HoverCardContent,
+  HoverCardPortal,
+  HoverCardTrigger,
+  CircleHelpIcon,
+} from '@librechat/client';
 import type { AgentForm } from '~/common';
 import { useLocalize } from '~/hooks';
+import { ESide } from '~/common';

 export default function ImageVision() {
   const localize = useLocalize();
   const methods = useFormContext<AgentForm>();
-  const { control, setValue, getValues } = methods;
+  const { control } = methods;

   return (
-    <div className="flex items-center">
-      <Controller
-        name={Capabilities.image_vision}
-        control={control}
-        render={({ field }) => (
-          <Checkbox
-            {...field}
-            checked={field.value}
-            onCheckedChange={field.onChange}
-          />
-        )}
-      />
-      <button
-        type="button"
-        onClick={() =>
-          setValue(Capabilities.image_vision, !getValues(Capabilities.image_vision), {
-            shouldDirty: true,
-          })
-        }
-      >
-        {localize('com_assistants_image_vision')}
-      </button>
-    </div>
+    <HoverCard openDelay={50}>
+      <div className="flex items-center">
+        <Controller
+          name={AgentCapabilities.vision}
+          control={control}
+          render={({ field }) => (
+            <Checkbox
+              {...field}
+              checked={field.value}
+              onCheckedChange={field.onChange}
+            />
+          )}
+        />
+        <label htmlFor={AgentCapabilities.vision}>
+          {localize('com_agents_enable_image_vision')}
+        </label>
+        <HoverCardTrigger>
+          <CircleHelpIcon />
+        </HoverCardTrigger>
+        <HoverCardPortal>
+          <HoverCardContent side={ESide.Top}>
+            {localize('com_agents_image_vision_info')}
+          </HoverCardContent>
+        </HoverCardPortal>
+      </div>
+    </HoverCard>
   );
 }
diff --git a/client/src/hooks/Agents/useAgentCapabilities.ts b/client/src/hooks/Agents/useAgentCapabilities.ts
index a0f3de025ecd..09631d409705 100644
--- a/client/src/hooks/Agents/useAgentCapabilities.ts
+++ b/client/src/hooks/Agents/useAgentCapabilities.ts
@@ -10,6 +10,7 @@ interface AgentCapabilitiesResult {
   fileSearchEnabled: boolean;
   webSearchEnabled: boolean;
   codeEnabled: boolean;
+  visionEnabled: boolean;
   deferredToolsEnabled: boolean;
   programmaticToolsEnabled: boolean;
 }
@@ -57,6 +58,11 @@ export default function useAgentCapabilities(
     [capabilities],
   );

+  const visionEnabled = useMemo(
+    () => capabilities?.includes(AgentCapabilities.vision) ?? false,
+    [capabilities],
+  );
+
   const deferredToolsEnabled = useMemo(
     () => capabilities?.includes(AgentCapabilities.deferred_tools) ?? false,
     [capabilities],
@@ -76,6 +82,7 @@ export default function useAgentCapabilities(
     artifactsEnabled,
     webSearchEnabled,
     fileSearchEnabled,
+    visionEnabled,
     deferredToolsEnabled,
     programmaticToolsEnabled,
   };
diff --git a/client/src/hooks/Agents/useAgentToolPermissions.ts b/client/src/hooks/Agents/useAgentToolPermissions.ts
index cff9e9635bbf..96627b5098c5 100644
--- a/client/src/hooks/Agents/useAgentToolPermissions.ts
+++ b/client/src/hooks/Agents/useAgentToolPermissions.ts
@@ -8,6 +8,8 @@ import { isEphemeralAgent } from '~/common';
 interface AgentToolPermissionsResult {
   fileSearchAllowedByAgent: boolean;
   codeAllowedByAgent: boolean;
+  /** True when the current agent has image vision enabled (shows upload-to-provider in chat input) */
+  visionEnabledByAgent: boolean;
   tools: string[] | undefined;
   provider?: string;
 }
@@ -64,9 +66,21 @@ export default function useAgentToolPermissions(
     return tools?.includes(Tools.execute_code) ?? false;
   }, [agentId, selectedAgent, tools, ephemeralAgent]);

+  const visionEnabledByAgent = useMemo(() => {
+    if (agentId == null || agentId === '') return false;
+    const agent = agentData ?? selectedAgent;
+    const vision = agent?.vision;
+    if (vision !== undefined) return vision;
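+    // No explicit flag on the agent: fall back to the latest version's vision value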
+    const versions = (agent as { versions?: Array<{ vision?: boolean }> })?.versions;
+    return versions?.length
+      ? (versions[versions.length - 1]?.vision ?? false)
+      : false;
+  }, [agentId, agentData, selectedAgent]);
+
   return {
     fileSearchAllowedByAgent,
     codeAllowedByAgent,
+    visionEnabledByAgent,
     provider,
     tools,
   };
diff --git a/client/src/hooks/index.ts b/client/src/hooks/index.ts
index 4b58e434c258..90b16e7ec841 100644
--- a/client/src/hooks/index.ts
+++ b/client/src/hooks/index.ts
@@ -37,3 +37,4 @@ export { default as useTextToSpeech } from './Input/useTextToSpeech';
 export { default as useGenerationsByLatest } from './useGenerationsByLatest';
 export { default as useLocalizedConfig } from './useLocalizedConfig';
 export { default as useResourcePermissions } from './useResourcePermissions';
+export { useVisionModel } from './useVisionModel';
\ No newline at end of file
diff --git a/client/src/hooks/useVisionModel.ts b/client/src/hooks/useVisionModel.ts
new file mode 100644
index 000000000000..3f911ad686f0
--- /dev/null
+++ b/client/src/hooks/useVisionModel.ts
@@ -0,0 +1,25 @@
+import { useMemo } from 'react';
+import { validateVisionModel } from 'librechat-data-provider';
+import { useChatContext } from '~/Providers';
+import { useGetStartupConfig } from '~/data-provider';
+
+/**
+ * Hook to determine if the current conversation model supports vision capabilities.
+ * Checks modelSpecs configuration first, then falls back to the hardcoded list.
+ */
+export function useVisionModel(): boolean {
+  const { conversation } = useChatContext();
+  const { data: startupConfig } = useGetStartupConfig();
+
+  return useMemo(() => {
+    const model = conversation?.model;
+    if (!model) {
+      return false;
+    }
+
+    return validateVisionModel({
+      model,
+      modelSpecs: startupConfig?.modelSpecs,
+    });
+  }, [conversation?.model, startupConfig?.modelSpecs]);
+}
diff --git a/client/src/locales/en/translation.json b/client/src/locales/en/translation.json
index 89cc21abc745..2a0bbb7fadc2 100644
--- a/client/src/locales/en/translation.json
+++ b/client/src/locales/en/translation.json
@@ -44,6 +44,7 @@
   "com_agents_description_placeholder": "Optional: Describe your Agent here",
   "com_agents_empty_state_heading": "No agents found",
   "com_agents_enable_file_search": "Enable File Search",
+  "com_agents_enable_image_vision": "Enable Image Vision",
   "com_agents_error_bad_request_message": "The request could not be processed.",
   "com_agents_error_bad_request_suggestion": "Please check your input and try again.",
   "com_agents_error_category_title": "Category Error",
@@ -72,6 +73,7 @@
   "com_agents_file_context_label": "File Context",
   "com_agents_file_search_disabled": "Agent must be created before uploading files for File Search.",
   "com_agents_file_search_info": "When enabled, the agent will be informed of the exact filenames listed below, allowing it to retrieve relevant context from these files.",
   "com_agents_grid_announcement": "Showing {{count}} agents in {{category}} category",
+  "com_agents_image_vision_info": "Vision capability is automatically determined from the agent's model. Enable this checkbox to manually override and force vision capability on, or disable to force it off. When enabled, images generated by MCP tools will be sent back to the LLM. For non-vision models, disable to prevent context overflow errors.",
   "com_agents_instructions_placeholder": "The system instructions that the agent uses",
   "com_agents_link_copied": "Link copied",
diff --git a/packages/api/src/agents/added.ts b/packages/api/src/agents/added.ts
index 587f3bc437e6..5ccaabe558e6 100644
--- a/packages/api/src/agents/added.ts
+++ b/packages/api/src/agents/added.ts
@@ -105,14 +105,21 @@ export async function loadAddedAgent(
       '';
     const ephemeralId = encodeEphemeralAgentId({ endpoint, model, sender, index: 1 });
-    return {
+    const added: Record<string, unknown> = {
       id: ephemeralId,
       instructions: promptPrefix || '',
       provider: endpoint,
       model_parameters: {},
       model,
       tools: [...primaryAgent.tools],
-    } as unknown as Agent;
+    };
+    if (modelSpec?.vision !== undefined) {
+      added.vision = modelSpec.vision;
+    }
+    if (spec != null && spec !== '') {
+      added.spec = spec;
+    }
+    return added as unknown as Agent;
   }

   const ephemeralAgent = rest.ephemeralAgent as
@@ -226,5 +233,12 @@ export async function loadAddedAgent(
     result.artifacts = ephemeralAgent.artifacts;
   }

+  if (modelSpec?.vision !== undefined) {
+    result.vision = modelSpec.vision;
+  }
+  if (spec != null && spec !== '') {
+    result.spec = spec;
+  }
+
   return result as unknown as Agent;
 }
diff --git a/packages/api/src/agents/load.ts b/packages/api/src/agents/load.ts
index 05746d1195f0..4a054d2209fd 100644
--- a/packages/api/src/agents/load.ts
+++ b/packages/api/src/agents/load.ts
@@ -130,6 +130,14 @@ export async function loadEphemeralAgent(
   if (ephemeralAgent?.artifacts) {
     result.artifacts = ephemeralAgent.artifacts;
   }
+
+  if (modelSpec?.vision !== undefined) {
+    result.vision = modelSpec.vision;
+  }
+  if (spec != null && spec !== '') {
+    result.spec = spec;
+  }
+
   return result as Agent;
 }
diff --git a/packages/api/src/agents/run.ts b/packages/api/src/agents/run.ts
index b6b5e6a14d90..d8057bb90752 100644
--- a/packages/api/src/agents/run.ts
+++ b/packages/api/src/agents/run.ts
@@ -1,5 +1,11 @@
 import { Run, Providers, Constants } from '@librechat/agents';
-import { providerEndpointMap, KnownEndpoints } from 'librechat-data-provider';
+import {
+  providerEndpointMap,
+  KnownEndpoints,
+  type TSpecsConfig,
+  validateVisionModel,
+} from 'librechat-data-provider';
+import type { BaseMessage } from '@langchain/core/messages';
 import type {
   SummarizationConfig as AgentSummarizationConfig,
   MultiAgentGraphConfig,
@@ -145,6 +151,7 @@ export function getReasoningKey(
   agentEndpoint?: string | null,
 ): 'reasoning_content' | 'reasoning' {
   let reasoningKey: 'reasoning_content' | 'reasoning' = 'reasoning_content';
+
   if (provider === Providers.GOOGLE) {
     reasoningKey = 'reasoning';
   } else if (
@@ -158,9 +165,53 @@ export function getReasoningKey(
   ) {
     reasoningKey = 'reasoning';
   }
+
   return reasoningKey;
 }

+/**
+ * Determines vision capability for an agent.
+ *
+ * Priority (manual specification wins over hardcoded list):
+ * 1. Explicit override (`agent.vision`) takes precedence
+ * 2. Spec-based: when agent has a `spec` and modelSpecs has that spec with vision set, use it
+ * 3. Auto-detection from model using `validateVisionModel()` (modelSpecs then hardcoded list)
+ *
+ * Model is resolved from `agent.model_parameters?.model` or `agent.model`.
+ *
+ * @param agent - The agent to check for vision capability
+ * @param modelSpecs - Optional modelSpecs configuration from librechat.yaml
+ * @param availableModels - Currently unused; accepted for signature parity with validateVisionModel
+ * @returns true if the agent supports vision, false otherwise
+ */
+function determineVisionCapability(
+  agent: RunAgent,
+  modelSpecs?: TSpecsConfig,
+  availableModels?: string[]
+): boolean {
+  if (agent.vision !== undefined) {
+    return agent.vision;
+  }
+
+  const agentSpec = (agent as { spec?: string }).spec;
+  if (agentSpec != null && agentSpec !== '' && modelSpecs?.list?.length) {
+    const specByName = modelSpecs.list.find((s) => s.name === agentSpec);
+    if (specByName?.vision !== undefined) {
+      return specByName.vision === true;
+    }
+  }
+
+  const agentModel = (agent.model_parameters as { model?: string })?.model ?? agent.model;
+  if (!agentModel) {
+    return false;
+  }
+
+  return validateVisionModel({
+    model: agentModel,
+    modelSpecs,
+  });
+}
+
 type RunAgent = Omit & {
   tools?: GenericTool[];
   maxContextTokens?: number;
@@ -257,6 +308,8 @@ export async function createRun({
   tokenCounter,
   customHandlers,
   indexTokenCountMap,
+  modelSpecs,
+  availableModels,
   summarizationConfig,
   initialSummary,
   calibrationRatio,
@@ -270,6 +323,8 @@ export async function createRun({
   streamUsage?: boolean;
   requestBody?: t.RequestBody;
   user?: IUser;
+  modelSpecs?: TSpecsConfig;
+  availableModels?: string[];
   /** Message history for extracting previously discovered tools */
   messages?: BaseMessage[];
   summarizationConfig?: SummarizationConfig;
@@ -347,12 +402,32 @@ export async function createRun({
   /** Resolves issues with new OpenAI usage field */
   if (
     customProviders.has(agent.provider) ||
-    (agent.provider === Providers.OPENAI && agent.endpoint !== agent.provider)
+    (agent.provider === Providers.OPENAI &&
+      agent.endpoint != null &&
+      agent.endpoint !== agent.provider &&
+      agent.endpoint !== Providers.OPENAI)
   ) {
     llmConfig.streamUsage = false;
     llmConfig.usage = true;
   }

+  /**
+   * Only pass max_tokens/maxTokens when it has a valid value (number >= 1).
+   * Invalid or missing values are omitted so the provider uses its default.
+   */
+  const llmConfigRecord = llmConfig as unknown as Record<string, unknown>;
+  const rawMaxTokens = llmConfigRecord.maxTokens ?? llmConfigRecord.max_tokens;
+  const isValidMaxTokens =
+    typeof rawMaxTokens === 'number' &&
+    !Number.isNaN(rawMaxTokens) &&
+    rawMaxTokens >= 1;
+  if (isValidMaxTokens) {
+    llmConfigRecord.maxTokens = rawMaxTokens;
+  } else {
+    delete llmConfigRecord.maxTokens;
+  }
+  delete llmConfigRecord.max_tokens;
+
   /**
    * Override defer_loading for tools that were discovered in previous turns.
    * This prevents the LLM from having to re-discover tools via tool_search.
@@ -382,6 +457,8 @@ export async function createRun({
   );

   const reasoningKey = getReasoningKey(provider, llmConfig, agent.endpoint);
+  const visionCapability = determineVisionCapability(agent, modelSpecs, availableModels);
+
   const agentInput: AgentInputs = {
     provider,
     reasoningKey,
@@ -394,6 +471,7 @@ export async function createRun({
     toolRegistry: agent.toolRegistry,
     maxContextTokens: effectiveMaxContextTokens,
     useLegacyContent: agent.useLegacyContent ?? false,
+    vision: visionCapability,
     discoveredTools: discoveredTools.size > 0 ? Array.from(discoveredTools) : undefined,
     summarizationEnabled: summarization.enabled,
     summarizationConfig: summarization.config,
@@ -401,6 +479,7 @@ export async function createRun({
     contextPruningConfig: summarization.contextPruning,
     maxToolResultChars: agent.maxToolResultChars,
   };
+
   agentInputs.push(agentInput);
 };
diff --git a/packages/api/src/agents/validation.ts b/packages/api/src/agents/validation.ts
index 8119c9720455..078f43fb3db4 100644
--- a/packages/api/src/agents/validation.ts
+++ b/packages/api/src/agents/validation.ts
@@ -75,6 +75,7 @@ export const agentBaseSchema = z.object({
   hide_sequential_outputs: z.boolean().optional(),
   artifacts: z.string().optional(),
   recursion_limit: z.number().optional(),
+  vision: z.boolean().optional(),
   conversation_starters: z.array(z.string()).optional(),
   tool_resources: agentToolResourcesSchema,
   tool_options: agentToolOptionsSchema,
diff --git a/packages/api/src/utils/image-helpers.ts b/packages/api/src/utils/image-helpers.ts
new file mode 100644
index 000000000000..48602c248e3f
--- /dev/null
+++ b/packages/api/src/utils/image-helpers.ts
@@ -0,0 +1,38 @@
+import type { Agents } from 'librechat-data-provider';
+
+/**
+ * Checks if an image_url content item contains base64 data (not an HTTP URL).
+ *
+ * IMPORTANT: This function is duplicated in two locations:
+ * 1. librechat/packages/api/src/utils/image-helpers.ts (for Assistants endpoint)
+ * 2. agents/src/messages/core.ts (for Agents endpoint)
+ *
+ * Both implementations MUST remain identical. The agents package cannot import
+ * from @librechat/api as it's a separate npm package.
+ *
+ * Base64 data URLs start with "data:" and can cause context overflow when
+ * sent to non-vision models. HTTP URLs are just text references and don't
+ * need filtering.
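+ * For example, 'data:image/png;base64,iVBORw0KGgo...' is filtered, while
+ * 'https://example.com/cat.png' passes through untouched.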
+ * + * @param item - Message content item to check + * @returns true if the item is an image_url with base64 data, false otherwise + */ +export function isBase64ImageUrl(item: Agents.MessageContentComplex): boolean { + if (item.type !== 'image_url') { + return false; + } + + const itemWithImageUrl = item as { image_url?: string | { url?: string } }; + const imageUrl = itemWithImageUrl.image_url; + + if (typeof imageUrl === 'string') { + return imageUrl.startsWith('data:'); + } + + if (imageUrl && typeof imageUrl === 'object' && 'url' in imageUrl) { + const url = imageUrl.url; + return typeof url === 'string' && url.startsWith('data:'); + } + + return false; +} diff --git a/packages/api/src/utils/index.ts b/packages/api/src/utils/index.ts index 2b4ac882451e..8449dc035e60 100644 --- a/packages/api/src/utils/index.ts +++ b/packages/api/src/utils/index.ts @@ -27,4 +27,5 @@ export * from './tokens'; export * from './tokenMap'; export * from './url'; export * from './message'; +export * from './image-helpers'; export * from './tracing'; diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 9bc3822c4b60..509772c48910 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -187,6 +187,7 @@ export enum AgentCapabilities { tools = 'tools', chain = 'chain', ocr = 'ocr', + vision = 'vision', } export const defaultAssistantsVersion = { @@ -276,6 +277,7 @@ export const defaultAgentCapabilities = [ AgentCapabilities.tools, AgentCapabilities.chain, AgentCapabilities.ocr, + AgentCapabilities.vision, ]; export const agentsEndpointSchema = baseEndpointSchema @@ -1339,15 +1341,31 @@ export enum VisionModes { agents = 'agents', } +/** + * Validates whether a model supports vision capabilities. + * + * Checks in order: + * 1. Exclude known non-vision models + * 2. modelSpecs configuration (highest priority if provided) + * 3. 
Hardcoded visionModels list + * + * @param model - Model identifier to check + * @param modelSpecs - Optional modelSpecs configuration from librechat.yaml + * @param availableModels - Not used (kept for backwards compatibility) + * @param additionalModels - Optional additional models to include in vision check + * @returns true if the model supports vision, false otherwise + */ export function validateVisionModel({ model, additionalModels = [], availableModels, + modelSpecs, }: { model: string; additionalModels?: string[]; availableModels?: string[]; -}) { + modelSpecs?: TSpecsConfig; +}): boolean { if (!model) { return false; } @@ -1356,10 +1374,35 @@ export function validateVisionModel({ return false; } - if (availableModels && !availableModels.includes(model)) { - return false; + if (modelSpecs?.list) { + const matchingSpec = modelSpecs.list.find( + (spec) => { + // Exact match with preset.model + if (spec.preset?.model && spec.preset.model === model) { + return true; + } + // Partial match: model contains preset.model (only if preset.model is not empty) + if (spec.preset?.model && spec.preset.model.length > 0 && model.includes(spec.preset.model)) { + return true; + } + // Exact match with spec.name + if (spec.name && spec.name === model) { + return true; + } + // Partial match: model contains spec.name (only if spec.name is not empty) + if (spec.name && spec.name.length > 0 && model.includes(spec.name)) { + return true; + } + return false; + }, + ); + + if (matchingSpec?.vision !== undefined) { + return matchingSpec.vision === true; + } } + // Fall back to hardcoded visionModels list return visionModels.concat(additionalModels).some((visionModel) => model.includes(visionModel)); } diff --git a/packages/data-provider/src/models.ts b/packages/data-provider/src/models.ts index c2dbe2cf779d..f0bd5d70b7e4 100644 --- a/packages/data-provider/src/models.ts +++ b/packages/data-provider/src/models.ts @@ -35,6 +35,7 @@ export type TModelSpec = { webSearch?: boolean; fileSearch?: boolean; executeCode?: boolean; + vision?: boolean; artifacts?: string | boolean; mcpServers?: string[]; }; @@ -55,6 +56,7 @@ export const tModelSpecSchema = z.object({ webSearch: z.boolean().optional(), fileSearch: z.boolean().optional(), executeCode: z.boolean().optional(), + vision: z.boolean().optional(), artifacts: z.union([z.string(), z.boolean()]).optional(), mcpServers: z.array(z.string()).optional(), }); diff --git a/packages/data-provider/src/types/assistants.ts b/packages/data-provider/src/types/assistants.ts index 690b2e06d26b..39e800390e85 100644 --- a/packages/data-provider/src/types/assistants.ts +++ b/packages/data-provider/src/types/assistants.ts @@ -265,6 +265,7 @@ export type Agent = { end_after_tools?: boolean; hide_sequential_outputs?: boolean; artifacts?: ArtifactModes; + vision?: boolean; recursion_limit?: number; isPublic?: boolean; version?: number;
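
A minimal usage sketch of the resolution order that validateVisionModel implements, assuming a hypothetical spec entry (the cast stands in for a fully populated TSpecsConfig; not part of the patch):

```ts
import { validateVisionModel, type TSpecsConfig } from 'librechat-data-provider';

// Hypothetical librechat.yaml spec: an internal model whose name would never
// match the hardcoded visionModels list, with vision forced on via the new flag.
const modelSpecs = {
  list: [
    {
      name: 'company-vlm',
      label: 'Company VLM',
      vision: true,
      preset: { model: 'company-vlm-large' },
    },
  ],
} as unknown as TSpecsConfig;

// Spec match ('company-vlm-large' contains preset.model): the explicit flag wins.
console.log(validateVisionModel({ model: 'company-vlm-large', modelSpecs })); // true
// Without modelSpecs, the same model falls back to the hardcoded list and fails.
console.log(validateVisionModel({ model: 'company-vlm-large' })); // false
```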