diff --git a/client/src/common/agents-types.ts b/client/src/common/agents-types.ts index 9ac6b440a397..5d43d99a8d7e 100644 --- a/client/src/common/agents-types.ts +++ b/client/src/common/agents-types.ts @@ -20,6 +20,7 @@ export type TAgentCapabilities = { [AgentCapabilities.web_search]: boolean; [AgentCapabilities.file_search]: boolean; [AgentCapabilities.execute_code]: boolean; + [AgentCapabilities.vision]: boolean; [AgentCapabilities.end_after_tools]?: boolean; [AgentCapabilities.hide_sequential_outputs]?: boolean; }; diff --git a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx index 218328b0864d..a3ee7458a85b 100644 --- a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx +++ b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx @@ -36,6 +36,7 @@ import { useGetStartupConfig } from '~/data-provider'; import { ephemeralAgentByConvoId } from '~/store'; import { MenuItemProps } from '~/common'; import { cn } from '~/utils'; +import { useVisionModel } from '~/hooks'; type FileUploadType = 'image' | 'document' | 'image_document' | 'image_document_video_audio'; @@ -74,6 +75,7 @@ const AttachFileMenu = ({ const { agentsConfig } = useGetAgentsConfig(); const { data: startupConfig } = useGetStartupConfig(); const sharePointEnabled = startupConfig?.sharePointFilePickerEnabled; + const isVisionModel = useVisionModel(); const [isSharePointDialogOpen, setIsSharePointDialogOpen] = useState(false); @@ -127,27 +129,34 @@ const AttachFileMenu = ({ isDocumentSupportedProvider(currentProvider) || isAzureWithResponsesApi ) { - items.push({ - label: localize('com_ui_upload_provider'), - onClick: () => { - setToolResource(undefined); - let fileType: Exclude = 'image_document'; - if (currentProvider === Providers.GOOGLE || currentProvider === Providers.OPENROUTER) { - fileType = 'image_document_video_audio'; - } - onAction(fileType); - }, - icon: , - }); + if (isVisionModel) { + items.push({ + label: 
localize('com_ui_upload_provider'), + onClick: () => { + setToolResource(undefined); + let fileType: Exclude = 'image_document'; + if ( + currentProvider === Providers.GOOGLE || + currentProvider === Providers.OPENROUTER + ) { + fileType = 'image_document_video_audio'; + } + onAction(fileType); + }, + icon: , + }); + } } else { - items.push({ - label: localize('com_ui_upload_image_input'), - onClick: () => { - setToolResource(undefined); - onAction('image'); - }, - icon: , - }); + if (isVisionModel) { + items.push({ + label: localize('com_ui_upload_image_input'), + onClick: () => { + setToolResource(undefined); + onAction('image'); + }, + icon: , + }); + } } if (capabilities.contextEnabled) { @@ -224,6 +233,7 @@ const AttachFileMenu = ({ codeAllowedByAgent, fileSearchAllowedByAgent, setIsSharePointDialogOpen, + isVisionModel, ]); const menuTrigger = ( diff --git a/client/src/components/Chat/Input/Files/DragDropModal.tsx b/client/src/components/Chat/Input/Files/DragDropModal.tsx index a59a7e3e9d61..c422c371a99b 100644 --- a/client/src/components/Chat/Input/Files/DragDropModal.tsx +++ b/client/src/components/Chat/Input/Files/DragDropModal.tsx @@ -24,6 +24,7 @@ import { } from '~/hooks'; import { ephemeralAgentByConvoId } from '~/store'; import { useDragDropContext } from '~/Providers'; +import { useVisionModel } from '~/hooks'; interface DragDropModalProps { onOptionSelect: (option: EToolResources | undefined) => void; @@ -53,6 +54,7 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD agentId, ephemeralAgent, ); + const isVisionModel = useVisionModel(); const options = useMemo(() => { const _options: FileOption[] = []; @@ -96,15 +98,14 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD label: localize('com_ui_upload_provider'), value: undefined, icon: , - condition: validFileTypes, + condition: validFileTypes && isVisionModel, }); } else { - // Only show image upload option if all files are images and 
provider doesn't support documents _options.push({ label: localize('com_ui_upload_image_input'), value: undefined, icon: , - condition: files.every((file) => getFileType(file)?.startsWith('image/')), + condition: files.every((file) => getFileType(file)?.startsWith('image/')) && isVisionModel, }); } if (capabilities.fileSearchEnabled && fileSearchAllowedByAgent) { @@ -140,6 +141,7 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD useResponsesApi, codeAllowedByAgent, fileSearchAllowedByAgent, + isVisionModel, ]); if (!isVisible) { diff --git a/client/src/components/SidePanel/Agents/AgentConfig.tsx b/client/src/components/SidePanel/Agents/AgentConfig.tsx index a81ef780a9e3..f78e132f2e0d 100644 --- a/client/src/components/SidePanel/Agents/AgentConfig.tsx +++ b/client/src/components/SidePanel/Agents/AgentConfig.tsx @@ -1,7 +1,7 @@ import React, { useState, useMemo, useCallback } from 'react'; import { useToastContext } from '@librechat/client'; import { Controller, useWatch, useFormContext } from 'react-hook-form'; -import { EModelEndpoint, getEndpointField } from 'librechat-data-provider'; +import { EModelEndpoint, getEndpointField, defaultAgentCapabilities } from 'librechat-data-provider'; import type { AgentForm, IconComponentTypes } from '~/common'; import { removeFocusOutlines, @@ -29,6 +29,7 @@ import Artifacts from './Artifacts'; import AgentTool from './AgentTool'; import CodeForm from './Code/Form'; import MCPTools from './MCPTools'; +import ImageVision from './ImageVision'; const labelClass = 'mb-2 text-token-text-primary block font-medium'; const inputClass = cn( @@ -85,7 +86,8 @@ export default function AgentConfig() { artifactsEnabled, webSearchEnabled, fileSearchEnabled, - } = useAgentCapabilities(agentsConfig?.capabilities); + visionEnabled, + } = useAgentCapabilities(agentsConfig?.capabilities ?? 
defaultAgentCapabilities); const context_files = useMemo(() => { if (typeof agent === 'string') { @@ -288,7 +290,8 @@ export default function AgentConfig() { fileSearchEnabled || artifactsEnabled || contextEnabled || - webSearchEnabled) && ( + webSearchEnabled || + visionEnabled) && (
)} {/* MCP Section */} diff --git a/client/src/components/SidePanel/Agents/AgentPanel.tsx b/client/src/components/SidePanel/Agents/AgentPanel.tsx index 86ec27dc5e1d..080d89c275fe 100644 --- a/client/src/components/SidePanel/Agents/AgentPanel.tsx +++ b/client/src/components/SidePanel/Agents/AgentPanel.tsx @@ -69,6 +69,7 @@ export function composeAgentUpdatePayload(data: AgentForm, agent_id?: string | n edges, end_after_tools, hide_sequential_outputs, + vision, recursion_limit, category, support_contact, @@ -94,6 +95,7 @@ export function composeAgentUpdatePayload(data: AgentForm, agent_id?: string | n edges, end_after_tools, hide_sequential_outputs, + vision, recursion_limit, category, support_contact, diff --git a/client/src/components/SidePanel/Agents/AgentSelect.tsx b/client/src/components/SidePanel/Agents/AgentSelect.tsx index 9a3ef387c9b9..345f6356c1ac 100644 --- a/client/src/components/SidePanel/Agents/AgentSelect.tsx +++ b/client/src/components/SidePanel/Agents/AgentSelect.tsx @@ -58,6 +58,7 @@ export default function AgentSelect({ [AgentCapabilities.web_search]: false, [AgentCapabilities.file_search]: false, [AgentCapabilities.execute_code]: false, + [AgentCapabilities.vision]: false, [AgentCapabilities.end_after_tools]: false, [AgentCapabilities.hide_sequential_outputs]: false, }; diff --git a/client/src/components/SidePanel/Agents/ImageVision.tsx b/client/src/components/SidePanel/Agents/ImageVision.tsx index bc4e1178966b..643f144ec89b 100644 --- a/client/src/components/SidePanel/Agents/ImageVision.tsx +++ b/client/src/components/SidePanel/Agents/ImageVision.tsx @@ -1,40 +1,69 @@ -import { Checkbox } from '@librechat/client'; -import { Capabilities } from 'librechat-data-provider'; +import { memo } from 'react'; +import { AgentCapabilities } from 'librechat-data-provider'; import { useFormContext, Controller } from 'react-hook-form'; +import { + Checkbox, + HoverCard, + HoverCardContent, + HoverCardPortal, + HoverCardTrigger, + CircleHelpIcon, +} from 
'@librechat/client'; import type { AgentForm } from '~/common'; import { useLocalize } from '~/hooks'; +import { ESide } from '~/common'; -export default function ImageVision() { +function ImageVision() { const localize = useLocalize(); const methods = useFormContext(); - const { control, setValue, getValues } = methods; + const { control } = methods; return ( -
- ( - - )} - /> - -
+ +
+ ( + + )} + /> + + + + + + +
+

+ {localize('com_agents_image_vision_info')} +

+
+
+
+
+
); } + +export default memo(ImageVision); diff --git a/client/src/hooks/Agents/useAgentCapabilities.ts b/client/src/hooks/Agents/useAgentCapabilities.ts index 8d2bd6ef87ea..571bdfe175ed 100644 --- a/client/src/hooks/Agents/useAgentCapabilities.ts +++ b/client/src/hooks/Agents/useAgentCapabilities.ts @@ -10,6 +10,7 @@ interface AgentCapabilitiesResult { fileSearchEnabled: boolean; webSearchEnabled: boolean; codeEnabled: boolean; + visionEnabled: boolean; } export default function useAgentCapabilities( @@ -55,6 +56,11 @@ export default function useAgentCapabilities( [capabilities], ); + const visionEnabled = useMemo( + () => capabilities?.includes(AgentCapabilities.vision) ?? false, + [capabilities], + ); + return { ocrEnabled, codeEnabled, @@ -64,5 +70,6 @@ export default function useAgentCapabilities( artifactsEnabled, webSearchEnabled, fileSearchEnabled, + visionEnabled, }; } diff --git a/client/src/hooks/index.ts b/client/src/hooks/index.ts index 62682b84d8bb..6e1431421817 100644 --- a/client/src/hooks/index.ts +++ b/client/src/hooks/index.ts @@ -36,3 +36,4 @@ export { default as useTextToSpeech } from './Input/useTextToSpeech'; export { default as useGenerationsByLatest } from './useGenerationsByLatest'; export { default as useLocalizedConfig } from './useLocalizedConfig'; export { default as useResourcePermissions } from './useResourcePermissions'; +export { useVisionModel } from './useVisionModel'; diff --git a/client/src/hooks/useVisionModel.ts b/client/src/hooks/useVisionModel.ts new file mode 100644 index 000000000000..61c8d74b91d0 --- /dev/null +++ b/client/src/hooks/useVisionModel.ts @@ -0,0 +1,24 @@ +import { useMemo } from 'react'; +import { validateVisionModel } from 'librechat-data-provider'; +import { useChatContext } from '~/Providers'; +import { useGetStartupConfig } from '~/data-provider'; + +/** + * Hook to determine if the current conversation model supports vision capabilities. 
+ * Checks modelSpecs configuration first, then falls back to hardcoded list. + */ +export function useVisionModel(): boolean { + const { conversation } = useChatContext(); + const { data: startupConfig } = useGetStartupConfig(); + + return useMemo(() => { + const model = conversation?.model; + if (!model) { + return false; + } + return validateVisionModel({ + model, + modelSpecs: startupConfig?.modelSpecs, + }); + }, [conversation?.model, startupConfig?.modelSpecs]); +} diff --git a/client/src/locales/en/translation.json b/client/src/locales/en/translation.json index addf32e08adb..8d1a7d5bff39 100644 --- a/client/src/locales/en/translation.json +++ b/client/src/locales/en/translation.json @@ -40,6 +40,7 @@ "com_agents_description_placeholder": "Optional: Describe your Agent here", "com_agents_empty_state_heading": "No agents found", "com_agents_enable_file_search": "Enable File Search", + "com_agents_enable_image_vision": "Enable Image Vision", "com_agents_error_bad_request_message": "The request could not be processed.", "com_agents_error_bad_request_suggestion": "Please check your input and try again.", "com_agents_error_category_title": "Category Error", @@ -68,6 +69,7 @@ "com_agents_file_context_label": "File Context", "com_agents_file_search_disabled": "Agent must be created before uploading files for File Search.", "com_agents_file_search_info": "When enabled, the agent will be informed of the exact filenames listed below, allowing it to retrieve relevant context from these files.", + "com_agents_image_vision_info": "When enabled, images generated by MCP tools (e.g., image generation tools) will be sent back to the LLM. 
Disable this for non-vision models to prevent context overflow errors.", "com_agents_grid_announcement": "Showing {{count}} agents in {{category}} category", "com_agents_instructions_placeholder": "The system instructions that the agent uses", "com_agents_link_copied": "Link copied", diff --git a/packages/api/src/agents/run.ts b/packages/api/src/agents/run.ts index 6b18c73799e9..80823912d9ad 100644 --- a/packages/api/src/agents/run.ts +++ b/packages/api/src/agents/run.ts @@ -146,6 +146,7 @@ export async function createRun({ instructions: systemContent, maxContextTokens: agent.maxContextTokens, useLegacyContent: agent.useLegacyContent ?? false, + vision: agent.vision, }; agentInputs.push(agentInput); }; diff --git a/packages/api/src/agents/validation.ts b/packages/api/src/agents/validation.ts index 4798ffeb80ae..d64fc11fe4cc 100644 --- a/packages/api/src/agents/validation.ts +++ b/packages/api/src/agents/validation.ts @@ -66,6 +66,7 @@ export const agentBaseSchema = z.object({ hide_sequential_outputs: z.boolean().optional(), artifacts: z.string().optional(), recursion_limit: z.number().optional(), + vision: z.boolean().optional(), conversation_starters: z.array(z.string()).optional(), tool_resources: agentToolResourcesSchema, support_contact: agentSupportContactSchema, diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 45c964cbd8a2..6709ff3f5368 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -186,6 +186,7 @@ export enum AgentCapabilities { tools = 'tools', chain = 'chain', ocr = 'ocr', + vision = 'vision', } export const defaultAssistantsVersion = { @@ -268,6 +269,7 @@ export const defaultAgentCapabilities = [ AgentCapabilities.tools, AgentCapabilities.chain, AgentCapabilities.ocr, + AgentCapabilities.vision, ]; export const agentsEndpointSchema = baseEndpointSchema @@ -1283,27 +1285,46 @@ export enum VisionModes { agents = 'agents', } +/** + * Validates whether a model 
supports vision capabilities. + * Checks modelSpecs configuration first, then falls back to hardcoded list. + */ export function validateVisionModel({ model, additionalModels = [], availableModels, + modelSpecs, }: { model: string; additionalModels?: string[]; availableModels?: string[]; -}) { + modelSpecs?: TSpecsConfig; +}): boolean { if (!model) { return false; } + // Exclude known non-vision models if (model.includes('gpt-4-turbo-preview') || model.includes('o1-mini')) { return false; } + // Check if model is in available models list if (availableModels && !availableModels.includes(model)) { return false; } + // Check modelSpecs first if provided; specs without a preset model are skipped, since an empty needle would make includes() match every model + if (modelSpecs?.list) { + const matchingSpec = modelSpecs.list.find( + (spec) => !!spec.preset?.model && model.includes(spec.preset.model), + ); + if (matchingSpec?.vision !== undefined) { + return matchingSpec.vision === true; + } + } + + // Fall back to hardcoded visionModels list return visionModels.concat(additionalModels).some((visionModel) => model.includes(visionModel)); } diff --git a/packages/data-provider/src/models.ts b/packages/data-provider/src/models.ts index 3c3c19766015..7b0cc3483fdb 100644 --- a/packages/data-provider/src/models.ts +++ b/packages/data-provider/src/models.ts @@ -35,6 +35,7 @@ export type TModelSpec = { webSearch?: boolean; fileSearch?: boolean; executeCode?: boolean; + vision?: boolean; mcpServers?: string[]; }; @@ -54,6 +55,7 @@ export const tModelSpecSchema = z.object({ webSearch: z.boolean().optional(), fileSearch: z.boolean().optional(), executeCode: z.boolean().optional(), + vision: z.boolean().optional(), mcpServers: z.array(z.string()).optional(), }); diff --git a/packages/data-provider/src/types/assistants.ts b/packages/data-provider/src/types/assistants.ts index da773071e777..511543c35f19 100644 --- a/packages/data-provider/src/types/assistants.ts +++ b/packages/data-provider/src/types/assistants.ts @@ -236,6 +236,7 @@ export type Agent = { end_after_tools?: 
boolean; hide_sequential_outputs?: boolean; artifacts?: ArtifactModes; + vision?: boolean; recursion_limit?: number; isPublic?: boolean; version?: number;