Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dimos/models/vl/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@ def create(name: VlModelName) -> VlModel:
from dimos.models.vl.moondream import MoondreamVlModel

return MoondreamVlModel()
case "litellm":
from dimos.models.vl.litellm import LiteLLMVlModel

return LiteLLMVlModel()
146 changes: 146 additions & 0 deletions dimos/models/vl/litellm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Any

import numpy as np

from dimos.models.vl.base import VlModel, VlModelConfig
from dimos.msgs.sensor_msgs.Image import Image
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class LiteLLMVlModelConfig(VlModelConfig):
    """Configuration for LiteLLM-backed vision-language model.

    LiteLLM provides a unified interface to 100+ LLM providers (OpenAI, Anthropic,
    Google, Azure, Bedrock, etc.) through a single API.
    """

    # Model identifier sent as the ``model`` argument of every completion request.
    model_name: str = "gpt-4o-mini"
    # Optional API key; added to the completion call only when set (truthy).
    # NOTE(review): when unset, LiteLLM presumably falls back to the provider's
    # environment variables -- confirm against the LiteLLM docs.
    api_key: str | None = None
    # Optional endpoint override; added to the completion call only when set (truthy).
    api_base: str | None = None


class LiteLLMVlModel(VlModel):
"""Vision-language model backed by LiteLLM.

Supports any vision-capable model available through LiteLLM, including
OpenAI GPT-4o, Anthropic Claude, Google Gemini, Azure OpenAI, AWS Bedrock,
and many more through a single unified interface.

See https://docs.litellm.ai/docs/providers for the full list of supported
providers and model identifiers.
"""

config: LiteLLMVlModelConfig

def _completion(self, **kwargs: Any) -> Any:
try:
import litellm
except ImportError as e:
raise ImportError(
"litellm is required for LiteLLMVlModel. "
"Install it with: pip install 'dimos[litellm]'"
) from e

if self.config.api_key:
kwargs["api_key"] = self.config.api_key
Comment on lines +62 to +63
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does LiteLLM work without an api_key? I assume not. So instead of conditionally adding api_key to kwargs, it should throw an error if api_key is missing.


if self.config.api_base:
kwargs["api_base"] = self.config.api_base

return litellm.completion(drop_params=True, **kwargs)

def query(
    self,
    image: Image | np.ndarray,
    query: str,
    response_format: dict[str, Any] | None = None,
    **kwargs: Any,
) -> str:
    """Ask the configured model a question about a single image.

    The image is run through ``self._prepare_image``, base64-encoded and sent
    together with ``query`` as one user message via ``self._completion``.

    Args:
        image: Image to ask about. A numpy array is still accepted but
            deprecated; it is converted with ``Image.from_numpy``.
        query: Text prompt sent alongside the image.
        response_format: Optional ``response_format`` value for the
            completion call; left out of the request when falsy.
        **kwargs: Extra completion parameters (for example ``temperature``
            or ``max_tokens``) forwarded to ``self._completion``. They are
            applied last, so they override the values built here.

    Returns:
        The text content of the first choice, or ``""`` when that content
        is empty or ``None``.
    """
    if isinstance(image, np.ndarray):
        import warnings

        warnings.warn(
            "LiteLLMVlModel.query should receive standard dimos Image type, not a numpy array",
            DeprecationWarning,
            stacklevel=2,
        )
        image = Image.from_numpy(image)

    image, _ = self._prepare_image(image)
    # The data URL below declares JPEG; assumes to_base64() yields JPEG
    # bytes -- TODO confirm against Image.to_base64.
    img_base64 = image.to_base64()

    api_kwargs: dict[str, Any] = {
        "model": self.config.model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                    {"type": "text", "text": query},
                ],
            }
        ],
    }

    if response_format:
        api_kwargs["response_format"] = response_format

    # Forward caller-supplied options instead of silently discarding them.
    api_kwargs.update(kwargs)

    response = self._completion(**api_kwargs)
    return response.choices[0].message.content or ""
Comment on lines +106 to +110
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 **kwargs from query() and query_batch() are silently dropped

Both methods accept **kwargs but never forward them to _completion. Any extra kwargs a caller passes (temperature, max_tokens, stream, provider-specific flags) are silently discarded. Adding api_kwargs.update(kwargs) before the _completion call preserves caller intent and matches LiteLLM's flexibility.

Suggested change
- if response_format:
-     api_kwargs["response_format"] = response_format
- response = self._completion(**api_kwargs)
- return response.choices[0].message.content or ""
+ if response_format:
+     api_kwargs["response_format"] = response_format
+ api_kwargs.update(kwargs)
+ response = self._completion(**api_kwargs)
+ return response.choices[0].message.content or ""


def query_batch(
    self,
    images: list[Image],
    query: str,
    response_format: dict[str, Any] | None = None,
    **kwargs: Any,
) -> list[str]:
    """Query VLM with multiple images using a single API call.

    Every image is run through ``self._prepare_image``, base64-encoded and
    packed, followed by ``query``, into one user message.

    Args:
        images: Images to send. An empty list returns ``[]`` without making
            a request.
        query: Text prompt appended after the images.
        response_format: Optional ``response_format`` value for the
            completion call; left out of the request when falsy.
        **kwargs: Extra completion parameters (for example ``temperature``
            or ``max_tokens``) forwarded to ``self._completion``. They are
            applied last, so they override the values built here.

    Returns:
        A list with one entry per input image. Because only one request is
        made, every entry is the same response text (``""`` when the content
        is empty or ``None``).
    """
    if not images:
        return []

    # NOTE(review): all images travel in one message and the single answer
    # is repeated per image. Providers without multi-image support may
    # reject this, and callers expecting per-image answers will not get
    # them -- confirm against the VlModel.query_batch contract.
    content: list[dict[str, Any]] = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{self._prepare_image(img)[0].to_base64()}"
            },
        }
        for img in images
    ]
    content.append({"type": "text", "text": query})

    api_kwargs: dict[str, Any] = {
        "model": self.config.model_name,
        "messages": [{"role": "user", "content": content}],
    }
    if response_format:
        api_kwargs["response_format"] = response_format

    # Forward caller-supplied options instead of silently discarding them.
    api_kwargs.update(kwargs)

    response = self._completion(**api_kwargs)
    response_text = response.choices[0].message.content or ""
    return [response_text] * len(images)
Comment on lines +112 to +143
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 query_batch sends all images in one call regardless of provider support

query_batch packs every image into a single API message and returns [response_text] * len(images). This means (a) providers that don't support multi-image inputs (many Bedrock/Vertex models available through LiteLLM) will throw an exception for any multi-image call that would otherwise succeed per-image, and (b) callers expecting per-image responses always receive the same combined description repeated — silently wrong data.

The base-class fallback (query() per image) is both safer and correct for the per-image contract. The QwenVlModel shares the same design, but LiteLLMVlModel targets a far broader provider surface where single-message multi-image is not universal.


def stop(self) -> None:
    """No-op: this implementation performs no shutdown work."""
    return None
Loading
Loading