Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dimos/models/vl/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@ def create(name: VlModelName) -> VlModel:
from dimos.models.vl.moondream import MoondreamVlModel

return MoondreamVlModel()
case "litellm":
from dimos.models.vl.litellm import LiteLLMVlModel

return LiteLLMVlModel()
146 changes: 146 additions & 0 deletions dimos/models/vl/litellm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Any

import numpy as np

from dimos.models.vl.base import VlModel, VlModelConfig
from dimos.msgs.sensor_msgs.Image import Image
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class LiteLLMVlModelConfig(VlModelConfig):
    """Settings for the vision-language model that talks to LiteLLM.

    LiteLLM puts one API in front of 100+ LLM providers (OpenAI, Anthropic,
    Google, Azure, Bedrock, etc.), so a single configuration shape covers
    all of them.
    """

    # Model identifier sent as the ``model`` argument of each completion call.
    model_name: str = "gpt-4o-mini"
    # Explicit API key; when left unset, no ``api_key`` argument is sent.
    api_key: str | None = None
    # Explicit endpoint URL; when left unset, no ``api_base`` argument is sent.
    api_base: str | None = None


class LiteLLMVlModel(VlModel):
"""Vision-language model backed by LiteLLM.

Supports any vision-capable model available through LiteLLM, including
OpenAI GPT-4o, Anthropic Claude, Google Gemini, Azure OpenAI, AWS Bedrock,
and many more through a single unified interface.

See https://docs.litellm.ai/docs/providers for the full list of supported
providers and model identifiers.
"""

config: LiteLLMVlModelConfig

def _completion(self, **kwargs: Any) -> Any:
    """Send one completion request through LiteLLM.

    ``litellm`` is imported lazily, so the dependency is only required when
    this backend is actually used.

    Args:
        **kwargs: Arguments handed to ``litellm.completion`` (``model``,
            ``messages``, ...). ``api_key`` / ``api_base`` are replaced by
            the configured values whenever those are set. ``drop_params``
            defaults to ``True`` but an explicit caller value is respected.

    Returns:
        The object returned by ``litellm.completion``.

    Raises:
        ImportError: If the ``litellm`` package is not installed.
    """
    try:
        import litellm
    except ImportError as e:
        raise ImportError(
            "litellm is required for LiteLLMVlModel. "
            "Install it with: pip install 'dimos[litellm]'"
        ) from e

    # The key is intentionally optional: LiteLLM can resolve provider
    # credentials from environment variables (e.g. OPENAI_API_KEY), and some
    # backends need no key at all, so a missing config value is not an error.
    if self.config.api_key:
        kwargs["api_key"] = self.config.api_key

    if self.config.api_base:
        kwargs["api_base"] = self.config.api_base

    # setdefault instead of a literal ``drop_params=True`` keyword: passing
    # both the literal and a caller-supplied ``drop_params`` would raise
    # "TypeError: got multiple values for keyword argument".
    kwargs.setdefault("drop_params", True)
    return litellm.completion(**kwargs)

def query(
    self,
    image: Image | np.ndarray,
    query: str,
    response_format: dict[str, Any] | None = None,
    **kwargs: Any,
) -> str:
    """Ask the model a question about a single image.

    Args:
        image: The image to send. A numpy array is still accepted but
            triggers a ``DeprecationWarning`` and is converted with
            ``Image.from_numpy``.
        query: The text prompt sent alongside the image.
        response_format: Optional ``response_format`` value for the
            completion call; omitted from the request when falsy.
        **kwargs: Extra completion arguments (e.g. ``temperature``,
            ``max_tokens``) forwarded to the completion call. ``model`` and
            ``messages`` are always set by this method and cannot be
            overridden through ``kwargs``.

    Returns:
        The text content of the first choice, or ``""`` when the content
        is empty or ``None``.
    """
    if isinstance(image, np.ndarray):
        import warnings

        warnings.warn(
            "LiteLLMVlModel.query should receive standard dimos Image type, not a numpy array",
            DeprecationWarning,
            stacklevel=2,
        )
        image = Image.from_numpy(image)

    image, _ = self._prepare_image(image)
    img_base64 = image.to_base64()

    # Caller kwargs go first so the required fields below win on a key clash;
    # previously kwargs were accepted and then silently discarded.
    api_kwargs: dict[str, Any] = {
        **kwargs,
        "model": self.config.model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # NOTE(review): assumes to_base64() yields JPEG data — confirm.
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                    {"type": "text", "text": query},
                ],
            }
        ],
    }

    if response_format:
        api_kwargs["response_format"] = response_format

    response = self._completion(**api_kwargs)
    return response.choices[0].message.content or ""
Comment on lines +106 to +110
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 **kwargs from query() and query_batch() are silently dropped

Both methods accept **kwargs but never forward them to _completion. Any extra kwargs a caller passes (temperature, max_tokens, stream, provider-specific flags) are silently discarded. Adding api_kwargs.update(kwargs) before the _completion call preserves caller intent and matches LiteLLM's flexibility.

Suggested change
if response_format:
api_kwargs["response_format"] = response_format
response = self._completion(**api_kwargs)
return response.choices[0].message.content or ""
if response_format:
api_kwargs["response_format"] = response_format
api_kwargs.update(kwargs)
response = self._completion(**api_kwargs)
return response.choices[0].message.content or ""


def query_batch(
    self,
    images: list[Image],
    query: str,
    response_format: dict[str, Any] | None = None,
    **kwargs: Any,
) -> list[str]:
    """Query the VLM about several images using a single API call.

    All images are packed into one user message followed by the text
    prompt, and the single response text is repeated once per input image
    so the result has the same length as ``images``.

    NOTE(review): because only one request is made, every entry of the
    result is the same string, and the request relies on the selected
    model accepting multiple images in one message — confirm this matches
    what callers expect from a batch query.

    Args:
        images: Images to send; an empty list returns ``[]`` without
            making a request.
        query: The text prompt appended after the images.
        response_format: Optional ``response_format`` value for the
            completion call; omitted from the request when falsy.
        **kwargs: Extra completion arguments (e.g. ``temperature``,
            ``max_tokens``) forwarded to the completion call. ``model`` and
            ``messages`` are always set by this method and cannot be
            overridden through ``kwargs``.

    Returns:
        A list with ``len(images)`` copies of the response text (``""``
        when the content is empty or ``None``).
    """
    if not images:
        return []

    content: list[dict[str, Any]] = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{self._prepare_image(img)[0].to_base64()}"
            },
        }
        for img in images
    ]
    content.append({"type": "text", "text": query})

    # Caller kwargs go first so the required fields below win on a key clash;
    # previously kwargs were accepted and then silently discarded.
    api_kwargs: dict[str, Any] = {
        **kwargs,
        "model": self.config.model_name,
        "messages": [{"role": "user", "content": content}],
    }
    if response_format:
        api_kwargs["response_format"] = response_format

    response = self._completion(**api_kwargs)
    response_text = response.choices[0].message.content or ""
    return [response_text] * len(images)
Comment on lines +112 to +143
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 query_batch sends all images in one call regardless of provider support

query_batch packs every image into a single API message and returns [response_text] * len(images). This means (a) providers that don't support multi-image inputs (many Bedrock/Vertex models available through LiteLLM) will throw an exception for any multi-image call that would otherwise succeed per-image, and (b) callers expecting per-image responses always receive the same combined description repeated — silently wrong data.

The base-class fallback (query() per image) is both safer and correct for the per-image contract. The QwenVlModel shares the same design, but LiteLLMVlModel targets a far broader provider surface where single-message multi-image is not universal.


def stop(self) -> None:
    """Do nothing; this method performs no work when called."""
    return None
Loading