Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion dimos/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,19 @@

from dimos.agents.agent import Agent, deploy
from dimos.agents.spec import AgentSpec
from dimos.agents.vlm_agent import VLMAgent
from dimos.agents.vlm_stream_tester import VlmStreamTester
from dimos.protocol.skill.skill import skill
from dimos.protocol.skill.type import Output, Reducer, Stream

__all__ = ["Agent", "AgentSpec", "Output", "Reducer", "Stream", "deploy", "skill"]
__all__ = [
"Agent",
"AgentSpec",
"Output",
"Reducer",
"Stream",
"VLMAgent",
"VlmStreamTester",
"deploy",
"skill",
]
40 changes: 3 additions & 37 deletions dimos/agents/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,16 @@
from typing import Any, TypedDict
import uuid

from langchain.chat_models import init_chat_model
from langchain_core.messages import (
AIMessage,
HumanMessage,
SystemMessage,
ToolCall,
ToolMessage,
)
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

from dimos.agents.ollama_agent import ensure_ollama_model
from dimos.agents.llm_init import build_llm, build_system_message
from dimos.agents.spec import AgentSpec, Model, Provider
from dimos.agents.system_prompt import SYSTEM_PROMPT
from dimos.core import DimosCluster, rpc
from dimos.protocol.skill.coordinator import SkillCoordinator, SkillState, SkillStateDict
from dimos.protocol.skill.skill import SkillContainer
Expand Down Expand Up @@ -175,40 +172,9 @@ def __init__( # type: ignore[no-untyped-def]
self._agent_id = str(uuid.uuid4())
self._agent_stopped = False

if self.config.system_prompt:
if isinstance(self.config.system_prompt, str):
self.system_message = SystemMessage(self.config.system_prompt + SYSTEM_MSG_APPEND)
else:
self.config.system_prompt.content += SYSTEM_MSG_APPEND # type: ignore[operator]
self.system_message = self.config.system_prompt
else:
self.system_message = SystemMessage(SYSTEM_PROMPT + SYSTEM_MSG_APPEND)

self.system_message = build_system_message(self.config, append=SYSTEM_MSG_APPEND)
self.publish(self.system_message)

# Use provided model instance if available, otherwise initialize from config
if self.config.model_instance:
self._llm = self.config.model_instance
else:
# For Ollama provider, ensure the model is available before initializing
if self.config.provider.value.lower() == "ollama":
ensure_ollama_model(self.config.model)

# For HuggingFace, we need to create a pipeline and wrap it in ChatHuggingFace
if self.config.provider.value.lower() == "huggingface":
llm = HuggingFacePipeline.from_model_id(
model_id=self.config.model,
task="text-generation",
pipeline_kwargs={
"max_new_tokens": 512,
"temperature": 0.7,
},
)
self._llm = ChatHuggingFace(llm=llm, model_id=self.config.model)
else:
self._llm = init_chat_model( # type: ignore[call-overload]
model_provider=self.config.provider, model=self.config.model
)
self._llm = build_llm(self.config)

@rpc
def get_agent_id(self) -> str:
Expand Down
62 changes: 62 additions & 0 deletions dimos/agents/llm_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright 2025-2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import cast

from langchain.chat_models import init_chat_model
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import SystemMessage
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

from dimos.agents.ollama_agent import ensure_ollama_model
from dimos.agents.spec import AgentConfig
from dimos.agents.system_prompt import SYSTEM_PROMPT


def build_llm(config: AgentConfig) -> BaseChatModel:
    """Return the chat model for *config*, preferring an explicit instance.

    Resolution order: a pre-built ``model_instance`` wins outright;
    HuggingFace models are wrapped in a local text-generation pipeline;
    every other provider is handed to LangChain's ``init_chat_model``.
    For Ollama the model is pulled first so initialization does not fail
    on a missing local model.
    """
    if config.model_instance:
        return config.model_instance

    provider = config.provider.value.lower()

    # Make sure the model exists locally before LangChain tries to use it.
    if provider == "ollama":
        ensure_ollama_model(config.model)

    if provider == "huggingface":
        # HuggingFace has no direct chat-model entry point: build a local
        # text-generation pipeline and wrap it in the chat adapter.
        pipeline = HuggingFacePipeline.from_model_id(
            model_id=config.model,
            task="text-generation",
            pipeline_kwargs={
                "max_new_tokens": 512,
                "temperature": 0.7,
            },
        )
        return ChatHuggingFace(llm=pipeline, model_id=config.model)

    model = init_chat_model(  # type: ignore[call-overload]
        model_provider=config.provider,
        model=config.model,
    )
    return cast("BaseChatModel", model)


def build_system_message(config: AgentConfig, *, append: str = "") -> SystemMessage:
    """Build the agent's system message, optionally appending *append*.

    A string prompt is wrapped in a fresh ``SystemMessage``. A message-typed
    prompt is copied before the suffix is added, so the config object is never
    mutated — the previous in-place ``+=`` wrote the suffix onto the shared
    config, making repeated calls accumulate it. Falls back to the
    package-wide ``SYSTEM_PROMPT`` when no prompt is configured.
    """
    if config.system_prompt:
        if isinstance(config.system_prompt, str):
            return SystemMessage(config.system_prompt + append)
        if append:
            # Copy instead of mutating config.system_prompt in place: the old
            # code appended to the shared object, so calling this twice
            # duplicated the suffix on the config.
            message = config.system_prompt.model_copy(deep=True)
            message.content += append  # type: ignore[operator]
            return message
        return config.system_prompt

    return SystemMessage(SYSTEM_PROMPT + append)
120 changes: 120 additions & 0 deletions dimos/agents/vlm_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Copyright 2025-2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

from dimos.agents.llm_init import build_llm, build_system_message
from dimos.agents.spec import AgentSpec, AnyMessage
from dimos.core import rpc
from dimos.core.stream import In, Out
from dimos.msgs.sensor_msgs import Image
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class VLMAgent(AgentSpec):
    """Stream-first agent for vision queries with optional RPC access.

    Subscribes to an image stream and a query stream; each query is answered
    against the most recently received frame and published on
    ``answer_stream``. The RPC entry points (``query``, ``query_image``)
    allow one-off calls without the streams.
    """

    color_image: In[Image]  # camera frames; only the newest one is kept
    query_stream: In[HumanMessage]  # incoming user questions
    answer_stream: Out[AIMessage]  # model answers are published here

    def __init__(self, *args, **kwargs) -> None:  # type: ignore[no-untyped-def]
        """Build the LLM from config and publish the system message once."""
        super().__init__(*args, **kwargs)
        self._llm = build_llm(self.config)
        # Most recent frame from color_image; None until a first image arrives.
        self._latest_image: Image | None = None
        # Conversation turns (queries and answers), excluding the system message.
        self._history: list[AIMessage | HumanMessage] = []
        self._system_message = build_system_message(self.config)
        self.publish(self._system_message)

@rpc
def start(self) -> None:
super().start()
self._disposables.add(self.color_image.subscribe(self._on_image)) # type: ignore[arg-type]
self._disposables.add(self.query_stream.subscribe(self._on_query)) # type: ignore[arg-type]

    @rpc
    def stop(self) -> None:
        """Stop the agent; the base class tears down the subscriptions."""
        super().stop()

    def _on_image(self, image: Image) -> None:
        # Cache only the newest frame; queries always answer against the latest.
        self._latest_image = image

def _on_query(self, msg: HumanMessage) -> None:
if not self._latest_image:
self.answer_stream.publish(AIMessage(content="No image available yet."))
return

query_text = self._extract_text(msg)
response = self._invoke_image(self._latest_image, query_text)
self.answer_stream.publish(response)

def _extract_text(self, msg: HumanMessage) -> str:
content = msg.content
if isinstance(content, str):
return content
if isinstance(content, list):
for part in content:
if isinstance(part, dict) and part.get("type") == "text":
return str(part.get("text", ""))
return str(content)

def _invoke(self, msg: HumanMessage) -> AIMessage:
messages = [self._system_message, msg]
response = self._llm.invoke(messages)
self.append_history([msg, response]) # type: ignore[arg-type]
return response # type: ignore[return-value]

def _invoke_image(self, image: Image, query: str) -> AIMessage:
content = [{"type": "text", "text": query}, *image.agent_encode()]
return self._invoke(HumanMessage(content=content))

    @rpc
    def clear_history(self):  # type: ignore[no-untyped-def]
        """Drop all recorded conversation turns (the system message is kept)."""
        self._history.clear()

def append_history(self, *msgs: list[AIMessage | HumanMessage]) -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

syntax: Parameter type annotation incorrect - *msgs expects individual messages, not a list. The annotation should be *msgs: AIMessage | HumanMessage instead of *msgs: list[AIMessage | HumanMessage]

Suggested change
def append_history(self, *msgs: list[AIMessage | HumanMessage]) -> None:
def append_history(self, *msgs: AIMessage | HumanMessage) -> None:

for msg_list in msgs:
for msg in msg_list:
self.publish(msg) # type: ignore[arg-type]
self._history.extend(msg_list)

def history(self) -> list[AnyMessage]:
return [self._system_message, *self._history]

    @rpc
    def register_skills(  # type: ignore[no-untyped-def]
        self, container, run_implicit_name: str | None = None
    ) -> None:
        """Intentional no-op: VLMAgent does not run a skill coordinator.

        Kept so callers that register skills uniformly across agents do not
        fail; a warning is logged so the dropped registration stays visible.
        """
        logger.warning(
            "VLMAgent does not manage skills; register_skills is a no-op",
            container=str(container),
            run_implicit_name=run_implicit_name,
        )
Comment on lines +98 to +105
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, so VLMAgent isn't a replacement for Agent? In that case would we be running two agent loops?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VLMAgent in this case is meant to be run one-off, yeah. But I guess we could also change VLMAgent to inherit from Agent instead of AgentSpec; then it would run an agent loop / skill coordinator.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah so we need to do this because agent runs its own agent loop but listens to /skill. We haven't solved multi-agent yet. So if VLMAgent also inherited from Agent it would get bogged down by tool responses in parallel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Soon we will fix this so that each agent also has its own skill-coordinator topics; then we can run five agents that don't clash.


@rpc
def query(self, query: str): # type: ignore[no-untyped-def]
response = self._invoke(HumanMessage(query))
return response.content

@rpc
def query_image(self, image: Image, query: str): # type: ignore[no-untyped-def]
response = self._invoke_image(image, query)
return response.content


vlm_agent = VLMAgent.blueprint

__all__ = ["VLMAgent", "vlm_agent"]
Loading
Loading