-
Notifications
You must be signed in to change notification settings - Fork 159
class VLMAgent(AgentSpec, Module) for streamed VLM queries over Transport #960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1ec549c
5c6ee9f
1486f48
7ef72b7
a704e20
671c022
8f21f22
e70b8a1
f2502f8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| # Copyright 2025-2026 Dimensional Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from typing import cast | ||
|
|
||
| from langchain.chat_models import init_chat_model | ||
| from langchain_core.language_models.chat_models import BaseChatModel | ||
| from langchain_core.messages import SystemMessage | ||
| from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline | ||
|
|
||
| from dimos.agents.ollama_agent import ensure_ollama_model | ||
| from dimos.agents.spec import AgentConfig | ||
| from dimos.agents.system_prompt import SYSTEM_PROMPT | ||
|
|
||
|
|
||
def build_llm(config: AgentConfig) -> BaseChatModel:
    """Construct the chat model described by *config*.

    Resolution order:
      1. If ``config.model_instance`` is set, it is returned as-is (caller
         supplied a ready-made model).
      2. For the ``ollama`` provider, the model is pulled/verified locally
         first via ``ensure_ollama_model``, then built through
         ``init_chat_model`` like any other provider.
      3. For the ``huggingface`` provider, a local text-generation pipeline
         is wrapped in ``ChatHuggingFace``.
      4. Otherwise, defer to LangChain's ``init_chat_model`` factory.

    Args:
        config: Agent configuration carrying provider, model id, and an
            optional pre-built model instance.

    Returns:
        A ``BaseChatModel`` ready for ``.invoke()``.
    """
    if config.model_instance:
        return config.model_instance

    # Normalize once; provider is an enum whose value may vary in case.
    provider = config.provider.value.lower()

    if provider == "ollama":
        # Make sure the model is available locally before LangChain tries
        # to talk to the Ollama server.
        ensure_ollama_model(config.model)

    if provider == "huggingface":
        llm = HuggingFacePipeline.from_model_id(
            model_id=config.model,
            task="text-generation",
            pipeline_kwargs={
                "max_new_tokens": 512,
                "temperature": 0.7,
            },
        )
        return ChatHuggingFace(llm=llm, model_id=config.model)

    return cast(
        "BaseChatModel",
        init_chat_model(  # type: ignore[call-overload]
            model_provider=config.provider,
            model=config.model,
        ),
    )
|
|
||
|
|
||
def build_system_message(config: AgentConfig, *, append: str = "") -> SystemMessage:
    """Build the agent's system message, optionally appending extra text.

    Uses ``config.system_prompt`` when provided (either a plain string or a
    pre-built message object), otherwise falls back to the package-level
    ``SYSTEM_PROMPT``.

    Args:
        config: Agent configuration; ``system_prompt`` may be a ``str``, a
            message object, or unset/falsy.
        append: Optional suffix appended to the prompt text.

    Returns:
        A ``SystemMessage``. Never mutates ``config.system_prompt`` — the
        previous implementation did ``config.system_prompt.content += append``
        in place, so calling this twice duplicated the suffix on the shared
        config object.
    """
    if config.system_prompt:
        if isinstance(config.system_prompt, str):
            return SystemMessage(config.system_prompt + append)
        if append:
            # Build a fresh message instead of mutating the config's prompt.
            return SystemMessage(config.system_prompt.content + append)  # type: ignore[operator]
        return config.system_prompt

    return SystemMessage(SYSTEM_PROMPT + append)
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,120 @@ | ||||||
| # Copyright 2025-2026 Dimensional Inc. | ||||||
| # | ||||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
| # you may not use this file except in compliance with the License. | ||||||
| # You may obtain a copy of the License at | ||||||
| # | ||||||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||||||
| # | ||||||
| # Unless required by applicable law or agreed to in writing, software | ||||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
| # See the License for the specific language governing permissions and | ||||||
| # limitations under the License. | ||||||
|
|
||||||
| from langchain_core.messages import AIMessage, HumanMessage, SystemMessage | ||||||
|
|
||||||
| from dimos.agents.llm_init import build_llm, build_system_message | ||||||
| from dimos.agents.spec import AgentSpec, AnyMessage | ||||||
| from dimos.core import rpc | ||||||
| from dimos.core.stream import In, Out | ||||||
| from dimos.msgs.sensor_msgs import Image | ||||||
| from dimos.utils.logging_config import setup_logger | ||||||
|
|
||||||
# Module-level logger configured by the project's logging setup helper.
logger = setup_logger()
|
|
||||||
|
|
||||||
class VLMAgent(AgentSpec):
    """Stream-first agent for vision queries with optional RPC access.

    Subscribes to an image stream and a query stream; each query is answered
    against the most recently seen image and the reply is published on
    ``answer_stream``. Synchronous one-off access is also available through
    the ``query`` / ``query_image`` RPCs.
    """

    # Latest camera frame; only the most recent frame is kept (see _on_image).
    color_image: In[Image]
    # Incoming user questions about the current image.
    query_stream: In[HumanMessage]
    # Model replies, one per query.
    answer_stream: Out[AIMessage]

    def __init__(self, *args, **kwargs) -> None:  # type: ignore[no-untyped-def]
        super().__init__(*args, **kwargs)
        # Chat model resolved from the agent config (provider/model id).
        self._llm = build_llm(self.config)
        # Most recent frame from color_image; None until the first frame arrives.
        self._latest_image: Image | None = None
        # Conversation history, excluding the system message (see history()).
        self._history: list[AIMessage | HumanMessage] = []
        self._system_message = build_system_message(self.config)
        # Publish the system message immediately at construction time.
        # NOTE(review): this happens before start() — presumably so the
        # system prompt is visible on the history topic; confirm intent.
        self.publish(self._system_message)

    @rpc
    def start(self) -> None:
        """Start the agent and wire up the input-stream subscriptions."""
        super().start()
        # Subscriptions are tracked in _disposables so stop()/teardown in the
        # base class can release them.
        self._disposables.add(self.color_image.subscribe(self._on_image))  # type: ignore[arg-type]
        self._disposables.add(self.query_stream.subscribe(self._on_query))  # type: ignore[arg-type]

    @rpc
    def stop(self) -> None:
        """Stop the agent (base class handles subscription disposal)."""
        super().stop()

    def _on_image(self, image: Image) -> None:
        # Keep only the latest frame; older frames are dropped.
        self._latest_image = image

    def _on_query(self, msg: HumanMessage) -> None:
        # Queries arriving before any image has been seen get a canned reply
        # rather than an error.
        if not self._latest_image:
            self.answer_stream.publish(AIMessage(content="No image available yet."))
            return

        query_text = self._extract_text(msg)
        response = self._invoke_image(self._latest_image, query_text)
        self.answer_stream.publish(response)

    def _extract_text(self, msg: HumanMessage) -> str:
        """Pull the plain-text portion out of a (possibly multimodal) message.

        Returns the first ``{"type": "text"}`` part for list-style content,
        the string itself for plain-string content, and ``str(content)`` as a
        last-resort fallback.
        """
        content = msg.content
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            for part in content:
                if isinstance(part, dict) and part.get("type") == "text":
                    return str(part.get("text", ""))
        return str(content)

    def _invoke(self, msg: HumanMessage) -> AIMessage:
        """Send *msg* (plus the system message) to the LLM and record history.

        Note: only the system message and the current message are sent —
        prior history is recorded but not replayed to the model.
        """
        messages = [self._system_message, msg]
        response = self._llm.invoke(messages)
        self.append_history([msg, response])  # type: ignore[arg-type]
        return response  # type: ignore[return-value]

    def _invoke_image(self, image: Image, query: str) -> AIMessage:
        """Build a multimodal message (text + encoded image) and invoke the LLM."""
        content = [{"type": "text", "text": query}, *image.agent_encode()]
        return self._invoke(HumanMessage(content=content))

    @rpc
    def clear_history(self):  # type: ignore[no-untyped-def]
        """Drop all recorded conversation history (system message is kept)."""
        self._history.clear()

    def append_history(self, *msgs: list[AIMessage | HumanMessage]) -> None:
        """Publish and record one or more lists of messages.

        Each positional argument is a list of messages; every message is
        published individually, then the whole list is appended to history.
        """
        for msg_list in msgs:
            for msg in msg_list:
                self.publish(msg)  # type: ignore[arg-type]
            self._history.extend(msg_list)

    def history(self) -> list[AnyMessage]:
        """Return the full conversation: system message followed by history."""
        return [self._system_message, *self._history]

    @rpc
    def register_skills(  # type: ignore[no-untyped-def]
        self, container, run_implicit_name: str | None = None
    ) -> None:
        # Intentional no-op: VLMAgent is run one-off and does not own a skill
        # coordinator (per PR discussion, multi-agent skill topics are not yet
        # separated, so inheriting Agent's loop would clash with tool replies).
        logger.warning(
            "VLMAgent does not manage skills; register_skills is a no-op",
            container=str(container),
            run_implicit_name=run_implicit_name,
        )

    @rpc
    def query(self, query: str):  # type: ignore[no-untyped-def]
        """Text-only RPC: ask the LLM directly, returning the reply content."""
        response = self._invoke(HumanMessage(query))
        return response.content

    @rpc
    def query_image(self, image: Image, query: str):  # type: ignore[no-untyped-def]
        """RPC: ask the LLM about a caller-supplied image (not the stream's)."""
        response = self._invoke_image(image, query)
        return response.content
|
|
||||||
|
|
||||||
# Blueprint handle for wiring a VLMAgent into a launch/graph configuration.
vlm_agent = VLMAgent.blueprint

__all__ = ["VLMAgent", "vlm_agent"]
Uh oh!
There was an error while loading. Please reload this page.