Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7e99d3f
temporal memory + vlm agent + blueprints
ClaireBookworm Jan 9, 2026
0076db9
fixing module issue and style
ClaireBookworm Jan 9, 2026
6a68b5b
fix skill registration
paul-nechifor Jan 10, 2026
56fd322
removing state functions unpickable
ClaireBookworm Jan 10, 2026
0c6111b
Merge branch 'dev-memory-clean' of https://github.com/dimensionalOS/d…
ClaireBookworm Jan 10, 2026
5ef91c2
inheritancefixes and memory management
ClaireBookworm Jan 11, 2026
20bf28e
docstring for query
ClaireBookworm Jan 11, 2026
b76d801
microcommit: fixing memory buffer
ClaireBookworm Jan 11, 2026
2184a34
sharpness filter and simplified frame filtering
spomichter Jan 13, 2026
42fd629
CI code cleanup
spomichter Jan 13, 2026
5f1116b
initial graph database implementation
ClaireBookworm Jan 13, 2026
3a70039
db implementation, working and stylized, best reply is unitree_go2_of…
ClaireBookworm Jan 14, 2026
581471f
type checking issues
ClaireBookworm Jan 14, 2026
79cc039
Merge branch 'dev' of https://github.com/dimensionalOS/dimos into dev…
ClaireBookworm Jan 14, 2026
e0112e2
final edits, move into experimental, revert non-memory code edits, ty…
ClaireBookworm Jan 14, 2026
7003288
persistent db flag enabled in config
ClaireBookworm Jan 15, 2026
fef34b7
Fix test to not run in CI due to LFS pull
spomichter Jan 15, 2026
7347693
Fix CLIP filter to use dimensional clip
spomichter Jan 15, 2026
52c2fd8
Add path to temporal memory
spomichter Jan 15, 2026
0aa8545
revert video operators
spomichter Jan 15, 2026
6a01797
Revert moondream
spomichter Jan 15, 2026
6b2175d
added temporal memory docs
spomichter Jan 15, 2026
0d15655
Refactor move to /experimental/temporal_memory
spomichter Jan 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 24 additions & 15 deletions dimos/agents/vlm_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from langchain_core.messages import AIMessage, HumanMessage
from typing import Any

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

from dimos.agents.llm_init import build_llm, build_system_message
from dimos.agents.spec import AgentSpec, AnyMessage
Expand All @@ -31,7 +33,7 @@ class VLMAgent(AgentSpec):
query_stream: In[HumanMessage]
answer_stream: Out[AIMessage]

def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def]
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._llm = build_llm(self.config)
self._latest_image: Image | None = None
Expand Down Expand Up @@ -71,18 +73,23 @@ def _extract_text(self, msg: HumanMessage) -> str:
return str(part.get("text", ""))
return str(content)

def _invoke(self, msg: HumanMessage) -> AIMessage:
def _invoke(self, msg: HumanMessage, **kwargs: Any) -> AIMessage:
messages = [self._system_message, msg]
response = self._llm.invoke(messages)
response = self._llm.invoke(messages, **kwargs)
self.append_history([msg, response]) # type: ignore[arg-type]
return response # type: ignore[return-value]

def _invoke_image(self, image: Image, query: str) -> AIMessage:
def _invoke_image(
self, image: Image, query: str, response_format: dict[str, Any] | None = None
) -> AIMessage:
content = [{"type": "text", "text": query}, *image.agent_encode()]
return self._invoke(HumanMessage(content=content))
kwargs: dict[str, Any] = {}
if response_format:
kwargs["response_format"] = response_format
return self._invoke(HumanMessage(content=content), **kwargs)

@rpc
def clear_history(self): # type: ignore[no-untyped-def]
def clear_history(self) -> None:
self._history.clear()

def append_history(self, *msgs: list[AIMessage | HumanMessage]) -> None:
Expand All @@ -95,24 +102,26 @@ def history(self) -> list[AnyMessage]:
return [self._system_message, *self._history]

@rpc
def register_skills( # type: ignore[no-untyped-def]
self, container, run_implicit_name: str | None = None
) -> None:
def register_skills(self, container: Any, run_implicit_name: str | None = None) -> None:
logger.warning(
"VLMAgent does not manage skills; register_skills is a no-op",
container=str(container),
run_implicit_name=run_implicit_name,
)

@rpc
def query(self, query: str): # type: ignore[no-untyped-def]
def query(self, query: str) -> str:
response = self._invoke(HumanMessage(query))
return response.content
content = response.content
return content if isinstance(content, str) else str(content)

@rpc
def query_image(self, image: Image, query: str): # type: ignore[no-untyped-def]
response = self._invoke_image(image, query)
return response.content
def query_image(
self, image: Image, query: str, response_format: dict[str, Any] | None = None
) -> str:
response = self._invoke_image(image, query, response_format=response_format)
content = response.content
return content if isinstance(content, str) else str(content)


vlm_agent = VLMAgent.blueprint
Expand Down
2 changes: 2 additions & 0 deletions dimos/models/vl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
from dimos.models.vl.florence import Florence2Model
from dimos.models.vl.moondream import MoondreamVlModel
from dimos.models.vl.moondream_hosted import MoondreamHostedVlModel
from dimos.models.vl.openai import OpenAIVlModel
from dimos.models.vl.qwen import QwenVlModel

__all__ = [
"Captioner",
"Florence2Model",
"MoondreamHostedVlModel",
"MoondreamVlModel",
"OpenAIVlModel",
"QwenVlModel",
"VlModel",
]
106 changes: 106 additions & 0 deletions dimos/models/vl/openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from dataclasses import dataclass
from functools import cached_property
import os
from typing import Any

import numpy as np
from openai import OpenAI

from dimos.models.vl.base import VlModel, VlModelConfig
from dimos.msgs.sensor_msgs import Image
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


@dataclass
class OpenAIVlModelConfig(VlModelConfig):
    """Configuration for OpenAIVlModel."""

    # OpenAI chat model name used for vision queries.
    model_name: str = "gpt-4o-mini"
    # Explicit API key; when None, the OPENAI_API_KEY environment variable is used instead.
    api_key: str | None = None


class OpenAIVlModel(VlModel):
    """Vision-language model backed by the OpenAI Chat Completions API.

    Images are base64-encoded as PNG data URLs and sent alongside the text
    prompt to the configured ``model_name`` (default ``gpt-4o-mini``).
    """

    default_config = OpenAIVlModelConfig
    config: OpenAIVlModelConfig

    @cached_property
    def _client(self) -> OpenAI:
        """Build the OpenAI client lazily, resolving the API key once.

        Raises:
            ValueError: if no key is configured and OPENAI_API_KEY is unset.
        """
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "OpenAI API key must be provided or set in OPENAI_API_KEY environment variable"
            )

        return OpenAI(api_key=api_key)

    def query(
        self,
        image: Image | np.ndarray,
        query: str,
        response_format: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> str:  # type: ignore[override, type-arg]
        """Query the VLM with a single image and a text prompt.

        Args:
            image: Input image; numpy arrays are accepted but deprecated.
            query: Text prompt sent with the image.
            response_format: Optional OpenAI ``response_format`` payload
                (e.g. for JSON-mode structured output).
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            The model's text answer, or "" if the API returns no content
            (e.g. a refusal) — matching ``query_batch``'s behavior.
        """
        if isinstance(image, np.ndarray):
            import warnings

            warnings.warn(
                "OpenAIVlModel.query should receive standard dimos Image type, not a numpy array",
                DeprecationWarning,
                stacklevel=2,
            )

            image = Image.from_numpy(image)

        # Apply auto_resize if configured
        image, _ = self._prepare_image(image)

        img_base64 = image.to_base64()

        api_kwargs: dict[str, Any] = {
            "model": self.config.model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        },
                        {"type": "text", "text": query},
                    ],
                }
            ],
        }

        if response_format:
            api_kwargs["response_format"] = response_format

        response = self._client.chat.completions.create(**api_kwargs)

        # message.content can be None (e.g. refusals); normalize to "" for a
        # stable str return instead of masking it with a type: ignore.
        return response.choices[0].message.content or ""

    def query_batch(
        self, images: list[Image], query: str, response_format: dict[str, Any] | None = None, **kwargs: Any
    ) -> list[str]:  # type: ignore[override]
        """Query VLM with multiple images using a single API call.

        All images go into one user message; the single answer is duplicated
        so callers receive exactly one entry per input image.
        """
        if not images:
            return []

        content: list[dict[str, Any]] = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{self._prepare_image(img)[0].to_base64()}"},
            }
            for img in images
        ]
        content.append({"type": "text", "text": query})

        messages = [{"role": "user", "content": content}]
        api_kwargs: dict[str, Any] = {"model": self.config.model_name, "messages": messages}
        if response_format:
            api_kwargs["response_format"] = response_format

        response = self._client.chat.completions.create(**api_kwargs)
        response_text = response.choices[0].message.content or ""
        # Return one response per image (same response since API analyzes all images together)
        return [response_text] * len(images)

    def stop(self) -> None:
        """Release the OpenAI client so a fresh one is built on next use."""
        if "_client" in self.__dict__:
            del self.__dict__["_client"]

27 changes: 27 additions & 0 deletions dimos/models/vl/qwen.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from functools import cached_property
import os
from typing import Any

import numpy as np
from openai import OpenAI
Expand Down Expand Up @@ -69,6 +70,32 @@ def query(self, image: Image | np.ndarray, query: str) -> str: # type: ignore[o

return response.choices[0].message.content # type: ignore[return-value]

def query_batch(
    self, images: list[Image], query: str, response_format: dict[str, Any] | None = None, **kwargs: Any
) -> list[str]:  # type: ignore[override]
    """Send every image in one chat-completion request and fan the answer out.

    Returns an empty list for empty input; otherwise one entry per image,
    all holding the same model answer.
    """
    if not images:
        return []

    # Assemble one multimodal user message: all images first, prompt last.
    parts: list[dict[str, Any]] = []
    for img in images:
        prepared, _ = self._prepare_image(img)
        parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{prepared.to_base64()}"},
            }
        )
    parts.append({"type": "text", "text": query})

    request: dict[str, Any] = {
        "model": self.config.model_name,
        "messages": [{"role": "user", "content": parts}],
    }
    if response_format:
        request["response_format"] = response_format

    result = self._client.chat.completions.create(**request)
    answer = result.choices[0].message.content or ""
    # Return one response per image (same response since API analyzes all images together)
    return [answer] * len(images)

def stop(self) -> None:
"""Release the OpenAI client."""
if "_client" in self.__dict__:
Expand Down
15 changes: 15 additions & 0 deletions dimos/perception/experimental/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Experimental perception modules."""
32 changes: 32 additions & 0 deletions dimos/perception/experimental/temporal_memory/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Temporal memory runs "Temporal/Spatial RAG" on streamed videos, building a continuous entity-based
memory over time. It uses a VLM to extract evidence in sliding windows, tracks
entities across windows, maintains a rolling summary, and stores relations in a graph network.

Methodology
1) Sample frames at a target FPS and analyze them in sliding windows.
2) Extract dense evidence with a VLM (caption + entities + relations).
3) Update rolling summary for global context.
4) Persist per-window evidence + entity graph for query-time context.

Setup
- Put your OpenAI key in `.env`:
`OPENAI_API_KEY=...`
- Install dimensional dependencies

Quickstart
To run: `dimos --replay run unitree-go2-temporal-memory`

In another terminal: `humancli` to chat with the agent and run memory queries.

Artifacts
By default, artifacts are written under `assets/temporal_memory`:
- `evidence.jsonl` (window evidence: captions, entities, relations)
- `state.json` (rolling summary + roster state)
- `entities.json` (current entity roster)
- `frames_index.jsonl` (timestamps for saved frames; written on stop)
- `entity_graph.db` (SQLite graph of relations/distances)

Notes
- Evidence is extracted in sliding windows, so queries can refer to recent or past entities.
- Distance estimation can run in the background to enrich graph relations.
- If you want a different output directory, set `TemporalMemoryConfig(output_dir=...)`.
24 changes: 24 additions & 0 deletions dimos/perception/experimental/temporal_memory/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Temporal memory package."""

from .temporal_memory import Frame, TemporalMemory, TemporalMemoryConfig, temporal_memory

__all__ = [
"Frame",
"TemporalMemory",
"TemporalMemoryConfig",
"temporal_memory",
]
Loading
Loading