Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7e99d3f
temporal memory + vlm agent + blueprints
ClaireBookworm Jan 9, 2026
0076db9
fixing module issue and style
ClaireBookworm Jan 9, 2026
6a68b5b
fix skill registration
paul-nechifor Jan 10, 2026
56fd322
removing state functions unpickable
ClaireBookworm Jan 10, 2026
0c6111b
Merge branch 'dev-memory-clean' of https://github.com/dimensionalOS/d…
ClaireBookworm Jan 10, 2026
5ef91c2
inheritancefixes and memory management
ClaireBookworm Jan 11, 2026
20bf28e
docstring for query
ClaireBookworm Jan 11, 2026
b76d801
microcommit: fixing memory buffer
ClaireBookworm Jan 11, 2026
2184a34
sharpness filter and simplified frame filtering
spomichter Jan 13, 2026
42fd629
CI code cleanup
spomichter Jan 13, 2026
5f1116b
initial graph database implementation
ClaireBookworm Jan 13, 2026
3a70039
db implementation, working and stylized, best reply is unitree_go2_of…
ClaireBookworm Jan 14, 2026
581471f
type checking issues
ClaireBookworm Jan 14, 2026
79cc039
Merge branch 'dev' of https://github.com/dimensionalOS/dimos into dev…
ClaireBookworm Jan 14, 2026
e0112e2
final edits, move into experimental, revert non-memory code edits, ty…
ClaireBookworm Jan 14, 2026
7003288
persistent db flag enabled in config
ClaireBookworm Jan 15, 2026
fef34b7
Fix test to not run in CI due to LFS pull
spomichter Jan 15, 2026
7347693
Fix CLIP filter to use dimensional clip
spomichter Jan 15, 2026
52c2fd8
Add path to temporal memory
spomichter Jan 15, 2026
0aa8545
revert video operators
spomichter Jan 15, 2026
6a01797
Revert moondream
spomichter Jan 15, 2026
6b2175d
added temporal memory docs
spomichter Jan 15, 2026
0d15655
Refactor move to /experimental/temporal_memory
spomichter Jan 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 24 additions & 15 deletions dimos/agents/vlm_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from langchain_core.messages import AIMessage, HumanMessage
from typing import Any

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

from dimos.agents.llm_init import build_llm, build_system_message
from dimos.agents.spec import AgentSpec, AnyMessage
Expand All @@ -31,7 +33,7 @@ class VLMAgent(AgentSpec):
query_stream: In[HumanMessage]
answer_stream: Out[AIMessage]

def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def]
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._llm = build_llm(self.config)
self._latest_image: Image | None = None
Expand Down Expand Up @@ -71,18 +73,23 @@ def _extract_text(self, msg: HumanMessage) -> str:
return str(part.get("text", ""))
return str(content)

def _invoke(self, msg: HumanMessage) -> AIMessage:
def _invoke(self, msg: HumanMessage, **kwargs: Any) -> AIMessage:
messages = [self._system_message, msg]
response = self._llm.invoke(messages)
response = self._llm.invoke(messages, **kwargs)
self.append_history([msg, response]) # type: ignore[arg-type]
return response # type: ignore[return-value]

def _invoke_image(self, image: Image, query: str) -> AIMessage:
def _invoke_image(
self, image: Image, query: str, response_format: dict[str, Any] | None = None
) -> AIMessage:
content = [{"type": "text", "text": query}, *image.agent_encode()]
return self._invoke(HumanMessage(content=content))
kwargs: dict[str, Any] = {}
if response_format:
kwargs["response_format"] = response_format
return self._invoke(HumanMessage(content=content), **kwargs)

@rpc
def clear_history(self): # type: ignore[no-untyped-def]
def clear_history(self) -> None:
self._history.clear()

def append_history(self, *msgs: list[AIMessage | HumanMessage]) -> None:
Expand All @@ -95,24 +102,26 @@ def history(self) -> list[AnyMessage]:
return [self._system_message, *self._history]

@rpc
def register_skills( # type: ignore[no-untyped-def]
self, container, run_implicit_name: str | None = None
) -> None:
def register_skills(self, container: Any, run_implicit_name: str | None = None) -> None:
logger.warning(
"VLMAgent does not manage skills; register_skills is a no-op",
container=str(container),
run_implicit_name=run_implicit_name,
)

@rpc
def query(self, query: str): # type: ignore[no-untyped-def]
def query(self, query: str) -> str:
response = self._invoke(HumanMessage(query))
return response.content
content = response.content
return content if isinstance(content, str) else str(content)

@rpc
def query_image(self, image: Image, query: str): # type: ignore[no-untyped-def]
response = self._invoke_image(image, query)
return response.content
def query_image(
self, image: Image, query: str, response_format: dict[str, Any] | None = None
) -> str:
response = self._invoke_image(image, query, response_format=response_format)
content = response.content
return content if isinstance(content, str) else str(content)


vlm_agent = VLMAgent.blueprint
Expand Down
2 changes: 2 additions & 0 deletions dimos/models/vl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
from dimos.models.vl.florence import Florence2Model
from dimos.models.vl.moondream import MoondreamVlModel
from dimos.models.vl.moondream_hosted import MoondreamHostedVlModel
from dimos.models.vl.openai import OpenAIVlModel
from dimos.models.vl.qwen import QwenVlModel

__all__ = [
"Captioner",
"Florence2Model",
"MoondreamHostedVlModel",
"MoondreamVlModel",
"OpenAIVlModel",
"QwenVlModel",
"VlModel",
]
106 changes: 106 additions & 0 deletions dimos/models/vl/openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from dataclasses import dataclass
from functools import cached_property
import os
from typing import Any

import numpy as np
from openai import OpenAI

from dimos.models.vl.base import VlModel, VlModelConfig
from dimos.msgs.sensor_msgs import Image
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


@dataclass
class OpenAIVlModelConfig(VlModelConfig):
    """Configuration for OpenAIVlModel."""

    # OpenAI chat model name used for vision queries.
    model_name: str = "gpt-4o-mini"
    # Explicit API key; when None, the OPENAI_API_KEY environment variable is used instead.
    api_key: str | None = None


class OpenAIVlModel(VlModel):
    """Vision-language model backed by the OpenAI Chat Completions API.

    Images are base64-encoded as PNG data URLs and sent alongside the text
    prompt to the configured ``model_name`` (default ``gpt-4o-mini``).
    """

    default_config = OpenAIVlModelConfig
    config: OpenAIVlModelConfig

    @cached_property
    def _client(self) -> OpenAI:
        """Build the OpenAI client lazily, resolving the API key once.

        Raises:
            ValueError: if no key is configured and OPENAI_API_KEY is unset.
        """
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "OpenAI API key must be provided or set in OPENAI_API_KEY environment variable"
            )

        return OpenAI(api_key=api_key)

    def query(
        self,
        image: Image | np.ndarray,
        query: str,
        response_format: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> str:  # type: ignore[override, type-arg]
        """Query the VLM with a single image and a text prompt.

        Args:
            image: Input image; numpy arrays are accepted but deprecated.
            query: Text prompt sent with the image.
            response_format: Optional OpenAI ``response_format`` payload
                (e.g. for JSON-mode structured output).
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            The model's text answer, or "" if the API returns no content
            (e.g. a refusal) — matching ``query_batch``'s behavior.
        """
        if isinstance(image, np.ndarray):
            import warnings

            warnings.warn(
                "OpenAIVlModel.query should receive standard dimos Image type, not a numpy array",
                DeprecationWarning,
                stacklevel=2,
            )

            image = Image.from_numpy(image)

        # Apply auto_resize if configured
        image, _ = self._prepare_image(image)

        img_base64 = image.to_base64()

        api_kwargs: dict[str, Any] = {
            "model": self.config.model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        },
                        {"type": "text", "text": query},
                    ],
                }
            ],
        }

        if response_format:
            api_kwargs["response_format"] = response_format

        response = self._client.chat.completions.create(**api_kwargs)

        # message.content can be None (e.g. refusals); normalize to "" for a
        # stable str return instead of masking it with a type: ignore.
        return response.choices[0].message.content or ""

    def query_batch(
        self, images: list[Image], query: str, response_format: dict[str, Any] | None = None, **kwargs: Any
    ) -> list[str]:  # type: ignore[override]
        """Query VLM with multiple images using a single API call.

        All images go into one user message; the single answer is duplicated
        so callers receive exactly one entry per input image.
        """
        if not images:
            return []

        content: list[dict[str, Any]] = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{self._prepare_image(img)[0].to_base64()}"},
            }
            for img in images
        ]
        content.append({"type": "text", "text": query})

        messages = [{"role": "user", "content": content}]
        api_kwargs: dict[str, Any] = {"model": self.config.model_name, "messages": messages}
        if response_format:
            api_kwargs["response_format"] = response_format

        response = self._client.chat.completions.create(**api_kwargs)
        response_text = response.choices[0].message.content or ""
        # Return one response per image (same response since API analyzes all images together)
        return [response_text] * len(images)

    def stop(self) -> None:
        """Release the OpenAI client so a fresh one is built on next use."""
        if "_client" in self.__dict__:
            del self.__dict__["_client"]

27 changes: 27 additions & 0 deletions dimos/models/vl/qwen.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from functools import cached_property
import os
from typing import Any

import numpy as np
from openai import OpenAI
Expand Down Expand Up @@ -69,6 +70,32 @@ def query(self, image: Image | np.ndarray, query: str) -> str: # type: ignore[o

return response.choices[0].message.content # type: ignore[return-value]

def query_batch(
    self, images: list[Image], query: str, response_format: dict[str, Any] | None = None, **kwargs: Any
) -> list[str]:  # type: ignore[override]
    """Send every image in one chat-completion request and fan the answer out.

    Returns an empty list for empty input; otherwise one entry per image,
    all holding the same model answer.
    """
    if not images:
        return []

    # Assemble one multimodal user message: all images first, prompt last.
    parts: list[dict[str, Any]] = []
    for img in images:
        prepared, _ = self._prepare_image(img)
        parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{prepared.to_base64()}"},
            }
        )
    parts.append({"type": "text", "text": query})

    request: dict[str, Any] = {
        "model": self.config.model_name,
        "messages": [{"role": "user", "content": parts}],
    }
    if response_format:
        request["response_format"] = response_format

    result = self._client.chat.completions.create(**request)
    answer = result.choices[0].message.content or ""
    # Return one response per image (same response since API analyzes all images together)
    return [answer] * len(images)

def stop(self) -> None:
"""Release the OpenAI client."""
if "_client" in self.__dict__:
Expand Down
15 changes: 15 additions & 0 deletions dimos/perception/experimental/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Experimental perception modules."""
32 changes: 32 additions & 0 deletions dimos/perception/experimental/temporal_memory/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Temporal memory runs "Temporal/Spatial RAG" on streamed videos, building a continuous entity-based
memory over time. It uses a VLM to extract evidence in sliding windows, tracks
entities across windows, maintains a rolling summary, and stores relations in a graph network.

Methodology
1) Sample frames at a target FPS and analyze them in sliding windows.
2) Extract dense evidence with a VLM (caption + entities + relations).
3) Update rolling summary for global context.
4) Persist per-window evidence + entity graph for query-time context.

Setup
- Put your OpenAI key in `.env`:
`OPENAI_API_KEY=...`
- Install dimensional dependencies

Quickstart
To run: `dimos --replay run unitree-go2-temporal-memory`

In another terminal: `humancli` to chat with the agent and run memory queries.

Artifacts
By default, artifacts are written under `assets/temporal_memory`:
- `evidence.jsonl` (window evidence: captions, entities, relations)
- `state.json` (rolling summary + roster state)
- `entities.json` (current entity roster)
- `frames_index.jsonl` (timestamps for saved frames; written on stop)
- `entity_graph.db` (SQLite graph of relations/distances)

Notes
- Evidence is extracted in sliding windows, so queries can refer to recent or past entities.
- Distance estimation can run in the background to enrich graph relations.
- If you want a different output directory, set `TemporalMemoryConfig(output_dir=...)`.
24 changes: 24 additions & 0 deletions dimos/perception/experimental/temporal_memory/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Temporal memory package."""

from .temporal_memory import Frame, TemporalMemory, TemporalMemoryConfig, temporal_memory

__all__ = [
"Frame",
"TemporalMemory",
"TemporalMemoryConfig",
"temporal_memory",
]
Loading
Loading