dimensionalOS · leshy · Aug 4, 2025 · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025
diff --git a/bin/foxglove-bridge b/bin/foxglove-bridge
diff --git a/bin/lcmspy b/bin/lcmspy
diff --git a/dimos/agents/agent_message.py b/dimos/agents/agent_message.py
@@ -0,0 +1,101 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AgentMessage type for multimodal agent communication."""
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+import time
+
+from dimos.msgs.sensor_msgs.Image import Image
+from dimos.agents.agent_types import AgentImage
+
+
+@dataclass
+class AgentMessage:
+    """Multimodal message for an agent: text fragments plus encoded images.
+
+    ``messages`` holds separate text fragments; ``get_combined_text`` joins the
+    non-empty ones with single spaces. ``images`` holds ``AgentImage`` objects;
+    ``add_image`` also accepts a raw ``Image`` and converts it. Empty strings
+    are never treated as text by ``add_text``, ``has_text``,
+    ``get_primary_text`` or ``get_combined_text``.
+    """
+
+    messages: List[str] = field(default_factory=list)
+    images: List[AgentImage] = field(default_factory=list)
+    sender_id: Optional[str] = None
+    timestamp: float = field(default_factory=time.time)
+
+    def add_text(self, text: str) -> None:
+        """Append ``text`` to ``messages``; empty (falsy) text is ignored."""
+        if text:
+            self.messages.append(text)
+
+    def add_image(self, image: Union[Image, AgentImage]) -> None:
+        """Append an image; a raw ``Image`` is first encoded into an ``AgentImage``.
+
+        Raises TypeError if ``image`` is neither an ``Image`` nor an ``AgentImage``.
+        """
+        if isinstance(image, Image):
+            image = AgentImage(
+                base64_jpeg=image.agent_encode(),
+                width=image.width,
+                height=image.height,
+                metadata={"format": image.format.value, "frame_id": image.frame_id},
+            )
+        elif not isinstance(image, AgentImage):
+            raise TypeError(f"Expected Image or AgentImage, got {type(image)}")
+        self.images.append(image)
+
+    def has_text(self) -> bool:
+        """Return True if at least one entry in ``messages`` is non-empty."""
+        # any() over the list itself already skips empty strings.
+        return any(self.messages)
+
+    def has_images(self) -> bool:
+        """Return True if at least one image is attached."""
+        return bool(self.images)
+
+    def is_multimodal(self) -> bool:
+        """Return True only when both non-empty text and images are present."""
+        return self.has_text() and self.has_images()
+
+    def get_primary_text(self) -> Optional[str]:
+        """Return the first non-empty text message, or None if there is none."""
+        return next((msg for msg in self.messages if msg), None)
+
+    def get_primary_image(self) -> Optional[AgentImage]:
+        """Return the first attached image, or None if there are no images."""
+        return self.images[0] if self.images else None
+
+    def get_combined_text(self) -> str:
+        """Return all non-empty text messages joined by single spaces."""
+        # filter(None, ...) drops empty strings so they add no extra spaces.
+        return " ".join(filter(None, self.messages))
+
+    def clear(self) -> None:
+        """Remove all text and images in place; sender and timestamp are kept."""
+        self.messages.clear()
+        self.images.clear()
+
+    def __repr__(self) -> str:
+        """Return a compact summary: item counts, sender and timestamp."""
+        return (
+            f"AgentMessage("
+            f"texts={len(self.messages)}, "
+            f"images={len(self.images)}, "
+            f"sender='{self.sender_id}', "
+            f"timestamp={self.timestamp})"
+        )
diff --git a/dimos/agents/agent_types.py b/dimos/agents/agent_types.py
@@ -0,0 +1,76 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Agent-specific types for message passing."""
+
+import time
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Literal, Optional, TypedDict
+
+
+@dataclass
+class AgentImage:
+    """A base64-encoded JPEG string plus optional size and free-form metadata.
+
+    ``__repr__`` leaves out the encoded payload and shows only the
+    ``width`` x ``height`` pair and the metadata keys.
+    """
+
+    base64_jpeg: str
+    width: Optional[int] = None
+    height: Optional[int] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def __repr__(self) -> str:
+        return f"AgentImage(size={self.width}x{self.height}, metadata={list(self.metadata)})"
+
+
+@dataclass
+class ToolCall:
+    """A tool/function invocation requested by the LLM, with a status string."""
+
+    id: str
+    name: str
+    arguments: Dict[str, Any]
+    status: str = "pending"  # one of: pending, executing, completed, failed
+
+    def __repr__(self) -> str:
+        return "ToolCall(id='{}', name='{}', status='{}')".format(self.id, self.name, self.status)
+
+
+@dataclass
+class AgentResponse:
+    """Reply produced by an agent query, optionally carrying tool-call requests.
+
+    ``__repr__`` shows at most 50 characters of ``content`` and any tool-call count.
+    """
+
+    content: str
+    role: str = "assistant"
+    tool_calls: Optional[List[ToolCall]] = None
+    requires_follow_up: bool = False  # Indicates if tool execution is needed
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    timestamp: float = field(default_factory=time.time)
+
+    def __repr__(self) -> str:
+        preview = self.content if len(self.content) <= 50 else self.content[:50] + "..."
+        tools = f", tools={len(self.tool_calls)}" if self.tool_calls else ""
+        return f"AgentResponse(role='{self.role}', content='{preview}'{tools})"
+
+
+class ToolMessage(TypedDict):  # dict-shaped message whose role is always "tool"
+    role: Literal["tool"]  # a declared key; plain `role = "tool"` only set a class attribute
+    tool_call_id: str
+    content: str
+    name: str
diff --git a/dimos/agents/memory/image_embedding.py b/dimos/agents/memory/image_embedding.py
@@ -54,6 +54,7 @@ def __init__(self, model_name: str = "clip", dimensions: int = 512):
         self.dimensions = dimensions
         self.model = None
         self.processor = None
+        self.model_path = None
 
         self._initialize_model()
 
@@ -68,10 +69,16 @@ def _initialize_model(self):
 
             if self.model_name == "clip":
                 model_id = get_data("models_clip") / "model.onnx"
+                self.model_path = str(model_id)  # Store for pickling
                 processor_id = "openai/clip-vit-base-patch32"
-                self.model = ort.InferenceSession(model_id)
+
+                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+
+                self.model = ort.InferenceSession(str(model_id), providers=providers)
+
+                actual_providers = self.model.get_providers()
                 self.processor = CLIPProcessor.from_pretrained(processor_id)
-                logger.info(f"Loaded CLIP model: {model_id}")
+                logger.info(f"Loaded CLIP model: {model_id} with providers: {actual_providers}")
             elif self.model_name == "resnet":
                 model_id = "microsoft/resnet-50"
                 self.model = AutoModel.from_pretrained(model_id)

diff --git a/dimos/agents/memory/spatial_vector_db.py b/dimos/agents/memory/spatial_vector_db.py
@@ -38,7 +38,11 @@ class SpatialVectorDB:
     """
 
     def __init__(
-        self, collection_name: str = "spatial_memory", chroma_client=None, visual_memory=None
+        self,
+        collection_name: str = "spatial_memory",
+        chroma_client=None,
+        visual_memory=None,
+        embedding_provider=None,
     ):
         """
         Initialize the spatial vector database.
@@ -47,6 +51,7 @@ def __init__(
             collection_name: Name of the vector database collection
             chroma_client: Optional ChromaDB client for persistence. If None, an in-memory client is used.
             visual_memory: Optional VisualMemory instance for storing images. If None, a new one is created.
+            embedding_provider: Optional ImageEmbeddingProvider instance for computing embeddings. If None, one will be created.
         """
         self.collection_name = collection_name
 
@@ -77,6 +82,9 @@ def __init__(
         # Use provided visual memory or create a new one
         self.visual_memory = visual_memory if visual_memory is not None else VisualMemory()
 
+        # Store the embedding provider to reuse for all operations
+        self.embedding_provider = embedding_provider
+
         # Log initialization info with details about whether using existing collection
         client_type = "persistent" if chroma_client is not None else "in-memory"
         try:
@@ -223,11 +231,12 @@ def query_by_text(self, text: str, limit: int = 5) -> List[Dict]:
         Returns:
             List of results, each containing the image, its metadata, and similarity score
         """
-        from dimos.agents.memory.image_embedding import ImageEmbeddingProvider
+        if self.embedding_provider is None:
+            from dimos.agents.memory.image_embedding import ImageEmbeddingProvider
 
-        embedding_provider = ImageEmbeddingProvider(model_name="clip")
+            self.embedding_provider = ImageEmbeddingProvider(model_name="clip")
 
-        text_embedding = embedding_provider.get_text_embedding(text)
+        text_embedding = self.embedding_provider.get_text_embedding(text)
 
         results = self.image_collection.query(
             query_embeddings=[text_embedding.tolist()],

diff --git a/dimos/agents/modules/__init__.py b/dimos/agents/modules/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Agent modules for DimOS."""