Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
6b6b9b2
local planner integration with new webrtc interface
alexlin2 May 16, 2025
9456080
commenting out cuda stuff
leshy May 22, 2025
e4fc046
commenting out aionice from requirements as an experiment
leshy May 22, 2025
fa4e6e3
local planner working and tested with webrtc interface
alexlin2 Jun 1, 2025
613935f
added local costmap to map type, fixed recovery logic to be more robust
alexlin2 Jun 2, 2025
03e9b82
to be squashed
alexlin2 Jun 6, 2025
54b5236
bug fixes, local planner now is integrated with dev
alexlin2 Jun 7, 2025
700013e
Major refactor, moved all connection types into an abstract class cal…
alexlin2 Jun 7, 2025
875ce57
CI code cleanup
alexlin2 Jun 7, 2025
f669ff1
bug fix and bigger tolerance on angle
alexlin2 Jun 7, 2025
819d912
added a navigation only run file
alexlin2 Jun 7, 2025
b03b14f
added back object detection stream, now without dependency on ros tra…
alexlin2 Jun 8, 2025
39e22c8
added all unitree skills support for webrtc
alexlin2 Jun 9, 2025
c714906
most stupid bug ever
alexlin2 Jun 10, 2025
22d4ab1
bug fix, removed backpressure all together
alexlin2 Jun 10, 2025
9f0cd5a
added local planner changes
alexlin2 Jun 11, 2025
7796752
initial implementation of frontier exploration
alexlin2 Jun 11, 2025
79a0d0f
added qwen frontier predictor
alexlin2 Jun 11, 2025
ca4d9ea
Quick fix Qwen VL Max implementation for single_frame_query
spomichter Jun 11, 2025
ed39c13
Fix CLIP navigation threshold from LLM set to hardcoded
spomichter Jun 11, 2025
d23f9a7
Temporarily comment out non-essential Unitree skills for Cerebras con…
spomichter Jun 11, 2025
1b1f7e4
Integrated Cerebras to run.py
spomichter Jun 11, 2025
f6ca110
local planner integration with new webrtc interface
alexlin2 May 16, 2025
e6cfe33
commenting out cuda stuff
leshy May 22, 2025
ce73101
commenting out aionice from requirements as an experiment
leshy May 22, 2025
79dd910
local planner working and tested with webrtc interface
alexlin2 Jun 1, 2025
31190b9
added local costmap to map type, fixed recovery logic to be more robust
alexlin2 Jun 2, 2025
57b7bca
bug fixes, local planner now is integrated with dev
alexlin2 Jun 7, 2025
923619f
Major refactor, moved all connection types into an abstract class cal…
alexlin2 Jun 7, 2025
f08913d
CI code cleanup
alexlin2 Jun 7, 2025
fe2ffb5
bug fix and bigger tolerance on angle
alexlin2 Jun 7, 2025
1833d42
added a navigation only run file
alexlin2 Jun 7, 2025
f369e7a
added back object detection stream, now without dependency on ros tra…
alexlin2 Jun 8, 2025
7489a96
added all unitree skills support for webrtc
alexlin2 Jun 9, 2025
e3d781b
most stupid bug ever
alexlin2 Jun 10, 2025
d1ba9d9
bug fix, removed backpressure all together
alexlin2 Jun 10, 2025
b7c489f
initial implementation of frontier exploration
alexlin2 Jun 11, 2025
02f54a2
added qwen frontier predictor
alexlin2 Jun 11, 2025
237f365
improved wavefront detector
alexlin2 Jun 11, 2025
1668238
explored skill with frontier exploration fully working, many local pl…
alexlin2 Jun 12, 2025
03e54bf
No need to click start on controller
alexlin2 Jun 12, 2025
21bdafc
Temp changes to Agent, added append_to_history method
spomichter Jun 12, 2025
0a474b3
Created new observe skill, updated observestream
spomichter Jun 12, 2025
94c9274
Working runfile, cerebras updates
spomichter Jun 12, 2025
43b3bcf
bug fix with frontier sizes
alexlin2 Jun 12, 2025
807b874
two tiny bugs
alexlin2 Jun 12, 2025
0d44ce6
fixed frontier detector scoring mechanism
alexlin2 Jun 12, 2025
96d904d
check if attempts exceed maximum
alexlin2 Jun 12, 2025
9257b68
Navigation needs to terminate
alexlin2 Jun 12, 2025
c0d8675
switched back to claude, fixed local planner orientation bug
alexlin2 Jun 21, 2025
38170e0
removed local_costmap from map.py
alexlin2 Jun 21, 2025
1e5fdfc
removed_external_packages
alexlin2 Jun 21, 2025
e77a734
reverted some changes
alexlin2 Jun 21, 2025
fb39de8
got rid of ROS OccupancyGrid in some instances
alexlin2 Jun 21, 2025
bf911e7
use qwen 2.5
alexlin2 Jun 21, 2025
22c0016
put standup in the robot initialization so that the lidar stream does…
alexlin2 Jun 21, 2025
61e73a3
changed voxel_size back to 0.2
alexlin2 Jun 22, 2025
299e408
added tests for frontier exploration
alexlin2 Jun 22, 2025
99a6db9
temp fix for observe stream
alexlin2 Jun 22, 2025
188396a
moved wavefront into its own class
alexlin2 Jun 25, 2025
5774d60
revert changes made to global planner
alexlin2 Jun 25, 2025
684330c
updated all comments
alexlin2 Jun 30, 2025
225ecce
fixed all comments
alexlin2 Jul 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ __pycache__

# Ignore default runtime output folder
/assets/output/
/assets/rgbd_data/
/assets/saved_maps/
/assets/model-cache/
assets/agent/memory.txt
/assets/agent/memory.txt

.bash_history

Expand All @@ -33,3 +35,8 @@ package-lock.json

# Ignore build artifacts
dist/

# Ignore data and modelfiles
data/
FastSAM-x.pt
yolo11n.pt
24 changes: 8 additions & 16 deletions dimos/perception/detection2d/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@

import numpy as np
import cv2

from dimos.utils.ros_utils import distance_angle_to_goal_xy
from dimos.types.vector import Vector
from dimos.utils.transform_utils import distance_angle_to_goal_xy


def filter_detections(
Expand Down Expand Up @@ -206,24 +206,19 @@ def plot_results(image, bboxes, track_ids, class_ids, confidences, names, alpha=
return vis_img


def calculate_depth_from_bbox(depth_model, frame, bbox):
def calculate_depth_from_bbox(depth_map, bbox):
"""
Calculate the average depth of an object within a bounding box.
Uses the 25th to 75th percentile range to filter outliers.

Args:
depth_model: Depth model
frame: The image frame
depth_map: The depth map
bbox: Bounding box in format [x1, y1, x2, y2]

Returns:
float: Average depth in meters, or None if depth estimation fails
"""
try:
# Get depth map for the entire frame
depth_map = depth_model.infer_depth(frame)
depth_map = np.array(depth_map)

# Extract region of interest from the depth map
x1, y1, x2, y2 = map(int, bbox)
roi_depth = depth_map[y1:y2, x1:x2]
Expand Down Expand Up @@ -323,7 +318,8 @@ def calculate_position_rotation_from_bbox(bbox, depth, camera_intrinsics):
camera_intrinsics: List [fx, fy, cx, cy] with camera parameters

Returns:
Tuple of (position_dict, rotation_dict)
Vector: position
Vector: rotation
"""
# Calculate distance and angle to object
distance, angle = calculate_distance_angle_from_bbox(bbox, depth, camera_intrinsics)
Expand All @@ -336,11 +332,7 @@ def calculate_position_rotation_from_bbox(bbox, depth, camera_intrinsics):
# For now, rotation is only in yaw (around z-axis)
# We can use the negative of the angle as an estimate of the object's yaw
# assuming objects tend to face the camera
position = {"x": x, "y": y, "z": 0.0} # z=0 assuming objects are on the ground
rotation = {
"roll": 0.0,
"pitch": 0.0,
"yaw": -angle,
} # Only yaw is meaningful with monocular camera
position = Vector([x, y, 0.0])
rotation = Vector([0.0, 0.0, -angle])

return position, rotation
31 changes: 18 additions & 13 deletions dimos/perception/object_detection_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import cv2
import time
import numpy as np
from reactivex import Observable
from reactivex import operators as ops
Expand All @@ -26,8 +27,9 @@
calculate_position_rotation_from_bbox,
)
from dimos.types.vector import Vector
from typing import Optional, Union
from typing import Optional, Union, Callable
from dimos.types.manipulation import ObjectData
from dimos.utils.transform_utils import transform_robot_to_map

from dimos.utils.logging_config import setup_logger

Expand All @@ -54,7 +56,7 @@ def __init__(
gt_depth_scale=1000.0,
min_confidence=0.7,
class_filter=None, # Optional list of class names to filter (e.g., ["person", "car"])
transform_to_map=None, # Optional function to transform coordinates to map frame
get_pose: Callable = None, # Optional function to transform coordinates to map frame
detector: Optional[Union[Detic2DDetector, Yolo2DDetector]] = None,
video_stream: Observable = None,
disable_depth: bool = False, # Flag to disable monocular Metric3D depth estimation
Expand All @@ -69,15 +71,15 @@ def __init__(
gt_depth_scale: Ground truth depth scale for Metric3D
min_confidence: Minimum confidence for detections
class_filter: Optional list of class names to filter
transform_to_map: Optional function to transform pose to map coordinates
get_pose: Optional function to transform pose to map coordinates
detector: Optional detector instance (Detic or Yolo)
video_stream: Observable of video frames to process (if provided, returns a stream immediately)
disable_depth: Flag to disable monocular Metric3D depth estimation
draw_masks: Flag to enable drawing segmentation masks
"""
self.min_confidence = min_confidence
self.class_filter = class_filter
self.transform_to_map = transform_to_map
self.get_pose = get_pose
self.disable_depth = disable_depth
self.draw_masks = draw_masks
# Initialize object detector
Expand Down Expand Up @@ -131,6 +133,11 @@ def process_frame(frame):

# Process detections
objects = []
if not self.disable_depth:
depth_map = self.depth_model.infer_depth(frame)
depth_map = np.array(depth_map)
else:
depth_map = None

for i, bbox in enumerate(bboxes):
# Skip if confidence is too low
Expand All @@ -142,9 +149,9 @@ def process_frame(frame):
if self.class_filter and class_name not in self.class_filter:
continue

if not self.disable_depth:
if not self.disable_depth and depth_map is not None:
# Get depth for this object
depth = calculate_depth_from_bbox(self.depth_model, frame, bbox)
depth = calculate_depth_from_bbox(depth_map, bbox)
if depth is None:
# Skip objects with invalid depth
continue
Expand All @@ -159,13 +166,11 @@ def process_frame(frame):

# Transform to map frame if a transform function is provided
try:
if self.transform_to_map:
position = Vector([position["x"], position["y"], position["z"]])
rotation = Vector(
[rotation["roll"], rotation["pitch"], rotation["yaw"]]
)
position, rotation = self.transform_to_map(
position, rotation, source_frame="base_link"
if self.get_pose:
# position and rotation are already Vector objects, no need to convert
robot_pose = self.get_pose()
position, rotation = transform_robot_to_map(
robot_pose, position, rotation
)
except Exception as e:
logger.error(f"Error transforming to map frame: {e}")
Expand Down
4 changes: 3 additions & 1 deletion dimos/perception/object_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def track(self, bbox, frame=None, distance=None, size=None):

# Calculate depth only if distance and size not provided
if frame is not None and distance is None and size is None:
depth_estimate = calculate_depth_from_bbox(self.depth_model, frame, bbox)
depth_map = self.depth_model.infer_depth(frame)
depth_map = np.array(depth_map)
depth_estimate = calculate_depth_from_bbox(depth_map, bbox)
if depth_estimate is not None:
print(f"Estimated depth for object: {depth_estimate:.2f}m")

Expand Down
14 changes: 6 additions & 8 deletions dimos/perception/spatial_perception.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ def __init__(
"VisualMemory"
] = None, # Optional VisualMemory instance for storing images
video_stream: Optional[Observable] = None, # Video stream to process
transform_provider: Optional[
callable
] = None, # Function that returns position and rotation
get_pose: Optional[callable] = None, # Function that returns position and rotation
):
"""
Initialize the spatial perception system.
Expand Down Expand Up @@ -162,8 +160,8 @@ def __init__(
logger.info(f"SpatialMemory initialized with model {embedding_model}")

# Start processing video stream if provided
if video_stream is not None and transform_provider is not None:
self.start_continuous_processing(video_stream, transform_provider)
if video_stream is not None and get_pose is not None:
self.start_continuous_processing(video_stream, get_pose)

def query_by_location(
self, x: float, y: float, radius: float = 2.0, limit: int = 5
Expand All @@ -183,14 +181,14 @@ def query_by_location(
return self.vector_db.query_by_location(x, y, radius, limit)

def start_continuous_processing(
self, video_stream: Observable, transform_provider: callable
self, video_stream: Observable, get_pose: callable
) -> disposable.Disposable:
"""
Start continuous processing of video frames from an Observable stream.

Args:
video_stream: Observable of video frames
transform_provider: Callable that returns position and rotation for each frame
get_pose: Callable that returns position and rotation for each frame

Returns:
Disposable subscription that can be used to stop processing
Expand All @@ -200,7 +198,7 @@ def start_continuous_processing(

# Map each video frame to include transform data
combined_stream = video_stream.pipe(
ops.map(lambda video_frame: {"frame": video_frame, **transform_provider()}),
ops.map(lambda video_frame: {"frame": video_frame, **get_pose()}),
# Filter out bad transforms
ops.filter(
lambda data: data.get("position") is not None and data.get("rotation") is not None
Expand Down
78 changes: 0 additions & 78 deletions dimos/robot/abstract_robot.py

This file was deleted.

70 changes: 70 additions & 0 deletions dimos/robot/connection_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from typing import Optional
from reactivex.observable import Observable
from dimos.types.vector import Vector

__all__ = ["ConnectionInterface"]


class ConnectionInterface(ABC):
    """Common contract every robot connection backend must satisfy.

    Concrete transports (ROS, WebRTC, ...) subclass this and provide the
    minimal operations needed to drive the robot and obtain its camera
    feed, so higher-level code can stay transport-agnostic.
    """

    @abstractmethod
    def move(self, velocity: Vector, duration: float = 0.0) -> bool:
        """Drive the robot with a velocity command.

        Args:
            velocity: Velocity vector [x, y, yaw] where:
                x: Forward/backward velocity (m/s)
                y: Left/right velocity (m/s)
                yaw: Rotational velocity (rad/s)
            duration: How long to move (seconds). If 0, command is continuous

        Returns:
            bool: True if command was sent successfully
        """
        ...

    @abstractmethod
    def get_video_stream(self, fps: int = 30) -> Optional[Observable]:
        """Expose the robot camera as an observable frame stream.

        Args:
            fps: Frames per second for the video stream

        Returns:
            Observable: An observable stream of video frames or None if not available
        """
        ...

    @abstractmethod
    def stop(self) -> bool:
        """Halt any in-progress robot motion.

        Returns:
            bool: True if stop command was sent successfully
        """
        ...

    @abstractmethod
    def disconnect(self) -> None:
        """Tear down the connection and release any held resources."""
        ...
1 change: 1 addition & 0 deletions dimos/robot/frontier_exploration/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Re-export the frontier-exploration helpers at package level.
# NOTE: must be an explicit relative import — a bare `from utils import *`
# resolves against sys.path (not this package) under Python 3 and raises
# ModuleNotFoundError unless an unrelated top-level `utils` module exists.
from .utils import *
Loading