Merged

33 commits
e2cdac4
fix: event loop leak
paul-nechifor Aug 12, 2025
38731ca
CI code cleanup
paul-nechifor Aug 12, 2025
80c1f84
commiting this before it gets too disgusting
alexlin2 Jul 30, 2025
236b52d
updated object tracker to use latest stuff
alexlin2 Aug 1, 2025
f159a65
fix header issue
alexlin2 Aug 2, 2025
8384958
fixed local planner and transform bug
alexlin2 Aug 2, 2025
ea9330b
refactored manipulation module to use dimos types
alexlin2 Aug 2, 2025
c48bae9
added camera module for monocular depth estimation
alexlin2 Aug 4, 2025
f580e2b
cleaned up navigator, make set_goal blocking
alexlin2 Aug 4, 2025
29ac067
NavigateWithText now fully working
alexlin2 Aug 5, 2025
8f8a035
update spatial memory to use new types
alexlin2 Aug 5, 2025
f8a994c
more updates to pass tests
alexlin2 Aug 5, 2025
9580c56
small bug
alexlin2 Aug 5, 2025
70b2389
fixed object tracking targeting
alexlin2 Aug 5, 2025
9371605
cleanup and added integration test
alexlin2 Aug 6, 2025
3b1801c
added safe goal selector and fixed bugs
alexlin2 Aug 7, 2025
0a96df3
rebased off of dev
alexlin2 Aug 7, 2025
49a2fe0
CI code cleanup
alexlin2 Aug 7, 2025
cadfebb
tuning
alexlin2 Aug 7, 2025
bca1b29
finally fixed the ridiculously silent bug
alexlin2 Aug 8, 2025
0d2d897
need to pass test
alexlin2 Aug 8, 2025
f721ede
fixed some bugs
alexlin2 Aug 9, 2025
7a14117
added orientation to path
alexlin2 Aug 9, 2025
c3e3531
added recovery behaviors
alexlin2 Aug 9, 2025
1ba7361
ADDED RPC TIMEOUT, super important
alexlin2 Aug 9, 2025
de5c9a5
removed observe and observe stream skill as they are deprecated
alexlin2 Aug 9, 2025
80191fa
pass tests
alexlin2 Aug 9, 2025
7f87546
fixed a lot of bugs, changed costmap gradient to be handle by the con…
alexlin2 Aug 11, 2025
5e3ae8e
pass tests
alexlin2 Aug 11, 2025
41c8657
massive improvements and bug fixes
alexlin2 Aug 12, 2025
2679359
added minimal robot interface class
alexlin2 Aug 12, 2025
18cda83
small transform util bug
alexlin2 Aug 12, 2025
df12393
remove passing in reference to modules
alexlin2 Aug 12, 2025
31 changes: 3 additions & 28 deletions assets/agent/prompt.txt
@@ -29,35 +29,12 @@ PERCEPTION & TEMPORAL AWARENESS:
- You can recognize and track humans and objects in your field of view

NAVIGATION & MOVEMENT:
- You can navigate to semantically described locations using Navigate (e.g., "go to the kitchen")
- You can navigate to visually identified objects using NavigateToObject (e.g., "go to the red chair")
- You can navigate to semantically described locations using NavigateWithText (e.g., "go to the kitchen")
- You can navigate to visually identified objects using NavigateWithText (e.g., "go to the red chair")
- You can follow humans through complex environments using FollowHuman
- You can execute precise movement to specific coordinates using NavigateToGoal like if you're navigating to a GetPose waypoint
- You can perform various body movements and gestures (sit, stand, dance, etc.)
- When navigating to a location like Kitchen or Bathroom or couch, use the generic Navigate skill to query spatial memory and navigate
- You can stop any navigation process that is currently running using KillSkill
- Appended to every query you will find current objects detection and Saved Locations like this:

Current objects detected:
[DETECTED OBJECTS]
Object 1: refrigerator
ID: 1
Confidence: 0.88
Position: x=9.44m, y=5.87m, z=-0.13m
Rotation: yaw=0.11 rad
Size: width=1.00m, height=1.46m
Depth: 4.92m
Bounding box: [606, 212, 773, 456]
----------------------------------
Object 2: box
ID: 2
Confidence: 0.84
Position: x=11.30m, y=5.10m, z=-0.19m
Rotation: yaw=-0.03 rad
Size: width=0.91m, height=0.37m
Depth: 6.60m
Bounding box: [753, 149, 867, 195]
----------------------------------


Saved Robot Locations:
- LOCATION_NAME: Position (X, Y, Z), Rotation (X, Y, Z)
@@ -70,8 +47,6 @@ Saved Robot Locations:

***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!***

***The object detection list is not a comprehensive source of information, when given a visual query like "go to the person wearing a hat" or "Do you see a dog", always Prioritize running observe skill and NavigateWithText***

PLANNING & REASONING:
- You can develop both short-term and long-term plans to achieve complex goals
- You can reason about spatial relationships and plan efficient navigation paths
2 changes: 1 addition & 1 deletion dimos/core/__init__.py
@@ -54,7 +54,7 @@ def __getattr__(self, name: str):

@leshy check if this timeout is now redundant with your changes

yes this needs to be removed, told alex already, can't return None on timeouts

https://discord.com/channels/1341146487186391173/1344622457327321150/1404191636501561387
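The objection above — a timed-out RPC that returns `None` is indistinguishable from a call that legitimately returned `None` — can be sketched with a hypothetical wrapper (the names here are illustrative, not the dimos `rpc.call_sync` API):

```python
import queue
import threading

def call_sync(fn, args, timeout=2.0):
    """Run fn(*args) in a worker thread; raise TimeoutError on expiry so the
    caller can tell 'timed out' apart from a genuine None result."""
    result = queue.Queue(maxsize=1)
    worker = threading.Thread(target=lambda: result.put(fn(*args)), daemon=True)
    worker.start()
    try:
        return result.get(timeout=timeout)
    except queue.Empty:
        raise TimeoutError(f"RPC timed out after {timeout}s")

# A fast call returns normally; a slow one raises instead of yielding None.
print(call_sync(lambda x: x * 2, (21,)))  # → 42
```

With this shape, a caller that forgets to handle the timeout fails loudly rather than silently propagating `None`.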

if name in self.rpcs:
return lambda *args, **kwargs: self.rpc.call_sync(
f"{self.remote_name}/{name}", (args, kwargs)
f"{self.remote_name}/{name}", (args, kwargs), timeout=2.0
)

# return super().__getattr__(name)
76 changes: 30 additions & 46 deletions dimos/hardware/zed_camera.py
@@ -31,13 +31,14 @@
from dimos.hardware.stereo_camera import StereoCamera
from dimos.core import Module, Out, rpc
from dimos.utils.logging_config import setup_logger
from dimos.protocol.tf import TF
from dimos.msgs.geometry_msgs import Transform, Vector3, Quaternion

# Import LCM message types
from dimos_lcm.sensor_msgs import Image
from dimos.msgs.sensor_msgs import Image, ImageFormat
from dimos_lcm.sensor_msgs import CameraInfo
from dimos_lcm.geometry_msgs import PoseStamped
from dimos_lcm.std_msgs import Header, Time
from dimos_lcm.geometry_msgs import Pose, Point, Quaternion
from dimos.msgs.geometry_msgs import PoseStamped
from dimos.msgs.std_msgs import Header

logger = setup_logger(__name__)

@@ -591,6 +592,9 @@ def __init__(
self._subscription = None
self._sequence = 0

# Initialize TF publisher
self.tf = TF()

logger.info(f"ZEDModule initialized for camera {camera_id}")

@rpc
@@ -675,12 +679,8 @@ def _capture_and_publish(self):
if left_img is None or depth is None:
return

# Get timestamp
timestamp_ns = time.time_ns()
timestamp = Time(sec=timestamp_ns // 1_000_000_000, nsec=timestamp_ns % 1_000_000_000)

# Create header
header = Header(seq=self._sequence, stamp=timestamp, frame_id=self.frame_id)
header = Header(self.frame_id)
self._sequence += 1

# Publish color image
@@ -709,20 +709,11 @@ def _publish_color_image(self, image: np.ndarray, header: Header):
image_rgb = image

# Create LCM Image message
height, width = image_rgb.shape[:2]
encoding = "rgb8" if len(image_rgb.shape) == 3 else "mono8"
step = width * (3 if len(image_rgb.shape) == 3 else 1)
data = image_rgb.tobytes()

msg = Image(
data_length=len(data),
header=header,
height=height,
width=width,
encoding=encoding,
is_bigendian=0,
step=step,
data=data,
data=image_rgb,
format=ImageFormat.RGB,
frame_id=header.frame_id,
ts=header.ts,
)

self.color_image.publish(msg)
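The simplification above works because the new `dimos.msgs.sensor_msgs.Image` accepts the numpy array directly; the deleted fields were doing a plain byte round-trip by hand. A minimal sketch of that round-trip (pure numpy, independent of the dimos types):

```python
import numpy as np

# What the removed boilerplate did for an "rgb8" frame: flatten to bytes
# plus (height, width, step) metadata, then rebuild on the consumer side.
frame = np.arange(24, dtype=np.uint8).reshape(2, 4, 3)  # h=2, w=4, 3 channels
height, width = frame.shape[:2]
step = width * 3                 # bytes per row at 1 byte per channel
data = frame.tobytes()

# The consumer's half of the contract: dtype + shape must match exactly.
restored = np.frombuffer(data, dtype=np.uint8).reshape(height, width, 3)
assert np.array_equal(frame, restored)
print(len(data), step)  # → 24 12
```

Passing the array itself removes the chance of the two halves of this contract drifting apart.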
@@ -734,22 +725,12 @@ def _publish_depth_image(self, depth: np.ndarray, header: Header):
"""Publish depth image as LCM message."""
try:
# Depth is float32 in meters
height, width = depth.shape[:2]
encoding = "32FC1" # 32-bit float, single channel
step = width * 4 # 4 bytes per float
data = depth.astype(np.float32).tobytes()

msg = Image(
data_length=len(data),
header=header,
height=height,
width=width,
encoding=encoding,
is_bigendian=0,
step=step,
data=data,
data=depth,
format=ImageFormat.DEPTH,
frame_id=header.frame_id,
ts=header.ts,
)

self.depth_image.publish(msg)

except Exception as e:
@@ -767,7 +748,7 @@ def _publish_camera_info(self):
resolution = info.get("resolution", {})

# Create CameraInfo message
header = Header(seq=0, stamp=Time(sec=int(time.time()), nsec=0), frame_id=self.frame_id)
header = Header(self.frame_id)

# Create camera matrix K (3x3)
K = [
@@ -830,22 +811,25 @@ def _publish_camera_info(self):
logger.error(f"Error publishing camera info: {e}")

def _publish_pose(self, pose_data: Dict[str, Any], header: Header):
"""Publish camera pose as PoseStamped message."""
"""Publish camera pose as PoseStamped message and TF transform."""
try:
position = pose_data.get("position", [0, 0, 0])
rotation = pose_data.get("rotation", [0, 0, 0, 1]) # quaternion [x,y,z,w]

# Create Pose message
pose = Pose(
position=Point(x=position[0], y=position[1], z=position[2]),
orientation=Quaternion(x=rotation[0], y=rotation[1], z=rotation[2], w=rotation[3]),
)

# Create PoseStamped message
msg = PoseStamped(header=header, pose=pose)

msg = PoseStamped(ts=header.ts, position=position, orientation=rotation)
self.pose.publish(msg)

# Publish TF transform
camera_tf = Transform(
translation=Vector3(position),
rotation=Quaternion(rotation),
frame_id="zed_world",
child_frame_id="zed_camera_link",
ts=header.ts,
)
self.tf.publish(camera_tf)

except Exception as e:
logger.error(f"Error publishing pose: {e}")
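The `pose_data` consumed above pairs a translation with a scalar-last `[x, y, z, w]` quaternion. Assuming that convention, applying such a pose to a camera-frame point can be sketched with scipy (already used elsewhere in this PR):

```python
import numpy as np
from scipy.spatial.transform import Rotation as R

# A pose as _publish_pose receives it: translation + [x, y, z, w] quaternion.
position = np.array([1.0, 2.0, 0.5])
quat_xyzw = [0.0, 0.0, 0.70710678, 0.70710678]  # 90 degree yaw about z

# Map a camera-frame point into the world frame: rotate, then translate.
point_cam = np.array([1.0, 0.0, 0.0])
point_world = R.from_quat(quat_xyzw).apply(point_cam) + position
assert np.allclose(point_world, [1.0, 3.0, 0.5])
```

`Rotation.from_quat` expects scalar-last ordering by default, which matches the `[x, y, z, w]` comment in the diff.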

8 changes: 4 additions & 4 deletions dimos/manipulation/visual_servoing/detection3d.py
Expand Up @@ -26,7 +26,8 @@
from dimos.perception.detection2d.utils import calculate_object_size_from_bbox
from dimos.perception.common.utils import bbox2d_to_corners

from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point
from dimos.msgs.geometry_msgs import Pose, Vector3, Quaternion
from dimos.msgs.std_msgs import Header
from dimos_lcm.vision_msgs import (
Detection3D,
Detection3DArray,
Expand All @@ -39,7 +40,6 @@
Pose2D,
Point2D,
)
from dimos_lcm.std_msgs import Header
from dimos.manipulation.visual_servoing.utils import (
estimate_object_depth,
visualize_detections_3d,
@@ -179,8 +179,8 @@ def process_frame(
else:
# If no transform, use camera coordinates
center_pose = Pose(
Point(obj_cam_pos[0], obj_cam_pos[1], obj_cam_pos[2]),
Quaternion(0.0, 0.0, 0.0, 1.0), # Default orientation
position=Vector3(obj_cam_pos[0], obj_cam_pos[1], obj_cam_pos[2]),
orientation=Quaternion(0.0, 0.0, 0.0, 1.0), # Default orientation
)

# Create Detection3D object
38 changes: 11 additions & 27 deletions dimos/manipulation/visual_servoing/manipulation_module.py
Expand Up @@ -27,8 +27,9 @@
import numpy as np

from dimos.core import Module, In, Out, rpc
from dimos_lcm.sensor_msgs import Image, CameraInfo
from dimos_lcm.geometry_msgs import Vector3, Pose, Point, Quaternion
from dimos.msgs.sensor_msgs import Image
from dimos.msgs.geometry_msgs import Vector3, Pose, Quaternion
from dimos_lcm.sensor_msgs import CameraInfo
from dimos_lcm.vision_msgs import Detection3DArray, Detection2DArray

from dimos.hardware.piper_arm import PiperArm
@@ -207,7 +208,9 @@ def __init__(
self.target_click = None

# Place target position and object info
self.home_pose = Pose(Point(0.0, 0.0, 0.0), Quaternion(0.0, 0.0, 0.0, 1.0))
self.home_pose = Pose(
position=Vector3(0.0, 0.0, 0.0), orientation=Quaternion(0.0, 0.0, 0.0, 1.0)
)
self.place_target_position = None
self.target_object_height = None
self.retract_distance = 0.12
Expand Down Expand Up @@ -239,22 +242,14 @@ def stop(self):
def _on_rgb_image(self, msg: Image):
"""Handle RGB image messages."""
try:
data = np.frombuffer(msg.data, dtype=np.uint8)
if msg.encoding == "rgb8":
self.latest_rgb = data.reshape((msg.height, msg.width, 3))
else:
logger.warning(f"Unsupported RGB encoding: {msg.encoding}")
self.latest_rgb = msg.data
except Exception as e:
logger.error(f"Error processing RGB image: {e}")

def _on_depth_image(self, msg: Image):
"""Handle depth image messages."""
try:
if msg.encoding == "32FC1":
data = np.frombuffer(msg.data, dtype=np.float32)
self.latest_depth = data.reshape((msg.height, msg.width))
else:
logger.warning(f"Unsupported depth encoding: {msg.encoding}")
self.latest_depth = msg.data
except Exception as e:
logger.error(f"Error processing depth image: {e}")
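Both handlers collapse to one line because the message's `data` field now arrives as a numpy array instead of raw bytes each consumer must decode. The contrast, with a deliberately simplified, hypothetical `Image` pair (not the dimos classes):

```python
import numpy as np
from dataclasses import dataclass

@dataclass
class RawImage:       # old style: bytes plus decode metadata
    data: bytes
    height: int
    width: int
    encoding: str

@dataclass
class ArrayImage:     # new style: the array itself travels
    data: np.ndarray

depth = np.arange(12, dtype=np.float32).reshape(3, 4)

# Old handler logic: branch on encoding, frombuffer, reshape.
raw = RawImage(depth.tobytes(), 3, 4, "32FC1")
decoded = np.frombuffer(raw.data, dtype=np.float32).reshape(raw.height, raw.width)

# New handler logic: a single attribute access, no decoding.
assert np.array_equal(decoded, ArrayImage(depth).data)
```

The "Unsupported encoding" warning branches disappear along with the decoding, since there is no encoding left to mismatch.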

@@ -896,19 +891,7 @@ def _publish_visualization(self, viz_image: np.ndarray):
"""Publish visualization image to LCM."""
try:
viz_rgb = cv2.cvtColor(viz_image, cv2.COLOR_BGR2RGB)
height, width = viz_rgb.shape[:2]
data = viz_rgb.tobytes()

msg = Image(
data_length=len(data),
height=height,
width=width,
encoding="rgb8",
is_bigendian=0,
step=width * 3,
data=data,
)

msg = Image.from_numpy(viz_rgb)
self.viz_image.publish(msg)
except Exception as e:
logger.error(f"Error publishing visualization: {e}")
@@ -935,7 +918,8 @@ def get_place_target_pose(self) -> Optional[Pose]:
place_pos[2] += z_offset + 0.1

place_center_pose = Pose(
Point(place_pos[0], place_pos[1], place_pos[2]), Quaternion(0.0, 0.0, 0.0, 1.0)
position=Vector3(place_pos[0], place_pos[1], place_pos[2]),
orientation=Quaternion(0.0, 0.0, 0.0, 1.0),
)

ee_pose = self.arm.get_ee_pose()
8 changes: 2 additions & 6 deletions dimos/manipulation/visual_servoing/pbvs.py
@@ -21,7 +21,7 @@
from typing import Optional, Tuple, List
from collections import deque
from scipy.spatial.transform import Rotation as R
from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point
from dimos.msgs.geometry_msgs import Pose, Vector3, Quaternion
from dimos_lcm.vision_msgs import Detection3D, Detection3DArray
from dimos.utils.logging_config import setup_logger
from dimos.manipulation.visual_servoing.utils import (
@@ -158,11 +158,7 @@ def update_tracking(self, new_detections: Optional[Detection3DArray] = None) ->
True if target was successfully tracked, False if lost (but target is kept)
"""
# Check if we have a current target
if (
not self.current_target
or not self.current_target.bbox
or not self.current_target.bbox.center
):
if not self.current_target:
return False

# Add new detections to history if provided