From 8cf46ae5ccf3b9bbaffebf46ec6aba94045d7c2c Mon Sep 17 00:00:00 2001
From: alexlin2 <alex.lin416@outlook.com>
Date: Sat, 27 Sep 2025 02:38:33 -0700
Subject: [PATCH 1/8] onboard unitree changes

---
 dimos/perception/detection2d/module2D.py | 2 +-
 dimos/perception/detection2d/moduleDB.py | 9 +++++----
 dimos/robot/unitree_webrtc/unitree_g1.py | 3 ++-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection2d/module2D.py
index 2ccba7466a..02507bd6da 100644
--- a/dimos/perception/detection2d/module2D.py
+++ b/dimos/perception/detection2d/module2D.py
@@ -138,7 +138,7 @@ def sharp_image_stream(self) -> Observable[Image]:
     def detection_stream_2d(self) -> Observable[ImageDetections2D]:
         return self.vlm_detections_subject
         # Regular detection stream from the detector
-        regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame))
+        regular_detections = self.image.observable().pipe(ops.map(self.process_image_frame))
         # Merge with VL model detections
         return backpressure(regular_detections.pipe(ops.merge(self.vlm_detections_subject)))
 
diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py
index 8cc096ffa8..88af73d9b7 100644
--- a/dimos/perception/detection2d/moduleDB.py
+++ b/dimos/perception/detection2d/moduleDB.py
@@ -119,7 +119,7 @@ def to_pose(self) -> PoseStamped:
         print("Global pose:", global_pose)
         global_pose.frame_id = self.best_detection.frame_id
         print("remap to", self.best_detection.frame_id)
-        return PoseStamped(position=self.center, orientation=Quaternion(), frame_id="world")
+        return PoseStamped(position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id)
 
 
 class ObjectDBModule(Detection3DModule, TableStr):
@@ -202,16 +202,17 @@ def agent_encode(self) -> List[Any]:
 
     def vlm_query(self, description: str) -> str:
         imageDetections2D = super().vlm_query(description)
-        time.sleep(1.5)
+        time.sleep(3)
 
         print("VLM query found", imageDetections2D, "detections")
         ret = []
         for obj in self.objects.values():
-            if obj.ts != imageDetections2D.ts:
-                continue
+            #if obj.ts != imageDetections2D.ts:
+            #    continue
             if obj.class_id != -100:
                 continue
             ret.append(obj)
+        ret.sort(key=lambda x: x.ts)
         return ret
 
     @skill()
diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py
index 3bbb5245ba..b0fc87a064 100644
--- a/dimos/robot/unitree_webrtc/unitree_g1.py
+++ b/dimos/robot/unitree_webrtc/unitree_g1.py
@@ -202,8 +202,9 @@ def _deploy_detection(self, goto):
         detection.annotations.transport = core.LCMTransport("/annotations", ImageAnnotations)
         detection.detections.transport = core.LCMTransport("/detections", Detection2DArray)
 
-        detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate)
 
+        detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate)
+        detection.target.transport = core.LCMTransport("/target", PoseStamped)
         detection.detected_pointcloud_0.transport = core.LCMTransport(
             "/detected/pointcloud/0", PointCloud2
         )

From 2b0a1791a94bfcc57f4f637e41cdfcd3dc823c31 Mon Sep 17 00:00:00 2001
From: alexlin2 <44330195+alexlin2@users.noreply.github.com>
Date: Sat, 27 Sep 2025 09:39:23 +0000
Subject: [PATCH 2/8] CI code cleanup

---
 dimos/perception/detection2d/moduleDB.py | 6 ++++--
 dimos/robot/unitree_webrtc/unitree_g1.py | 1 -
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py
index 88af73d9b7..e8d0b99af7 100644
--- a/dimos/perception/detection2d/moduleDB.py
+++ b/dimos/perception/detection2d/moduleDB.py
@@ -119,7 +119,9 @@ def to_pose(self) -> PoseStamped:
         print("Global pose:", global_pose)
         global_pose.frame_id = self.best_detection.frame_id
         print("remap to", self.best_detection.frame_id)
-        return PoseStamped(position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id)
+        return PoseStamped(
+            position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id
+        )
 
 
 class ObjectDBModule(Detection3DModule, TableStr):
@@ -207,7 +209,7 @@ def vlm_query(self, description: str) -> str:
         print("VLM query found", imageDetections2D, "detections")
         ret = []
         for obj in self.objects.values():
-            #if obj.ts != imageDetections2D.ts:
+            # if obj.ts != imageDetections2D.ts:
             #    continue
             if obj.class_id != -100:
                 continue
diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py
index b0fc87a064..742af9e0cb 100644
--- a/dimos/robot/unitree_webrtc/unitree_g1.py
+++ b/dimos/robot/unitree_webrtc/unitree_g1.py
@@ -202,7 +202,6 @@ def _deploy_detection(self, goto):
         detection.annotations.transport = core.LCMTransport("/annotations", ImageAnnotations)
         detection.detections.transport = core.LCMTransport("/detections", Detection2DArray)
 
-
         detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate)
         detection.target.transport = core.LCMTransport("/target", PoseStamped)
         detection.detected_pointcloud_0.transport = core.LCMTransport(

From ab10747c449336b948fcf55f1dd75a2d79a35477 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 27 Sep 2025 18:41:44 -0700
Subject: [PATCH 3/8] quick fixes

---
 dimos/perception/detection2d/module3D.py         | 7 ++++---
 dimos/perception/detection2d/test_moduleDB.py    | 2 --
 dimos/perception/detection2d/testing.py          | 6 ------
 dimos/perception/detection2d/type/detection3d.py | 6 +++---
 4 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/dimos/perception/detection2d/module3D.py b/dimos/perception/detection2d/module3D.py
index aa120db421..f7b0e6ec14 100644
--- a/dimos/perception/detection2d/module3D.py
+++ b/dimos/perception/detection2d/module3D.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
 
 from dimos_lcm.sensor_msgs import CameraInfo
 from reactivex import operators as ops
@@ -57,7 +56,7 @@ def process_frame(
         if not transform:
             return ImageDetections3D(detections.image, [])
 
-        print("PROCESS FRAME", detections)
+        print("3d projection", detections, pointcloud, transform)
         detection3d_list = []
         for detection in detections:
             detection3d = Detection3D.from_2d(
@@ -69,7 +68,9 @@ def process_frame(
             if detection3d is not None:
                 detection3d_list.append(detection3d)
 
-        return ImageDetections3D(detections.image, detection3d_list)
+        ret = ImageDetections3D(detections.image, detection3d_list)
+        print("3d projection finished", ret)
+        return ret
 
     @rpc
     def start(self):
diff --git a/dimos/perception/detection2d/test_moduleDB.py b/dimos/perception/detection2d/test_moduleDB.py
index 81446601e9..081d3343b4 100644
--- a/dimos/perception/detection2d/test_moduleDB.py
+++ b/dimos/perception/detection2d/test_moduleDB.py
@@ -21,8 +21,6 @@
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection2d import testing
-from dimos.perception.detection2d.module2D import Detection2DModule
-from dimos.perception.detection2d.module3D import Detection3DModule
 from dimos.perception.detection2d.moduleDB import ObjectDBModule
 from dimos.protocol.service import lcmservice as lcm
 from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation
diff --git a/dimos/perception/detection2d/testing.py b/dimos/perception/detection2d/testing.py
index 160fe40604..0fe5b4adcd 100644
--- a/dimos/perception/detection2d/testing.py
+++ b/dimos/perception/detection2d/testing.py
@@ -11,11 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import functools
-import hashlib
-import os
 import time
-from pathlib import Path
 from typing import Optional, TypedDict, Union
 
 from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations
@@ -32,7 +28,6 @@
 from dimos.perception.detection2d.module3D import Detection3DModule
 from dimos.perception.detection2d.moduleDB import ObjectDBModule
 from dimos.perception.detection2d.type import ImageDetections2D, ImageDetections3D
-from dimos.protocol.service import lcmservice as lcm
 from dimos.protocol.tf import TF
 from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
 from dimos.robot.unitree_webrtc.type.lidar import LidarMessage
@@ -83,7 +78,6 @@ def get_g1_moment(**kwargs):
     for timestamp, tf_frame in tf_replay.iterate_ts(seek=seek - tf_window, duration=tf_window):
         tf.publish(*TFMessage.lcm_decode(tf_frame).transforms)
 
-    print(tf)
     image_frame = Image.lcm_decode(
         TimedSensorReplay(f"{data_dir}/image#sensor_msgs.Image").find_closest_seek(seek)
     )
diff --git a/dimos/perception/detection2d/type/detection3d.py b/dimos/perception/detection2d/type/detection3d.py
index 261199e388..e03c9aeb75 100644
--- a/dimos/perception/detection2d/type/detection3d.py
+++ b/dimos/perception/detection2d/type/detection3d.py
@@ -41,7 +41,7 @@ def height_filter(height=0.1) -> Detection3DFilter:
     return lambda det, pc, ci, tf: pc.filter_by_height(height)
 
 
-def statistical(nb_neighbors=20, std_ratio=0.5) -> Detection3DFilter:
+def statistical(nb_neighbors=40, std_ratio=0.5) -> Detection3DFilter:
     def filter_func(
         det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform
     ) -> Optional[PointCloud2]:
@@ -74,7 +74,7 @@ def filter_func(
     return filter_func
 
 
-def radius_outlier(min_neighbors: int = 5, radius: float = 0.3) -> Detection3DFilter:
+def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> Detection3DFilter:
     """
     Remove isolated points: keep only points that have at least `min_neighbors`
     neighbors within `radius` meters (same units as your point cloud).
@@ -109,8 +109,8 @@ def from_2d(
         filters: list[Callable[[PointCloud2], PointCloud2]] = [
             # height_filter(0.1),
             raycast(),
-            statistical(),
             radius_outlier(),
+            statistical(),
         ],
     ) -> Optional["Detection3D"]:
         """Create a Detection3D from a 2D detection by projecting world pointcloud.

From 7b58306d8cc14596f4f03def62f37a0eb7660529 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 27 Sep 2025 19:11:46 -0700
Subject: [PATCH 4/8] moduledb hack

---
 dimos/perception/detection2d/moduleDB.py | 31 ++++++++++++++++--------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py
index e8d0b99af7..952cfb371b 100644
--- a/dimos/perception/detection2d/moduleDB.py
+++ b/dimos/perception/detection2d/moduleDB.py
@@ -205,17 +205,28 @@ def agent_encode(self) -> List[Any]:
     def vlm_query(self, description: str) -> str:
         imageDetections2D = super().vlm_query(description)
         time.sleep(3)
-
         print("VLM query found", imageDetections2D, "detections")
-        ret = []
-        for obj in self.objects.values():
-            # if obj.ts != imageDetections2D.ts:
-            #    continue
-            if obj.class_id != -100:
-                continue
-            ret.append(obj)
-        ret.sort(key=lambda x: x.ts)
-        return ret
+
+        transform = self.tf.get("camera_optical", "map")
+        detections3d = self.process_frame(imageDetections2D, self.lidar.get_next(), transform)
+        print("3D detections from VLM query:", detections3d)
+        if detections3d.detections:
+            target_pose = detections3d.detections[0].to_pose()
+            self.target.publish(target_pose)
+            self.goto(target_pose)
+
+        # ret = []
+        # for obj in self.objects.values():
+        # if obj.ts != imageDetections2D.ts:
+        #    continue
+        #    if obj.class_id != -100:
+        #        continue
+        #    if obj.name != imageDetections2D.detections[0].name:
+        #        print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name)
+        #        continue
+        #    ret.append(obj)
+        # ret.sort(key=lambda x: x.ts)
+        # return ret
 
     @skill()
     def navigate_to_object_in_view(self, description: str) -> str:

From b219f3bdf1ba8de532a843227ef20daf317faf45 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 27 Sep 2025 19:14:00 -0700
Subject: [PATCH 5/8] moduledb hack

---
 dimos/perception/detection2d/moduleDB.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py
index 952cfb371b..0d893eefa7 100644
--- a/dimos/perception/detection2d/moduleDB.py
+++ b/dimos/perception/detection2d/moduleDB.py
@@ -208,7 +208,7 @@ def vlm_query(self, description: str) -> str:
         print("VLM query found", imageDetections2D, "detections")
 
         transform = self.tf.get("camera_optical", "map")
-        detections3d = self.process_frame(imageDetections2D, self.lidar.get_next(), transform)
+        detections3d = self.process_frame(imageDetections2D, self.pointcloud.get_next(), transform)
         print("3D detections from VLM query:", detections3d)
         if detections3d.detections:
             target_pose = detections3d.detections[0].to_pose()

From 938137078ff4ea16a0cf104b26272c67b8ea1b73 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 27 Sep 2025 23:43:53 -0700
Subject: [PATCH 6/8] current g1

---
 dimos/perception/detection2d/module2D.py |  7 +-
 dimos/perception/detection2d/module3D.py | 20 +++---
 dimos/perception/detection2d/moduleDB.py | 85 +++++++++++++++++-------
 dimos/robot/unitree_webrtc/unitree_g1.py | 43 ++++++------
 4 files changed, 97 insertions(+), 58 deletions(-)

diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection2d/module2D.py
index 02507bd6da..82fb181be9 100644
--- a/dimos/perception/detection2d/module2D.py
+++ b/dimos/perception/detection2d/module2D.py
@@ -48,7 +48,7 @@ def process_image(self, image: np.ndarray) -> InconvinientDetectionFormat: ...
 @dataclass
 class Config:
     detector: Optional[Callable[[Any], Detector]] = Yolo2DDetector
-    max_freq: float = 3.0  # hz
+    max_freq: float = 0.5  # hz
     vlmodel: VlModel = QwenVlModel
 
 
@@ -122,6 +122,7 @@ def vlm_query(self, query: str) -> ImageDetections2D:
         return imageDetections
 
     def process_image_frame(self, image: Image) -> ImageDetections2D:
+        print("Processing image frame for detections", image)
         return ImageDetections2D.from_detector(
             image, self.detector.process_image(image.to_opencv())
         )
@@ -136,9 +137,9 @@ def sharp_image_stream(self) -> Observable[Image]:
 
     @functools.cache
     def detection_stream_2d(self) -> Observable[ImageDetections2D]:
-        return self.vlm_detections_subject
+        # self.vlm_detections_subject
         # Regular detection stream from the detector
-        regular_detections = self.image.observable().pipe(ops.map(self.process_image_frame))
+        regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame))
         # Merge with VL model detections
         return backpressure(regular_detections.pipe(ops.merge(self.vlm_detections_subject)))
 
diff --git a/dimos/perception/detection2d/module3D.py b/dimos/perception/detection2d/module3D.py
index f7b0e6ec14..15e671e171 100644
--- a/dimos/perception/detection2d/module3D.py
+++ b/dimos/perception/detection2d/module3D.py
@@ -81,16 +81,16 @@ def detection2d_to_3d(args):
             transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0)
             return self.process_frame(detections, pc, transform)
 
-        # self.detection_stream_3d = align_timestamped(
-        #     backpressure(self.detection_stream_2d()),
-        #     self.pointcloud.observable(),
-        #     match_tolerance=0.25,
-        #     buffer_size=20.0,
-        # ).pipe(ops.map(detection2d_to_3d))
-
-        self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
-            ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
-        )
+        self.detection_stream_3d = align_timestamped(
+            backpressure(self.detection_stream_2d()),
+            self.pointcloud.observable(),
+            match_tolerance=0.25,
+            buffer_size=20.0,
+        ).pipe(ops.map(detection2d_to_3d))
+
+        # self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
+        #    ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
+        # )
 
         self.detection_stream_3d.subscribe(self._publish_detections)
 
diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py
index 0d893eefa7..3ff3e72797 100644
--- a/dimos/perception/detection2d/moduleDB.py
+++ b/dimos/perception/detection2d/moduleDB.py
@@ -149,10 +149,13 @@ class ObjectDBModule(Detection3DModule, TableStr):
 
     target: Out[PoseStamped] = None  # type: ignore
 
+    remembered_locations: Dict[str, PoseStamped]
+
     def __init__(self, goto: Callable[[PoseStamped], Any], *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.goto = goto
         self.objects = {}
+        self.remembered_locations = {}
 
     def closest_object(self, detection: Detection3D) -> Optional[Object3D]:
         # Filter objects to only those with matching names
@@ -204,36 +207,67 @@ def agent_encode(self) -> List[Any]:
 
     def vlm_query(self, description: str) -> str:
         imageDetections2D = super().vlm_query(description)
-        time.sleep(3)
         print("VLM query found", imageDetections2D, "detections")
+        time.sleep(3)
 
-        transform = self.tf.get("camera_optical", "map")
-        detections3d = self.process_frame(imageDetections2D, self.pointcloud.get_next(), transform)
-        print("3D detections from VLM query:", detections3d)
-        if detections3d.detections:
-            target_pose = detections3d.detections[0].to_pose()
-            self.target.publish(target_pose)
-            self.goto(target_pose)
-
-        # ret = []
-        # for obj in self.objects.values():
-        # if obj.ts != imageDetections2D.ts:
-        #    continue
-        #    if obj.class_id != -100:
-        #        continue
-        #    if obj.name != imageDetections2D.detections[0].name:
-        #        print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name)
-        #        continue
-        #    ret.append(obj)
-        # ret.sort(key=lambda x: x.ts)
-        # return ret
+        ret = []
+        for obj in self.objects.values():
+            if obj.ts != imageDetections2D.ts:
+                print(
+                    "Skipping",
+                    obj.track_id,
+                    "ts",
+                    obj.ts,
+                    "!=",
+                    imageDetections2D.ts,
+                )
+                continue
+            if obj.class_id != -100:
+                continue
+            if obj.name != imageDetections2D.detections[0].name:
+                print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name)
+                continue
+            ret.append(obj)
+        ret.sort(key=lambda x: x.ts)
+
+        return ret[0]
+
+    @skill()
+    def remember_location(self, name: str) -> str:
+        """Remember the current location with a name."""
+        pose = self.tf.lookup("map", "base_link").to_pose()
+        pose.frame_id = "map"
+        self.remembered_locations[name] = pose
 
     @skill()
-    def navigate_to_object_in_view(self, description: str) -> str:
+    def goto_remembered_location(self, name: str) -> str:
+        """Go to a remembered location by name."""
+        pose = self.remembered_locations.get(name, None)
+        if not pose:
+            return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}"
+        self.target.publish(pose)
+        time.sleep(0.1)
+        self.target.publish(pose)
+        return f"Navigating to remembered location {name}"
+
+    @skill()
+    def list_remembered_locations(self) -> List[str]:
+        """List all remembered locations."""
+        return str(list(self.remembered_locations.keys()))
+
+    def nav_to(self, target_pose) -> str:
+        target_pose.orientation = Quaternion(0.0, 0.0, 0.0, 0.0)
+        self.target.publish(target_pose)
+        time.sleep(0.1)
+        self.target.publish(target_pose)
+        self.goto(target_pose)
+
+    #    @skill()
+    def navigate_to_object_in_view(self, query: str) -> str:
         """Navigate to an object by description using vision-language model to find it."""
-        objects = self.vlm_query(description)
+        objects = self.vlm_query(query)
         if not objects:
-            return f"No objects found matching '{description}'"
+            return f"No objects found matching '{query}'"
         target_obj = objects[0]
         return self.navigate_to_object_by_id(target_obj.track_id)
 
@@ -250,10 +284,11 @@ def navigate_to_object_by_id(self, object_id: str):
         if not target_obj:
             return f"Object {object_id} not found\nHere are the known objects:\n{str(self.agent_encode())}"
         target_pose = target_obj.to_pose()
+        target_pose.frame_id = "map"
         self.target.publish(target_pose)
         time.sleep(0.1)
         self.target.publish(target_pose)
-        self.goto(target_pose)
+        self.nav_to(target_pose)
         return f"Navigating to f{object_id} f{target_obj.name}"
 
     def lookup(self, label: str) -> List[Detection3D]:
diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py
index 742af9e0cb..e7256a8524 100644
--- a/dimos/robot/unitree_webrtc/unitree_g1.py
+++ b/dimos/robot/unitree_webrtc/unitree_g1.py
@@ -22,17 +22,14 @@
 import os
 import time
 from typing import Optional
-from dimos import core
-from dimos.core import In, Module, Out, rpc
-from geometry_msgs.msg import PoseStamped as ROSPoseStamped
 
-from dimos.msgs.sensor_msgs import Joy
-from dimos.msgs.std_msgs.Bool import Bool
-from dimos.robot.unitree_webrtc.rosnav import NavigationModule
+from geometry_msgs.msg import PoseStamped as ROSPoseStamped
 from geometry_msgs.msg import TwistStamped as ROSTwistStamped
-from lcm_msgs.foxglove_msgs import SceneUpdate
+
+from dimos_lcm.foxglove_msgs import SceneUpdate
 from nav_msgs.msg import Odometry as ROSOdometry
-from sensor_msgs.msg import PointCloud2 as ROSPointCloud2, Joy as ROSJoy
+from sensor_msgs.msg import Joy as ROSJoy
+from sensor_msgs.msg import PointCloud2 as ROSPointCloud2
 from tf2_msgs.msg import TFMessage as ROSTFMessage
 
 from dimos import core
@@ -50,7 +47,8 @@
     Vector3,
 )
 from dimos.msgs.nav_msgs.Odometry import Odometry
-from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
+from dimos.msgs.sensor_msgs import CameraInfo, Image, Joy, PointCloud2
+from dimos.msgs.std_msgs.Bool import Bool
 from dimos.msgs.tf2_msgs.TFMessage import TFMessage
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection2d import Detection3DModule
@@ -251,9 +249,9 @@ def start(self):
 
         self.lcm.start()
 
-        from dimos.agents2.spec import Model, Provider
         from dimos.agents2 import Agent, Output, Reducer, Stream, skill
         from dimos.agents2.cli.human import HumanInput
+        from dimos.agents2.spec import Model, Provider
 
         agent = Agent(
             system_prompt="You are a helpful assistant for controlling a humanoid robot. ",
@@ -289,7 +287,7 @@ def _deploy_camera(self):
             CameraModule,
             transform=Transform(
                 translation=Vector3(0.05, 0.0, 0.0),
-                rotation=Quaternion(0.0, 0.0, 0.0, 1.0),
+                rotation=Quaternion.from_euler(Vector3(0.0, 0.2, 0.0)),
                 frame_id="sensor",
                 child_frame_id="camera_link",
             ),
@@ -385,7 +383,7 @@ def _start_modules(self):
         #    self.joystick.start()
 
         self.camera.start()
-        self.detection.start()
+        # self.detection.start()
 
         # Initialize skills after connection is established
         if self.skill_library is not None:
@@ -461,14 +459,19 @@ def main():
     )
     robot.start()
 
-    # time.sleep(7)
-    # print("Starting navigation...")
-    # print(
-    #    robot.nav.go_to(
-    #        PoseStamped(ts=time.time(), frame_id="map", position=Vector3(0.0, 0.0, 0.03)),
-    #        timeout=10,
-    #    ),
-    # )
+    time.sleep(7)
+    print("Starting navigation...")
+    print(
+        robot.nav.go_to(
+            PoseStamped(
+                ts=time.time(),
+                frame_id="map",
+                position=Vector3(0.0, 0.0, 0.0),
+                orientation=Quaternion(0.0, 0.0, 0.0, 0.0),
+            ),
+            timeout=10,
+        ),
+    )
     try:
         if args.joystick:
             print("\n" + "=" * 50)

From 7aaeed703c935f8d476a491d0054ebdddce11eba Mon Sep 17 00:00:00 2001
From: Stash Pomichter <pomichterstash@gmail.com>
Date: Sun, 28 Sep 2025 03:13:41 -0700
Subject: [PATCH 7/8] Re enabled detections, removed go to origin on startup

---
 dimos/robot/unitree_webrtc/unitree_g1.py | 28 ++++++++++++------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py
index a80aee777a..916943223c 100644
--- a/dimos/robot/unitree_webrtc/unitree_g1.py
+++ b/dimos/robot/unitree_webrtc/unitree_g1.py
@@ -383,7 +383,7 @@ def _start_modules(self):
         #    self.joystick.start()
 
         self.camera.start()
-        # self.detection.start()
+        self.detection.start()
 
         # Initialize skills after connection is established
         if self.skill_library is not None:
@@ -459,19 +459,19 @@ def main():
     )
     robot.start()
 
-    time.sleep(7)
-    print("Starting navigation...")
-    print(
-        robot.nav.go_to(
-            PoseStamped(
-                ts=time.time(),
-                frame_id="map",
-                position=Vector3(0.0, 0.0, 0.03),
-                orientation=Quaternion(0, 0, 0, 0),
-            ),
-            timeout=10,
-        ),
-    )
+    # time.sleep(7)
+    # print("Starting navigation...")
+    # print(
+    #     robot.nav.go_to(
+    #         PoseStamped(
+    #             ts=time.time(),
+    #             frame_id="map",
+    #             position=Vector3(0.0, 0.0, 0.03),
+    #             orientation=Quaternion(0, 0, 0, 0),
+    #         ),
+    #         timeout=10,
+    #     ),
+    # )
     try:
         if args.joystick:
             print("\n" + "=" * 50)

From 291657182fa79a7d2bc966fa9a94f96d52379908 Mon Sep 17 00:00:00 2001
From: Stash Pomichter <pomichterstash@gmail.com>
Date: Sun, 28 Sep 2025 03:39:21 -0700
Subject: [PATCH 8/8] Fully working save location and navigate to saved
 location

---
 dimos/perception/detection2d/moduleDB.py | 27 ++++++++++++++----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py
index 3ff3e72797..052b65d6c7 100644
--- a/dimos/perception/detection2d/moduleDB.py
+++ b/dimos/perception/detection2d/moduleDB.py
@@ -210,6 +210,9 @@ def vlm_query(self, description: str) -> str:
         print("VLM query found", imageDetections2D, "detections")
         time.sleep(3)
 
+        if not imageDetections2D.detections:
+            return None
+
         ret = []
         for obj in self.objects.values():
             if obj.ts != imageDetections2D.ts:
@@ -230,14 +233,19 @@ def vlm_query(self, description: str) -> str:
             ret.append(obj)
         ret.sort(key=lambda x: x.ts)
 
-        return ret[0]
+        return ret[0] if ret else None
 
     @skill()
     def remember_location(self, name: str) -> str:
         """Remember the current location with a name."""
-        pose = self.tf.lookup("map", "base_link").to_pose()
+        transform = self.tf.get("map", "sensor", time_point=time.time(), time_tolerance=1.0)
+        if not transform:
+            return f"Could not get current location transform from map to sensor"
+
+        pose = transform.to_pose()
         pose.frame_id = "map"
         self.remembered_locations[name] = pose
+        return f"Location '{name}' saved at position: {pose.position}"
 
     @skill()
     def goto_remembered_location(self, name: str) -> str:
@@ -245,10 +253,8 @@ def goto_remembered_location(self, name: str) -> str:
         pose = self.remembered_locations.get(name, None)
         if not pose:
             return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}"
-        self.target.publish(pose)
-        time.sleep(0.1)
-        self.target.publish(pose)
-        return f"Navigating to remembered location {name}"
+        self.goto(pose)
+        return f"Navigating to remembered location {name} and pose {pose}"
 
     @skill()
     def list_remembered_locations(self) -> List[str]:
@@ -262,13 +268,12 @@ def nav_to(self, target_pose) -> str:
         self.target.publish(target_pose)
         self.goto(target_pose)
 
-    #    @skill()
+    @skill()
     def navigate_to_object_in_view(self, query: str) -> str:
-        """Navigate to an object by description using vision-language model to find it."""
-        objects = self.vlm_query(query)
-        if not objects:
+        """Navigate to an object in your current image view via natural language query using vision-language model to find it."""
+        target_obj = self.vlm_query(query)
+        if not target_obj:
             return f"No objects found matching '{query}'"
-        target_obj = objects[0]
         return self.navigate_to_object_by_id(target_obj.track_id)
 
     @skill(reducer=Reducer.all)