Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dimos/perception/detection2d/module2D.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def process_image(self, image: np.ndarray) -> InconvinientDetectionFormat: ...
@dataclass
class Config:
detector: Optional[Callable[[Any], Detector]] = Yolo2DDetector
max_freq: float = 3.0 # hz
max_freq: float = 0.5 # hz
vlmodel: VlModel = QwenVlModel


Expand Down Expand Up @@ -122,6 +122,7 @@ def vlm_query(self, query: str) -> ImageDetections2D:
return imageDetections

def process_image_frame(self, image: Image) -> ImageDetections2D:
    """Run the configured 2D detector on a single image frame.

    Converts the frame to an OpenCV image, feeds it through
    ``self.detector.process_image``, and wraps the raw detector output
    in an :class:`ImageDetections2D` tied back to the source frame.

    Fix: removed a stray debug ``print`` that dumped the entire image
    object on every frame of the detection stream.
    """
    return ImageDetections2D.from_detector(
        image, self.detector.process_image(image.to_opencv())
    )
Expand All @@ -136,7 +137,7 @@ def sharp_image_stream(self) -> Observable[Image]:

@functools.cache
def detection_stream_2d(self) -> Observable[ImageDetections2D]:
return self.vlm_detections_subject
# self.vlm_detections_subject
# Regular detection stream from the detector
regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame))
# Merge with VL model detections
Expand Down
25 changes: 13 additions & 12 deletions dimos/perception/detection2d/module3D.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from dimos_lcm.sensor_msgs import CameraInfo
from reactivex import operators as ops
Expand Down Expand Up @@ -57,7 +56,7 @@ def process_frame(
if not transform:
return ImageDetections3D(detections.image, [])

print("PROCESS FRAME", detections)
print("3d projection", detections, pointcloud, transform)
detection3d_list = []
for detection in detections:
detection3d = Detection3D.from_2d(
Expand All @@ -69,7 +68,9 @@ def process_frame(
if detection3d is not None:
detection3d_list.append(detection3d)

return ImageDetections3D(detections.image, detection3d_list)
ret = ImageDetections3D(detections.image, detection3d_list)
print("3d projection finished", ret)
return ret

@rpc
def start(self):
Expand All @@ -80,16 +81,16 @@ def detection2d_to_3d(args):
transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0)
return self.process_frame(detections, pc, transform)

# self.detection_stream_3d = align_timestamped(
# backpressure(self.detection_stream_2d()),
# self.pointcloud.observable(),
# match_tolerance=0.25,
# buffer_size=20.0,
# ).pipe(ops.map(detection2d_to_3d))
self.detection_stream_3d = align_timestamped(
backpressure(self.detection_stream_2d()),
self.pointcloud.observable(),
match_tolerance=0.25,
buffer_size=20.0,
).pipe(ops.map(detection2d_to_3d))

self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
)
# self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
# ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
# )

self.detection_stream_3d.subscribe(self._publish_detections)

Expand Down
76 changes: 65 additions & 11 deletions dimos/perception/detection2d/moduleDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ def to_pose(self) -> PoseStamped:
print("Global pose:", global_pose)
global_pose.frame_id = self.best_detection.frame_id
print("remap to", self.best_detection.frame_id)
return PoseStamped(position=self.center, orientation=Quaternion(), frame_id="world")
return PoseStamped(
position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id
)


class ObjectDBModule(Detection3DModule, TableStr):
Expand Down Expand Up @@ -147,10 +149,13 @@ class ObjectDBModule(Detection3DModule, TableStr):

target: Out[PoseStamped] = None # type: ignore

remembered_locations: Dict[str, PoseStamped]

def __init__(self, goto: Callable[[PoseStamped], Any], *args, **kwargs):
    """Initialize the object DB module.

    Args:
        goto: Callback invoked with a target ``PoseStamped`` to drive
            the robot toward that pose.
    """
    super().__init__(*args, **kwargs)
    self.goto = goto
    self.objects = {}  # tracked objects, keyed by track id
    self.remembered_locations = {}  # name -> PoseStamped saved via remember_location

def closest_object(self, detection: Detection3D) -> Optional[Object3D]:
# Filter objects to only those with matching names
Expand Down Expand Up @@ -202,25 +207,73 @@ def agent_encode(self) -> List[Any]:

def vlm_query(self, description: str) -> str:
imageDetections2D = super().vlm_query(description)
time.sleep(1.5)

print("VLM query found", imageDetections2D, "detections")
time.sleep(3)

if not imageDetections2D.detections:
return None

ret = []
for obj in self.objects.values():
if obj.ts != imageDetections2D.ts:
print(
"Skipping",
obj.track_id,
"ts",
obj.ts,
"!=",
imageDetections2D.ts,
)
continue
if obj.class_id != -100:
continue
if obj.name != imageDetections2D.detections[0].name:
print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name)
continue
ret.append(obj)
return ret
ret.sort(key=lambda x: x.ts)

return ret[0] if ret else None

@skill()
def remember_location(self, name: str) -> str:
    """Remember the robot's current location under a name.

    Looks up the current ``map -> sensor`` transform (within a 1 s
    tolerance), converts it to a pose in the ``map`` frame, and stores
    it in ``self.remembered_locations`` for later navigation.

    Fix: dropped the stray ``f`` prefix on the placeholder-less error
    string (lint F541) — the runtime text is unchanged.
    """
    transform = self.tf.get("map", "sensor", time_point=time.time(), time_tolerance=1.0)
    if not transform:
        # No recent transform available; report failure instead of raising.
        return "Could not get current location transform from map to sensor"

    pose = transform.to_pose()
    pose.frame_id = "map"  # pose is expressed in the map frame
    self.remembered_locations[name] = pose
    return f"Location '{name}' saved at position: {pose.position}"

@skill()
def navigate_to_object_in_view(self, description: str) -> str:
"""Navigate to an object by description using vision-language model to find it."""
objects = self.vlm_query(description)
if not objects:
return f"No objects found matching '{description}'"
target_obj = objects[0]
def goto_remembered_location(self, name: str) -> str:
    """Navigate to a previously remembered location by its name."""
    pose = self.remembered_locations.get(name)
    if pose:
        self.goto(pose)
        return f"Navigating to remembered location {name} and pose {pose}"
    return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}"

@skill()
def list_remembered_locations(self) -> str:
    """List the names of all remembered locations.

    Returns the names rendered as a string, e.g. ``"['kitchen', 'dock']"``.

    Fix: the return annotation said ``List[str]`` but the body returns
    ``str(...)``; the annotation now matches the actual behavior
    (skills hand text back to the agent, so the string form is kept).
    """
    return str(list(self.remembered_locations.keys()))

def nav_to(self, target_pose) -> str:
    # Publish the navigation target and hand off to the goto callback.
    # NOTE(review): Quaternion(0, 0, 0, 0) is the zero quaternion, not a
    # valid rotation (identity would have w=1) — presumably the nav stack
    # treats an all-zero orientation as "ignore heading"; confirm.
    target_pose.orientation = Quaternion(0.0, 0.0, 0.0, 0.0)
    self.target.publish(target_pose)
    # Published twice with a short pause in between — looks like a
    # workaround for subscribers missing the first message; TODO confirm
    # whether the double publish is still needed.
    time.sleep(0.1)
    self.target.publish(target_pose)
    self.goto(target_pose)
    # NOTE(review): annotated -> str but falls off the end and returns
    # None implicitly; callers (navigate_to_object_by_id) ignore the
    # return value, so the annotation appears stale.

@skill()
def navigate_to_object_in_view(self, query: str) -> str:
    """Find an object in the current camera view matching a natural
    language query (via the vision-language model) and navigate to it."""
    match = self.vlm_query(query)
    if match:
        return self.navigate_to_object_by_id(match.track_id)
    return f"No objects found matching '{query}'"

@skill(reducer=Reducer.all)
Expand All @@ -236,10 +289,11 @@ def navigate_to_object_by_id(self, object_id: str):
if not target_obj:
return f"Object {object_id} not found\nHere are the known objects:\n{str(self.agent_encode())}"
target_pose = target_obj.to_pose()
target_pose.frame_id = "map"
self.target.publish(target_pose)
time.sleep(0.1)
self.target.publish(target_pose)
self.goto(target_pose)
self.nav_to(target_pose)
return f"Navigating to f{object_id} f{target_obj.name}"

def lookup(self, label: str) -> List[Detection3D]:
Expand Down
2 changes: 0 additions & 2 deletions dimos/perception/detection2d/test_moduleDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from dimos.msgs.sensor_msgs import Image, PointCloud2
from dimos.msgs.vision_msgs import Detection2DArray
from dimos.perception.detection2d import testing
from dimos.perception.detection2d.module2D import Detection2DModule
from dimos.perception.detection2d.module3D import Detection3DModule
from dimos.perception.detection2d.moduleDB import ObjectDBModule
from dimos.protocol.service import lcmservice as lcm
from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation
Expand Down
6 changes: 0 additions & 6 deletions dimos/perception/detection2d/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import hashlib
import os
import time
from pathlib import Path
from typing import Optional, TypedDict, Union

from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations
Expand All @@ -32,7 +28,6 @@
from dimos.perception.detection2d.module3D import Detection3DModule
from dimos.perception.detection2d.moduleDB import ObjectDBModule
from dimos.perception.detection2d.type import ImageDetections2D, ImageDetections3D
from dimos.protocol.service import lcmservice as lcm
from dimos.protocol.tf import TF
from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
from dimos.robot.unitree_webrtc.type.lidar import LidarMessage
Expand Down Expand Up @@ -83,7 +78,6 @@ def get_g1_moment(**kwargs):
for timestamp, tf_frame in tf_replay.iterate_ts(seek=seek - tf_window, duration=tf_window):
tf.publish(*TFMessage.lcm_decode(tf_frame).transforms)

print(tf)
image_frame = Image.lcm_decode(
TimedSensorReplay(f"{data_dir}/image#sensor_msgs.Image").find_closest_seek(seek)
)
Expand Down
6 changes: 3 additions & 3 deletions dimos/perception/detection2d/type/detection3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def height_filter(height=0.1) -> Detection3DFilter:
return lambda det, pc, ci, tf: pc.filter_by_height(height)


def statistical(nb_neighbors=20, std_ratio=0.5) -> Detection3DFilter:
def statistical(nb_neighbors=40, std_ratio=0.5) -> Detection3DFilter:
def filter_func(
det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform
) -> Optional[PointCloud2]:
Expand Down Expand Up @@ -74,7 +74,7 @@ def filter_func(
return filter_func


def radius_outlier(min_neighbors: int = 5, radius: float = 0.3) -> Detection3DFilter:
def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> Detection3DFilter:
"""
Remove isolated points: keep only points that have at least `min_neighbors`
neighbors within `radius` meters (same units as your point cloud).
Expand Down Expand Up @@ -109,8 +109,8 @@ def from_2d(
filters: list[Callable[[PointCloud2], PointCloud2]] = [
# height_filter(0.1),
raycast(),
statistical(),
radius_outlier(),
statistical(),
],
) -> Optional["Detection3D"]:
"""Create a Detection3D from a 2D detection by projecting world pointcloud.
Expand Down
48 changes: 23 additions & 25 deletions dimos/robot/unitree_webrtc/unitree_g1.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,14 @@
import os
import time
from typing import Optional
from dimos import core
from dimos.core import In, Module, Out, rpc
from geometry_msgs.msg import PoseStamped as ROSPoseStamped

from dimos.msgs.sensor_msgs import Joy
from dimos.msgs.std_msgs.Bool import Bool
from dimos.robot.unitree_webrtc.rosnav import NavigationModule
from geometry_msgs.msg import PoseStamped as ROSPoseStamped
from geometry_msgs.msg import TwistStamped as ROSTwistStamped
from lcm_msgs.foxglove_msgs import SceneUpdate

from dimos_lcm.foxglove_msgs import SceneUpdate
from nav_msgs.msg import Odometry as ROSOdometry
from sensor_msgs.msg import PointCloud2 as ROSPointCloud2, Joy as ROSJoy
from sensor_msgs.msg import Joy as ROSJoy
from sensor_msgs.msg import PointCloud2 as ROSPointCloud2
from tf2_msgs.msg import TFMessage as ROSTFMessage

from dimos import core
Expand All @@ -50,7 +47,8 @@
Vector3,
)
from dimos.msgs.nav_msgs.Odometry import Odometry
from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
from dimos.msgs.sensor_msgs import CameraInfo, Image, Joy, PointCloud2
from dimos.msgs.std_msgs.Bool import Bool
from dimos.msgs.tf2_msgs.TFMessage import TFMessage
from dimos.msgs.vision_msgs import Detection2DArray
from dimos.perception.detection2d import Detection3DModule
Expand Down Expand Up @@ -203,7 +201,7 @@ def _deploy_detection(self, goto):
detection.detections.transport = core.LCMTransport("/detections", Detection2DArray)

detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate)

detection.target.transport = core.LCMTransport("/target", PoseStamped)
detection.detected_pointcloud_0.transport = core.LCMTransport(
"/detected/pointcloud/0", PointCloud2
)
Expand Down Expand Up @@ -251,9 +249,9 @@ def start(self):

self.lcm.start()

from dimos.agents2.spec import Model, Provider
from dimos.agents2 import Agent, Output, Reducer, Stream, skill
from dimos.agents2.cli.human import HumanInput
from dimos.agents2.spec import Model, Provider

agent = Agent(
system_prompt="You are a helpful assistant for controlling a humanoid robot. ",
Expand Down Expand Up @@ -289,7 +287,7 @@ def _deploy_camera(self):
CameraModule,
transform=Transform(
translation=Vector3(0.05, 0.0, 0.0),
rotation=Quaternion(0.0, 0.0, 0.0, 1.0),
rotation=Quaternion.from_euler(Vector3(0.0, 0.2, 0.0)),
frame_id="sensor",
child_frame_id="camera_link",
),
Expand Down Expand Up @@ -461,19 +459,19 @@ def main():
)
robot.start()

time.sleep(7)
print("Starting navigation...")
print(
robot.nav.go_to(
PoseStamped(
ts=time.time(),
frame_id="map",
position=Vector3(0.0, 0.0, 0.03),
orientation=Quaternion(0, 0, 0, 0),
),
timeout=10,
),
)
# time.sleep(7)
# print("Starting navigation...")
# print(
# robot.nav.go_to(
# PoseStamped(
# ts=time.time(),
# frame_id="map",
# position=Vector3(0.0, 0.0, 0.03),
# orientation=Quaternion(0, 0, 0, 0),
# ),
# timeout=10,
# ),
# )
try:
if args.joystick:
print("\n" + "=" * 50)
Expand Down
Loading