From 8cf46ae5ccf3b9bbaffebf46ec6aba94045d7c2c Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Sat, 27 Sep 2025 02:38:33 -0700 Subject: [PATCH 1/8] onboard unitree changes --- dimos/perception/detection2d/module2D.py | 2 +- dimos/perception/detection2d/moduleDB.py | 9 +++++---- dimos/robot/unitree_webrtc/unitree_g1.py | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection2d/module2D.py index 2ccba7466a..02507bd6da 100644 --- a/dimos/perception/detection2d/module2D.py +++ b/dimos/perception/detection2d/module2D.py @@ -138,7 +138,7 @@ def sharp_image_stream(self) -> Observable[Image]: def detection_stream_2d(self) -> Observable[ImageDetections2D]: return self.vlm_detections_subject # Regular detection stream from the detector - regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame)) + regular_detections = self.image.observable().pipe(ops.map(self.process_image_frame)) # Merge with VL model detections return backpressure(regular_detections.pipe(ops.merge(self.vlm_detections_subject))) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index 8cc096ffa8..88af73d9b7 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -119,7 +119,7 @@ def to_pose(self) -> PoseStamped: print("Global pose:", global_pose) global_pose.frame_id = self.best_detection.frame_id print("remap to", self.best_detection.frame_id) - return PoseStamped(position=self.center, orientation=Quaternion(), frame_id="world") + return PoseStamped(position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id) class ObjectDBModule(Detection3DModule, TableStr): @@ -202,16 +202,17 @@ def agent_encode(self) -> List[Any]: def vlm_query(self, description: str) -> str: imageDetections2D = super().vlm_query(description) - time.sleep(1.5) + time.sleep(3) print("VLM query found", imageDetections2D, "detections") ret = [] for obj in self.objects.values(): - if obj.ts != imageDetections2D.ts: - continue + #if obj.ts != imageDetections2D.ts: + # continue if obj.class_id != -100: continue ret.append(obj) + ret.sort(key=lambda x: x.ts) return ret @skill() diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py index 3bbb5245ba..b0fc87a064 100644 --- a/dimos/robot/unitree_webrtc/unitree_g1.py +++ b/dimos/robot/unitree_webrtc/unitree_g1.py @@ -202,8 +202,9 @@ def _deploy_detection(self, goto): detection.annotations.transport = core.LCMTransport("/annotations", ImageAnnotations) detection.detections.transport = core.LCMTransport("/detections", Detection2DArray) - detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate) + detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate) + detection.target.transport = core.LCMTransport("/target", PoseStamped) detection.detected_pointcloud_0.transport = core.LCMTransport( "/detected/pointcloud/0", PointCloud2 ) From 2b0a1791a94bfcc57f4f637e41cdfcd3dc823c31 Mon Sep 17 00:00:00 2001 From: alexlin2 <44330195+alexlin2@users.noreply.github.com> Date: Sat, 27 Sep 2025 09:39:23 +0000 Subject: [PATCH 2/8] CI code cleanup --- dimos/perception/detection2d/moduleDB.py | 6 ++++-- dimos/robot/unitree_webrtc/unitree_g1.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index 88af73d9b7..e8d0b99af7 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -119,7 +119,9 @@ def to_pose(self) -> PoseStamped: print("Global pose:", global_pose) global_pose.frame_id = self.best_detection.frame_id print("remap to", self.best_detection.frame_id) - return PoseStamped(position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id) + return PoseStamped( + position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id + ) class ObjectDBModule(Detection3DModule, TableStr): @@ -207,7 +209,7 @@ def vlm_query(self, description: str) -> str: print("VLM query found", imageDetections2D, "detections") ret = [] for obj in self.objects.values(): - #if obj.ts != imageDetections2D.ts: + # if obj.ts != imageDetections2D.ts: # continue if obj.class_id != -100: continue diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py index b0fc87a064..742af9e0cb 100644 --- a/dimos/robot/unitree_webrtc/unitree_g1.py +++ b/dimos/robot/unitree_webrtc/unitree_g1.py @@ -202,7 +202,6 @@ def _deploy_detection(self, goto): detection.annotations.transport = core.LCMTransport("/annotations", ImageAnnotations) detection.detections.transport = core.LCMTransport("/detections", Detection2DArray) - detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate) detection.target.transport = core.LCMTransport("/target", PoseStamped) detection.detected_pointcloud_0.transport = core.LCMTransport( From ab10747c449336b948fcf55f1dd75a2d79a35477 Mon Sep 17 00:00:00 2001 From: lesh Date: Sat, 27 Sep 2025 18:41:44 -0700 Subject: [PATCH 3/8] quick fixes --- dimos/perception/detection2d/module3D.py | 7 ++++--- dimos/perception/detection2d/test_moduleDB.py | 2 -- dimos/perception/detection2d/testing.py | 6 ------ dimos/perception/detection2d/type/detection3d.py | 6 +++--- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/dimos/perception/detection2d/module3D.py b/dimos/perception/detection2d/module3D.py index aa120db421..f7b0e6ec14 100644 --- a/dimos/perception/detection2d/module3D.py +++ b/dimos/perception/detection2d/module3D.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional from dimos_lcm.sensor_msgs import CameraInfo from reactivex import operators as ops @@ -57,7 +56,7 @@ def process_frame( if not transform: return ImageDetections3D(detections.image, []) - print("PROCESS FRAME", detections) + print("3d projection", detections, pointcloud, transform) detection3d_list = [] for detection in detections: detection3d = Detection3D.from_2d( @@ -69,7 +68,9 @@ def process_frame( if detection3d is not None: detection3d_list.append(detection3d) - return ImageDetections3D(detections.image, detection3d_list) + ret = ImageDetections3D(detections.image, detection3d_list) + print("3d projection finished", ret) + return ret @rpc def start(self): diff --git a/dimos/perception/detection2d/test_moduleDB.py b/dimos/perception/detection2d/test_moduleDB.py index 81446601e9..081d3343b4 100644 --- a/dimos/perception/detection2d/test_moduleDB.py +++ b/dimos/perception/detection2d/test_moduleDB.py @@ -21,8 +21,6 @@ from dimos.msgs.sensor_msgs import Image, PointCloud2 from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection2d import testing -from dimos.perception.detection2d.module2D import Detection2DModule -from dimos.perception.detection2d.module3D import Detection3DModule from dimos.perception.detection2d.moduleDB import ObjectDBModule from dimos.protocol.service import lcmservice as lcm from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation diff --git a/dimos/perception/detection2d/testing.py b/dimos/perception/detection2d/testing.py index 160fe40604..0fe5b4adcd 100644 --- a/dimos/perception/detection2d/testing.py +++ b/dimos/perception/detection2d/testing.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import functools -import hashlib -import os import time -from pathlib import Path from typing import Optional, TypedDict, Union from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations @@ -32,7 +28,6 @@ from dimos.perception.detection2d.module3D import Detection3DModule from dimos.perception.detection2d.moduleDB import ObjectDBModule from dimos.perception.detection2d.type import ImageDetections2D, ImageDetections3D -from dimos.protocol.service import lcmservice as lcm from dimos.protocol.tf import TF from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule from dimos.robot.unitree_webrtc.type.lidar import LidarMessage @@ -83,7 +78,6 @@ def get_g1_moment(**kwargs): for timestamp, tf_frame in tf_replay.iterate_ts(seek=seek - tf_window, duration=tf_window): tf.publish(*TFMessage.lcm_decode(tf_frame).transforms) - print(tf) image_frame = Image.lcm_decode( TimedSensorReplay(f"{data_dir}/image#sensor_msgs.Image").find_closest_seek(seek) ) diff --git a/dimos/perception/detection2d/type/detection3d.py b/dimos/perception/detection2d/type/detection3d.py index 261199e388..e03c9aeb75 100644 --- a/dimos/perception/detection2d/type/detection3d.py +++ b/dimos/perception/detection2d/type/detection3d.py @@ -41,7 +41,7 @@ def height_filter(height=0.1) -> Detection3DFilter: return lambda det, pc, ci, tf: pc.filter_by_height(height) -def statistical(nb_neighbors=20, std_ratio=0.5) -> Detection3DFilter: +def statistical(nb_neighbors=40, std_ratio=0.5) -> Detection3DFilter: def filter_func( det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform ) -> Optional[PointCloud2]: @@ -74,7 +74,7 @@ def filter_func( return filter_func -def radius_outlier(min_neighbors: int = 5, radius: float = 0.3) -> Detection3DFilter: +def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> Detection3DFilter: """ Remove isolated points: keep only points that have at least `min_neighbors` neighbors within `radius` meters (same units as your point cloud). @@ -109,8 +109,8 @@ def from_2d( filters: list[Callable[[PointCloud2], PointCloud2]] = [ # height_filter(0.1), raycast(), - statistical(), radius_outlier(), + statistical(), ], ) -> Optional["Detection3D"]: """Create a Detection3D from a 2D detection by projecting world pointcloud. From 7b58306d8cc14596f4f03def62f37a0eb7660529 Mon Sep 17 00:00:00 2001 From: lesh Date: Sat, 27 Sep 2025 19:11:46 -0700 Subject: [PATCH 4/8] moduledb hack --- dimos/perception/detection2d/moduleDB.py | 31 ++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index e8d0b99af7..952cfb371b 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -205,17 +205,28 @@ def agent_encode(self) -> List[Any]: def vlm_query(self, description: str) -> str: imageDetections2D = super().vlm_query(description) time.sleep(3) - print("VLM query found", imageDetections2D, "detections") - ret = [] - for obj in self.objects.values(): - # if obj.ts != imageDetections2D.ts: - # continue - if obj.class_id != -100: - continue - ret.append(obj) - ret.sort(key=lambda x: x.ts) - return ret + + transform = self.tf.get("camera_optical", "map") + detections3d = self.process_frame(imageDetections2D, self.lidar.get_next(), transform) + print("3D detections from VLM query:", detections3d) + if detections3d.detections: + target_pose = detections3d.detections[0].to_pose() + self.target.publish(target_pose) + self.goto(target_pose) + + # ret = [] + # for obj in self.objects.values(): + # if obj.ts != imageDetections2D.ts: + # continue + # if obj.class_id != -100: + # continue + # if obj.name != imageDetections2D.detections[0].name: + # print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name) + # continue + # ret.append(obj) + # ret.sort(key=lambda x: x.ts) + # return ret @skill() def navigate_to_object_in_view(self, description: str) -> str: From b219f3bdf1ba8de532a843227ef20daf317faf45 Mon Sep 17 00:00:00 2001 From: lesh Date: Sat, 27 Sep 2025 19:14:00 -0700 Subject: [PATCH 5/8] moduledb hack --- dimos/perception/detection2d/moduleDB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index 952cfb371b..0d893eefa7 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -208,7 +208,7 @@ def vlm_query(self, description: str) -> str: print("VLM query found", imageDetections2D, "detections") transform = self.tf.get("camera_optical", "map") - detections3d = self.process_frame(imageDetections2D, self.lidar.get_next(), transform) + detections3d = self.process_frame(imageDetections2D, self.pointcloud.get_next(), transform) print("3D detections from VLM query:", detections3d) if detections3d.detections: target_pose = detections3d.detections[0].to_pose() From 938137078ff4ea16a0cf104b26272c67b8ea1b73 Mon Sep 17 00:00:00 2001 From: lesh Date: Sat, 27 Sep 2025 23:43:53 -0700 Subject: [PATCH 6/8] current g1 --- dimos/perception/detection2d/module2D.py | 7 +- dimos/perception/detection2d/module3D.py | 20 +++--- dimos/perception/detection2d/moduleDB.py | 85 +++++++++++++++++------- dimos/robot/unitree_webrtc/unitree_g1.py | 43 ++++++------ 4 files changed, 97 insertions(+), 58 deletions(-) diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection2d/module2D.py index 02507bd6da..82fb181be9 100644 --- a/dimos/perception/detection2d/module2D.py +++ b/dimos/perception/detection2d/module2D.py @@ -48,7 +48,7 @@ def process_image(self, image: np.ndarray) -> InconvinientDetectionFormat: ... @dataclass class Config: detector: Optional[Callable[[Any], Detector]] = Yolo2DDetector - max_freq: float = 3.0 # hz + max_freq: float = 0.5 # hz vlmodel: VlModel = QwenVlModel @@ -122,6 +122,7 @@ def vlm_query(self, query: str) -> ImageDetections2D: return imageDetections def process_image_frame(self, image: Image) -> ImageDetections2D: + print("Processing image frame for detections", image) return ImageDetections2D.from_detector( image, self.detector.process_image(image.to_opencv()) ) @@ -136,9 +137,9 @@ def sharp_image_stream(self) -> Observable[Image]: @functools.cache def detection_stream_2d(self) -> Observable[ImageDetections2D]: - return self.vlm_detections_subject + # self.vlm_detections_subject # Regular detection stream from the detector - regular_detections = self.image.observable().pipe(ops.map(self.process_image_frame)) + regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame)) # Merge with VL model detections return backpressure(regular_detections.pipe(ops.merge(self.vlm_detections_subject))) diff --git a/dimos/perception/detection2d/module3D.py b/dimos/perception/detection2d/module3D.py index f7b0e6ec14..15e671e171 100644 --- a/dimos/perception/detection2d/module3D.py +++ b/dimos/perception/detection2d/module3D.py @@ -81,16 +81,16 @@ def detection2d_to_3d(args): transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0) return self.process_frame(detections, pc, transform) - # self.detection_stream_3d = align_timestamped( - # backpressure(self.detection_stream_2d()), - # self.pointcloud.observable(), - # match_tolerance=0.25, - # buffer_size=20.0, - # ).pipe(ops.map(detection2d_to_3d)) - - self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe( - ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d) - ) + self.detection_stream_3d = align_timestamped( + backpressure(self.detection_stream_2d()), + self.pointcloud.observable(), + match_tolerance=0.25, + buffer_size=20.0, + ).pipe(ops.map(detection2d_to_3d)) + + # self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe( + # ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d) + # ) self.detection_stream_3d.subscribe(self._publish_detections) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index 0d893eefa7..3ff3e72797 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -149,10 +149,13 @@ class ObjectDBModule(Detection3DModule, TableStr): target: Out[PoseStamped] = None # type: ignore + remembered_locations: Dict[str, PoseStamped] + def __init__(self, goto: Callable[[PoseStamped], Any], *args, **kwargs): super().__init__(*args, **kwargs) self.goto = goto self.objects = {} + self.remembered_locations = {} def closest_object(self, detection: Detection3D) -> Optional[Object3D]: # Filter objects to only those with matching names @@ -204,36 +207,67 @@ def agent_encode(self) -> List[Any]: def vlm_query(self, description: str) -> str: imageDetections2D = super().vlm_query(description) - time.sleep(3) print("VLM query found", imageDetections2D, "detections") + time.sleep(3) - transform = self.tf.get("camera_optical", "map") - detections3d = self.process_frame(imageDetections2D, self.pointcloud.get_next(), transform) - print("3D detections from VLM query:", detections3d) - if detections3d.detections: - target_pose = detections3d.detections[0].to_pose() - self.target.publish(target_pose) - self.goto(target_pose) - - # ret = [] - # for obj in self.objects.values(): - # if obj.ts != imageDetections2D.ts: - # continue - # if obj.class_id != -100: - # continue - # if obj.name != imageDetections2D.detections[0].name: - # print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name) - # continue - # ret.append(obj) - # ret.sort(key=lambda x: x.ts) - # return ret + ret = [] + for obj in self.objects.values(): + if obj.ts != imageDetections2D.ts: + print( + "Skipping", + obj.track_id, + "ts", + obj.ts, + "!=", + imageDetections2D.ts, + ) + continue + if obj.class_id != -100: + continue + if obj.name != imageDetections2D.detections[0].name: + print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name) + continue + ret.append(obj) + ret.sort(key=lambda x: x.ts) + + return ret[0] + + @skill() + def remember_location(self, name: str) -> str: + """Remember the current location with a name.""" + pose = self.tf.lookup("map", "base_link").to_pose() + pose.frame_id = "map" + self.remembered_locations[name] = pose @skill() - def navigate_to_object_in_view(self, description: str) -> str: + def goto_remembered_location(self, name: str) -> str: + """Go to a remembered location by name.""" + pose = self.remembered_locations.get(name, None) + if not pose: + return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}" + self.target.publish(pose) + time.sleep(0.1) + self.target.publish(pose) + return f"Navigating to remembered location {name}" + + @skill() + def list_remembered_locations(self) -> List[str]: + """List all remembered locations.""" + return str(list(self.remembered_locations.keys())) + + def nav_to(self, target_pose) -> str: + target_pose.orientation = Quaternion(0.0, 0.0, 0.0, 0.0) + self.target.publish(target_pose) + time.sleep(0.1) + self.target.publish(target_pose) + self.goto(target_pose) + + # @skill() + def navigate_to_object_in_view(self, query: str) -> str: """Navigate to an object by description using vision-language model to find it.""" - objects = self.vlm_query(description) + objects = self.vlm_query(query) if not objects: - return f"No objects found matching '{description}'" + return f"No objects found matching '{query}'" target_obj = objects[0] return self.navigate_to_object_by_id(target_obj.track_id) @@ -250,10 +284,11 @@ def navigate_to_object_by_id(self, object_id: str): if not target_obj: return f"Object {object_id} not found\nHere are the known objects:\n{str(self.agent_encode())}" target_pose = target_obj.to_pose() + target_pose.frame_id = "map" self.target.publish(target_pose) time.sleep(0.1) self.target.publish(target_pose) - self.goto(target_pose) + self.nav_to(target_pose) return f"Navigating to f{object_id} f{target_obj.name}" def lookup(self, label: str) -> List[Detection3D]: diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py index 742af9e0cb..e7256a8524 100644 --- a/dimos/robot/unitree_webrtc/unitree_g1.py +++ b/dimos/robot/unitree_webrtc/unitree_g1.py @@ -22,17 +22,14 @@ import os import time from typing import Optional -from dimos import core -from dimos.core import In, Module, Out, rpc -from geometry_msgs.msg import PoseStamped as ROSPoseStamped -from dimos.msgs.sensor_msgs import Joy -from dimos.msgs.std_msgs.Bool import Bool -from dimos.robot.unitree_webrtc.rosnav import NavigationModule +from geometry_msgs.msg import PoseStamped as ROSPoseStamped from geometry_msgs.msg import TwistStamped as ROSTwistStamped -from lcm_msgs.foxglove_msgs import SceneUpdate + +from dimos_lcm.foxglove_msgs import SceneUpdate from nav_msgs.msg import Odometry as ROSOdometry -from sensor_msgs.msg import PointCloud2 as ROSPointCloud2, Joy as ROSJoy +from sensor_msgs.msg import Joy as ROSJoy +from sensor_msgs.msg import PointCloud2 as ROSPointCloud2 from tf2_msgs.msg import TFMessage as ROSTFMessage from dimos import core @@ -50,7 +47,8 @@ Vector3, ) from dimos.msgs.nav_msgs.Odometry import Odometry -from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2 +from dimos.msgs.sensor_msgs import CameraInfo, Image, Joy, PointCloud2 +from dimos.msgs.std_msgs.Bool import Bool from dimos.msgs.tf2_msgs.TFMessage import TFMessage from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection2d import Detection3DModule @@ -251,9 +249,9 @@ def start(self): self.lcm.start() - from dimos.agents2.spec import Model, Provider from dimos.agents2 import Agent, Output, Reducer, Stream, skill from dimos.agents2.cli.human import HumanInput + from dimos.agents2.spec import Model, Provider agent = Agent( system_prompt="You are a helpful assistant for controlling a humanoid robot. ", @@ -289,7 +287,7 @@ def _deploy_camera(self): CameraModule, transform=Transform( translation=Vector3(0.05, 0.0, 0.0), - rotation=Quaternion(0.0, 0.0, 0.0, 1.0), + rotation=Quaternion.from_euler(Vector3(0.0, 0.2, 0.0)), frame_id="sensor", child_frame_id="camera_link", ), @@ -385,7 +383,7 @@ def _start_modules(self): # self.joystick.start() self.camera.start() - self.detection.start() + # self.detection.start() # Initialize skills after connection is established if self.skill_library is not None: @@ -461,14 +459,19 @@ def main(): ) robot.start() - # time.sleep(7) - # print("Starting navigation...") - # print( - # robot.nav.go_to( - # PoseStamped(ts=time.time(), frame_id="map", position=Vector3(0.0, 0.0, 0.03)), - # timeout=10, - # ), - # ) + time.sleep(7) + print("Starting navigation...") + print( + robot.nav.go_to( + PoseStamped( + ts=time.time(), + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion(0.0, 0.0, 0.0, 0.0), + ), + timeout=10, + ), + ) try: if args.joystick: print("\n" + "=" * 50) From 7aaeed703c935f8d476a491d0054ebdddce11eba Mon Sep 17 00:00:00 2001 From: Stash Pomichter Date: Sun, 28 Sep 2025 03:13:41 -0700 Subject: [PATCH 7/8] Re enabled detections, removed go to origin on startup --- dimos/robot/unitree_webrtc/unitree_g1.py | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py index a80aee777a..916943223c 100644 --- a/dimos/robot/unitree_webrtc/unitree_g1.py +++ b/dimos/robot/unitree_webrtc/unitree_g1.py @@ -383,7 +383,7 @@ def _start_modules(self): # self.joystick.start() self.camera.start() - # self.detection.start() + self.detection.start() # Initialize skills after connection is established if self.skill_library is not None: @@ -459,19 +459,19 @@ def main(): ) robot.start() - time.sleep(7) - print("Starting navigation...") - print( - robot.nav.go_to( - PoseStamped( - ts=time.time(), - frame_id="map", - position=Vector3(0.0, 0.0, 0.03), - orientation=Quaternion(0, 0, 0, 0), - ), - timeout=10, - ), - ) + # time.sleep(7) + # print("Starting navigation...") + # print( + # robot.nav.go_to( + # PoseStamped( + # ts=time.time(), + # frame_id="map", + # position=Vector3(0.0, 0.0, 0.03), + # orientation=Quaternion(0, 0, 0, 0), + # ), + # timeout=10, + # ), + # ) try: if args.joystick: print("\n" + "=" * 50) From 291657182fa79a7d2bc966fa9a94f96d52379908 Mon Sep 17 00:00:00 2001 From: Stash Pomichter Date: Sun, 28 Sep 2025 03:39:21 -0700 Subject: [PATCH 8/8] Fully working save location and navigate to saved location --- dimos/perception/detection2d/moduleDB.py | 27 ++++++++++++++---------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index 3ff3e72797..052b65d6c7 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -210,6 +210,9 @@ def vlm_query(self, description: str) -> str: print("VLM query found", imageDetections2D, "detections") time.sleep(3) + if not imageDetections2D.detections: + return None + ret = [] for obj in self.objects.values(): if obj.ts != imageDetections2D.ts: @@ -230,14 +233,19 @@ def vlm_query(self, description: str) -> str: ret.append(obj) ret.sort(key=lambda x: x.ts) - return ret[0] + return ret[0] if ret else None @skill() def remember_location(self, name: str) -> str: """Remember the current location with a name.""" - pose = self.tf.lookup("map", "base_link").to_pose() + transform = self.tf.get("map", "sensor", time_point=time.time(), time_tolerance=1.0) + if not transform: + return f"Could not get current location transform from map to sensor" + + pose = transform.to_pose() pose.frame_id = "map" self.remembered_locations[name] = pose + return f"Location '{name}' saved at position: {pose.position}" @skill() def goto_remembered_location(self, name: str) -> str: @@ -245,10 +253,8 @@ def goto_remembered_location(self, name: str) -> str: pose = self.remembered_locations.get(name, None) if not pose: return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}" - self.target.publish(pose) - time.sleep(0.1) - self.target.publish(pose) - return f"Navigating to remembered location {name}" + self.goto(pose) + return f"Navigating to remembered location {name} and pose {pose}" @skill() def list_remembered_locations(self) -> List[str]: @@ -262,13 +268,12 @@ def nav_to(self, target_pose) -> str: self.target.publish(target_pose) self.goto(target_pose) - # @skill() + @skill() def navigate_to_object_in_view(self, query: str) -> str: - """Navigate to an object by description using vision-language model to find it.""" - objects = self.vlm_query(query) - if not objects: + """Navigate to an object in your current image view via natural language query using vision-language model to find it.""" + target_obj = self.vlm_query(query) + if not target_obj: return f"No objects found matching '{query}'" - target_obj = objects[0] return self.navigate_to_object_by_id(target_obj.track_id) @skill(reducer=Reducer.all)