diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection2d/module2D.py index 2ccba7466a..82fb181be9 100644 --- a/dimos/perception/detection2d/module2D.py +++ b/dimos/perception/detection2d/module2D.py @@ -48,7 +48,7 @@ def process_image(self, image: np.ndarray) -> InconvinientDetectionFormat: ... @dataclass class Config: detector: Optional[Callable[[Any], Detector]] = Yolo2DDetector - max_freq: float = 3.0 # hz + max_freq: float = 0.5 # hz vlmodel: VlModel = QwenVlModel @@ -122,6 +122,7 @@ def vlm_query(self, query: str) -> ImageDetections2D: return imageDetections def process_image_frame(self, image: Image) -> ImageDetections2D: + print("Processing image frame for detections", image) return ImageDetections2D.from_detector( image, self.detector.process_image(image.to_opencv()) ) @@ -136,7 +137,7 @@ def sharp_image_stream(self) -> Observable[Image]: @functools.cache def detection_stream_2d(self) -> Observable[ImageDetections2D]: - return self.vlm_detections_subject + # self.vlm_detections_subject # Regular detection stream from the detector regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame)) # Merge with VL model detections diff --git a/dimos/perception/detection2d/module3D.py b/dimos/perception/detection2d/module3D.py index aa120db421..15e671e171 100644 --- a/dimos/perception/detection2d/module3D.py +++ b/dimos/perception/detection2d/module3D.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional from dimos_lcm.sensor_msgs import CameraInfo from reactivex import operators as ops @@ -57,7 +56,7 @@ def process_frame( if not transform: return ImageDetections3D(detections.image, []) - print("PROCESS FRAME", detections) + print("3d projection", detections, pointcloud, transform) detection3d_list = [] for detection in detections: detection3d = Detection3D.from_2d( @@ -69,7 +68,9 @@ def process_frame( if detection3d is not None: detection3d_list.append(detection3d) - return ImageDetections3D(detections.image, detection3d_list) + ret = ImageDetections3D(detections.image, detection3d_list) + print("3d projection finished", ret) + return ret @rpc def start(self): @@ -80,16 +81,16 @@ def detection2d_to_3d(args): transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0) return self.process_frame(detections, pc, transform) - # self.detection_stream_3d = align_timestamped( - # backpressure(self.detection_stream_2d()), - # self.pointcloud.observable(), - # match_tolerance=0.25, - # buffer_size=20.0, - # ).pipe(ops.map(detection2d_to_3d)) + self.detection_stream_3d = align_timestamped( + backpressure(self.detection_stream_2d()), + self.pointcloud.observable(), + match_tolerance=0.25, + buffer_size=20.0, + ).pipe(ops.map(detection2d_to_3d)) - self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe( - ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d) - ) + # self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe( + # ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d) + # ) self.detection_stream_3d.subscribe(self._publish_detections) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index 8cc096ffa8..052b65d6c7 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -119,7 +119,9 @@ def to_pose(self) -> PoseStamped: print("Global pose:", global_pose) global_pose.frame_id = self.best_detection.frame_id print("remap to", self.best_detection.frame_id) - return PoseStamped(position=self.center, orientation=Quaternion(), frame_id="world") + return PoseStamped( + position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id + ) class ObjectDBModule(Detection3DModule, TableStr): @@ -147,10 +149,13 @@ class ObjectDBModule(Detection3DModule, TableStr): target: Out[PoseStamped] = None # type: ignore + remembered_locations: Dict[str, PoseStamped] + def __init__(self, goto: Callable[[PoseStamped], Any], *args, **kwargs): super().__init__(*args, **kwargs) self.goto = goto self.objects = {} + self.remembered_locations = {} def closest_object(self, detection: Detection3D) -> Optional[Object3D]: # Filter objects to only those with matching names @@ -202,25 +207,73 @@ def agent_encode(self) -> List[Any]: def vlm_query(self, description: str) -> str: imageDetections2D = super().vlm_query(description) - time.sleep(1.5) - print("VLM query found", imageDetections2D, "detections") + time.sleep(3) + + if not imageDetections2D.detections: + return None + ret = [] for obj in self.objects.values(): if obj.ts != imageDetections2D.ts: + print( + "Skipping", + obj.track_id, + "ts", + obj.ts, + "!=", + imageDetections2D.ts, + ) continue if obj.class_id != -100: continue + if obj.name != imageDetections2D.detections[0].name: + print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name) + continue ret.append(obj) - return ret + ret.sort(key=lambda x: x.ts) + + return ret[0] if ret else None + + @skill() + def remember_location(self, name: str) -> str: + """Remember the current location with a name.""" + transform = self.tf.get("map", "sensor", time_point=time.time(), time_tolerance=1.0) + if not transform: + return f"Could not get current location transform from map to sensor" + + pose = transform.to_pose() + pose.frame_id = "map" + self.remembered_locations[name] = pose + return f"Location '{name}' saved at position: {pose.position}" @skill() - def navigate_to_object_in_view(self, description: str) -> str: - """Navigate to an object by description using vision-language model to find it.""" - objects = self.vlm_query(description) - if not objects: - return f"No objects found matching '{description}'" - target_obj = objects[0] + def goto_remembered_location(self, name: str) -> str: + """Go to a remembered location by name.""" + pose = self.remembered_locations.get(name, None) + if not pose: + return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}" + self.goto(pose) + return f"Navigating to remembered location {name} and pose {pose}" + + @skill() + def list_remembered_locations(self) -> List[str]: + """List all remembered locations.""" + return str(list(self.remembered_locations.keys())) + + def nav_to(self, target_pose) -> str: + target_pose.orientation = Quaternion(0.0, 0.0, 0.0, 0.0) + self.target.publish(target_pose) + time.sleep(0.1) + self.target.publish(target_pose) + self.goto(target_pose) + + @skill() + def navigate_to_object_in_view(self, query: str) -> str: + """Navigate to an object in your current image view via natural language query using vision-language model to find it.""" + target_obj = self.vlm_query(query) + if not target_obj: + return f"No objects found matching '{query}'" return self.navigate_to_object_by_id(target_obj.track_id) @skill(reducer=Reducer.all) @@ -236,10 +289,11 @@ def navigate_to_object_by_id(self, object_id: str): if not target_obj: return f"Object {object_id} not found\nHere are the known objects:\n{str(self.agent_encode())}" target_pose = target_obj.to_pose() + target_pose.frame_id = "map" self.target.publish(target_pose) time.sleep(0.1) self.target.publish(target_pose) - self.goto(target_pose) + self.nav_to(target_pose) return f"Navigating to f{object_id} f{target_obj.name}" def lookup(self, label: str) -> List[Detection3D]: diff --git a/dimos/perception/detection2d/test_moduleDB.py b/dimos/perception/detection2d/test_moduleDB.py index 81446601e9..081d3343b4 100644 --- a/dimos/perception/detection2d/test_moduleDB.py +++ b/dimos/perception/detection2d/test_moduleDB.py @@ -21,8 +21,6 @@ from dimos.msgs.sensor_msgs import Image, PointCloud2 from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection2d import testing -from dimos.perception.detection2d.module2D import Detection2DModule -from dimos.perception.detection2d.module3D import Detection3DModule from dimos.perception.detection2d.moduleDB import ObjectDBModule from dimos.protocol.service import lcmservice as lcm from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation diff --git a/dimos/perception/detection2d/testing.py b/dimos/perception/detection2d/testing.py index 160fe40604..0fe5b4adcd 100644 --- a/dimos/perception/detection2d/testing.py +++ b/dimos/perception/detection2d/testing.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import functools -import hashlib -import os import time -from pathlib import Path from typing import Optional, TypedDict, Union from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations @@ -32,7 +28,6 @@ from dimos.perception.detection2d.module3D import Detection3DModule from dimos.perception.detection2d.moduleDB import ObjectDBModule from dimos.perception.detection2d.type import ImageDetections2D, ImageDetections3D -from dimos.protocol.service import lcmservice as lcm from dimos.protocol.tf import TF from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule from dimos.robot.unitree_webrtc.type.lidar import LidarMessage @@ -83,7 +78,6 @@ def get_g1_moment(**kwargs): for timestamp, tf_frame in tf_replay.iterate_ts(seek=seek - tf_window, duration=tf_window): tf.publish(*TFMessage.lcm_decode(tf_frame).transforms) - print(tf) image_frame = Image.lcm_decode( TimedSensorReplay(f"{data_dir}/image#sensor_msgs.Image").find_closest_seek(seek) ) diff --git a/dimos/perception/detection2d/type/detection3d.py b/dimos/perception/detection2d/type/detection3d.py index 261199e388..e03c9aeb75 100644 --- a/dimos/perception/detection2d/type/detection3d.py +++ b/dimos/perception/detection2d/type/detection3d.py @@ -41,7 +41,7 @@ def height_filter(height=0.1) -> Detection3DFilter: return lambda det, pc, ci, tf: pc.filter_by_height(height) -def statistical(nb_neighbors=20, std_ratio=0.5) -> Detection3DFilter: +def statistical(nb_neighbors=40, std_ratio=0.5) -> Detection3DFilter: def filter_func( det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform ) -> Optional[PointCloud2]: @@ -74,7 +74,7 @@ def filter_func( return filter_func -def radius_outlier(min_neighbors: int = 5, radius: float = 0.3) -> Detection3DFilter: +def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> Detection3DFilter: """ Remove isolated points: keep only points that have at least `min_neighbors` neighbors within `radius` meters (same units as your point cloud). @@ -109,8 +109,8 @@ def from_2d( filters: list[Callable[[PointCloud2], PointCloud2]] = [ # height_filter(0.1), raycast(), - statistical(), radius_outlier(), + statistical(), ], ) -> Optional["Detection3D"]: """Create a Detection3D from a 2D detection by projecting world pointcloud. diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py index f00522bfde..916943223c 100644 --- a/dimos/robot/unitree_webrtc/unitree_g1.py +++ b/dimos/robot/unitree_webrtc/unitree_g1.py @@ -22,17 +22,14 @@ import os import time from typing import Optional -from dimos import core -from dimos.core import In, Module, Out, rpc -from geometry_msgs.msg import PoseStamped as ROSPoseStamped -from dimos.msgs.sensor_msgs import Joy -from dimos.msgs.std_msgs.Bool import Bool -from dimos.robot.unitree_webrtc.rosnav import NavigationModule +from geometry_msgs.msg import PoseStamped as ROSPoseStamped from geometry_msgs.msg import TwistStamped as ROSTwistStamped -from lcm_msgs.foxglove_msgs import SceneUpdate + +from dimos_lcm.foxglove_msgs import SceneUpdate from nav_msgs.msg import Odometry as ROSOdometry -from sensor_msgs.msg import PointCloud2 as ROSPointCloud2, Joy as ROSJoy +from sensor_msgs.msg import Joy as ROSJoy +from sensor_msgs.msg import PointCloud2 as ROSPointCloud2 from tf2_msgs.msg import TFMessage as ROSTFMessage from dimos import core @@ -50,7 +47,8 @@ Vector3, ) from dimos.msgs.nav_msgs.Odometry import Odometry -from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2 +from dimos.msgs.sensor_msgs import CameraInfo, Image, Joy, PointCloud2 +from dimos.msgs.std_msgs.Bool import Bool from dimos.msgs.tf2_msgs.TFMessage import TFMessage from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection2d import Detection3DModule @@ -203,7 +201,7 @@ def _deploy_detection(self, goto): detection.detections.transport = core.LCMTransport("/detections", Detection2DArray) detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate) - + detection.target.transport = core.LCMTransport("/target", PoseStamped) detection.detected_pointcloud_0.transport = core.LCMTransport( "/detected/pointcloud/0", PointCloud2 ) @@ -251,9 +249,9 @@ def start(self): self.lcm.start() - from dimos.agents2.spec import Model, Provider from dimos.agents2 import Agent, Output, Reducer, Stream, skill from dimos.agents2.cli.human import HumanInput + from dimos.agents2.spec import Model, Provider agent = Agent( system_prompt="You are a helpful assistant for controlling a humanoid robot. ", @@ -289,7 +287,7 @@ def _deploy_camera(self): CameraModule, transform=Transform( translation=Vector3(0.05, 0.0, 0.0), - rotation=Quaternion(0.0, 0.0, 0.0, 1.0), + rotation=Quaternion.from_euler(Vector3(0.0, 0.2, 0.0)), frame_id="sensor", child_frame_id="camera_link", ), @@ -461,19 +459,19 @@ def main(): ) robot.start() - time.sleep(7) - print("Starting navigation...") - print( - robot.nav.go_to( - PoseStamped( - ts=time.time(), - frame_id="map", - position=Vector3(0.0, 0.0, 0.03), - orientation=Quaternion(0, 0, 0, 0), - ), - timeout=10, - ), - ) + # time.sleep(7) + # print("Starting navigation...") + # print( + # robot.nav.go_to( + # PoseStamped( + # ts=time.time(), + # frame_id="map", + # position=Vector3(0.0, 0.0, 0.03), + # orientation=Quaternion(0, 0, 0, 0), + # ), + # timeout=10, + # ), + # ) try: if args.joystick: print("\n" + "=" * 50)