diff --git a/.gitignore b/.gitignore index c8e4eb199d..adc50a7ef6 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,5 @@ data/* !data/.lfs/ FastSAM-x.pt yolo11n.pt + +/thread_monitor_report.csv diff --git a/assets/foxglove_unitree_yolo.json b/assets/foxglove_unitree_yolo.json index e8156a2a05..ab53e4a71e 100644 --- a/assets/foxglove_unitree_yolo.json +++ b/assets/foxglove_unitree_yolo.json @@ -25,14 +25,13 @@ "color": "#248eff57" }, "ff758451-8c06-4419-a995-e93c825eb8be": { - "visible": true, + "visible": false, "frameLocked": true, "label": "Grid", "instanceId": "ff758451-8c06-4419-a995-e93c825eb8be", "layerId": "foxglove.Grid", "frameId": "base_link", - "size": 3, - "divisions": 3, + "divisions": 6, "lineWidth": 1.5, "color": "#24fff4ff", "position": [ @@ -45,18 +44,19 @@ 0, 0 ], - "order": 2 + "order": 2, + "size": 6 } }, "cameraState": { "perspective": true, - "distance": 72.10076346479195, - "phi": 58.649061347608615, - "thetaOffset": 141.2348270943936, + "distance": 13.268408624096915, + "phi": 26.658696672199024, + "thetaOffset": 99.69918626426482, "targetOffset": [ - 1.4329507220695694, - -1.682769241958899, - 0.3118341583211758 + 1.740213570345715, + 0.7318803628974015, + -1.5060700211358968 ], "target": [ 0, @@ -89,6 +89,15 @@ "transforms": { "frame:camera_link": { "visible": false + }, + "frame:sensor": { + "visible": false + }, + "frame:sensor_at_scan": { + "visible": false + }, + "frame:map": { + "visible": true } }, "topics": { @@ -99,28 +108,27 @@ "colorMode": "colormap", "colorMap": "turbo", "pointShape": "circle", - "pointSize": 10, - "explicitAlpha": 1, + "pointSize": 2, + "explicitAlpha": 0.8, "decayTime": 0, - "cubeSize": 0.1, - "minValue": -0.3, - "cubeOutline": false + "cubeSize": 0.05, + "cubeOutline": false, + "minValue": -2 }, "/odom": { - "visible": false, + "visible": true, "axisScale": 1 }, "/video": { "visible": false }, "/global_map": { - "visible": false, + "visible": true, "colorField": "z", "colorMode": "colormap", "colorMap": "turbo", - "pointSize": 10, "decayTime": 0, - "pointShape": "cube", + "pointShape": "square", "cubeOutline": false, "cubeSize": 0.08, "gradient": [ @@ -128,8 +136,9 @@ "#d1e2e2ff" ], "stixelsEnabled": false, - "explicitAlpha": 1, - "minValue": -0.2 + "explicitAlpha": 0.339, + "minValue": -0.2, + "pointSize": 5 }, "/global_path": { "visible": true, @@ -152,7 +161,7 @@ "visible": false }, "/global_costmap": { - "visible": true, + "visible": false, "maxColor": "#6b2b2bff", "frameLocked": false, "unknownColor": "#80808000", @@ -199,6 +208,197 @@ }, "/navigation_goal": { "visible": true + }, + "/debug_camera_optical_points": { + "stixelsEnabled": false, + "visible": false, + "pointSize": 0.07, + "pointShape": "cube", + "colorField": "z", + "colorMode": "colormap", + "colorMap": "turbo" + }, + "/debug_world_points": { + "visible": false, + "colorField": "z", + "colorMode": "colormap", + "colorMap": "rainbow", + "pointShape": "cube" + }, + "/filtered_points_suitcase_0": { + "visible": false, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "cube", + "flatColor": "#ff0808ff", + "cubeSize": 0.149, + "pointSize": 28.57 + }, + "/filtered_points_combined": { + "visible": true, + "flatColor": "#ff0000ff", + "pointShape": "cube", + "pointSize": 6.63, + "colorField": "z", + "colorMode": "gradient", + "colorMap": "rainbow", + "cubeSize": 0.35, + "gradient": [ + "#d100caff", + "#ff0000ff" + ] + }, + "/filtered_points_box_7": { + "visible": true, + "flatColor": "#fbfaffff", + "colorField": "intensity", + 
"colorMode": "colormap", + "colorMap": "turbo" + }, + "/filtered_pointcloud": { + "visible": true, + "colorField": "z", + "colorMode": "flat", + "colorMap": "turbo", + "flatColor": "#ff0000ff", + "pointSize": 40.21, + "pointShape": "cube", + "cubeSize": 0.1, + "cubeOutline": true + }, + "/detected": { + "visible": false, + "pointSize": 1.5, + "pointShape": "cube", + "cubeSize": 0.118, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "flatColor": "#d70000ff", + "cubeOutline": true + }, + "/detected_0": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointSize": 1.6, + "pointShape": "cube", + "cubeSize": 0.1, + "flatColor": "#e00000ff", + "stixelsEnabled": false, + "decayTime": 0, + "cubeOutline": true + }, + "/detected_1": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "cube", + "cubeSize": 0.1, + "flatColor": "#00ff15ff", + "cubeOutline": true + }, + "/image_detected_0": { + "visible": false + }, + "/detected/pointcloud/1": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "cube", + "flatColor": "#15ff00ff", + "pointSize": 0.1, + "cubeSize": 0.05, + "cubeOutline": true + }, + "/detected/pointcloud/2": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "cube", + "flatColor": "#00ffe1ff", + "pointSize": 10, + "cubeOutline": true, + "cubeSize": 0.05 + }, + "/detected/pointcloud/0": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "cube", + "flatColor": "#ff0000ff", + "cubeOutline": true, + "cubeSize": 0.04 + }, + "/detected/image/0": { + "visible": false + }, + "/detected/image/3": { + "visible": false + }, + "/detected/pointcloud/3": { + "visible": true, + "pointSize": 1.5, + "pointShape": "cube", + "cubeSize": 0.1, + "flatColor": "#00fffaff", + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo" + }, + "/detected/image/1": { + "visible": false + }, + "/registered_scan": { + "visible": true, + "colorField": "z", + "colorMode": "colormap", + "colorMap": "turbo", + "pointShape": "circle", + "pointSize": 2 + }, + "/image/camera_info": { + "visible": true, + "distance": 2 + }, + "/map": { + "visible": true, + "colorField": "z", + "colorMode": "colormap", + "colorMap": "turbo", + "pointShape": "square", + "cubeSize": 0.13, + "explicitAlpha": 1, + "pointSize": 1, + "decayTime": 2 + }, + "/detection3d/markers": { + "visible": true, + "color": "#88ff00ff", + "showOutlines": true, + "selectedIdVariable": "" + }, + "/foxglove/scene_update": { + "visible": true + }, + "/scene_update": { + "visible": true, + "showOutlines": true, + "computeVertexNormals": true + }, + "/target": { + "visible": true, + "axisScale": 1 + }, + "/goal_pose": { + "visible": true, + "axisScale": 0.5 } }, "publish": { @@ -211,8 +411,8 @@ "poseEstimateThetaDeviation": 0.26179939 }, "imageMode": {}, - "foxglovePanelTitle": "test", - "followTf": "world" + "foxglovePanelTitle": "", + "followTf": "map" }, "Image!3mnp456": { "cameraState": { @@ -245,7 +445,7 @@ "enableStats": false, "transforms": { "showLabel": false, - "visible": false + "visible": true } }, "transforms": { @@ -264,13 +464,14 @@ }, "topics": { "/lidar": { - "visible": true, - "colorField": "_auto_distance", + "visible": false, + "colorField": "z", "colorMode": "colormap", "colorMap": "turbo", - "pointSize": 4, + 
"pointSize": 6, "explicitAlpha": 0.6, - "pointShape": "square" + "pointShape": "circle", + "cubeSize": 0.016 }, "/odom": { "visible": false @@ -281,6 +482,103 @@ "/global_costmap": { "visible": false, "minColor": "#ffffff00" + }, + "/detected_0": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointSize": 23, + "pointShape": "cube", + "cubeSize": 0.04, + "flatColor": "#ff0000ff", + "stixelsEnabled": false + }, + "/detected_1": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointSize": 20.51, + "flatColor": "#34ff00ff", + "pointShape": "cube", + "cubeSize": 0.04, + "cubeOutline": false + }, + "/filtered_pointcloud": { + "visible": true, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "rainbow", + "pointSize": 1.5, + "pointShape": "cube", + "flatColor": "#ff0000ff", + "cubeSize": 0.1 + }, + "/global_map": { + "visible": false, + "colorField": "z", + "colorMode": "colormap", + "colorMap": "turbo", + "pointShape": "cube", + "pointSize": 5 + }, + "/detected/pointcloud/1": { + "visible": false, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "cube", + "cubeSize": 0.01, + "flatColor": "#00ff1eff", + "pointSize": 15, + "decayTime": 0, + "cubeOutline": true + }, + "/detected/pointcloud/2": { + "visible": false, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "circle", + "cubeSize": 0.1, + "flatColor": "#00fbffff", + "pointSize": 0.01 + }, + "/detected/pointcloud/0": { + "visible": false, + "colorField": "intensity", + "colorMode": "flat", + "colorMap": "turbo", + "pointShape": "cube", + "flatColor": "#ff0000ff", + "pointSize": 15, + "cubeOutline": true, + "cubeSize": 0.03 + }, + "/registered_scan": { + "visible": false, + "colorField": "z", + "colorMode": "colormap", + "colorMap": "turbo", + "pointShape": "circle", + "pointSize": 6.49 + }, + "/detection3d/markers": { + "visible": false + }, + "/foxglove/scene_update": { + "visible": true + }, + "/scene_update": { + "visible": false + }, + "/map": { + "visible": false, + "colorField": "z", + "colorMode": "colormap", + "colorMap": "turbo", + "pointSize": 8 } }, "layers": {}, @@ -305,40 +603,44 @@ } }, "synchronize": false, - "rotation": 0 + "rotation": 0, + "calibrationTopic": "/camera_info" }, "foxglovePanelTitle": "" }, - "StateTransitions!pu21x4": { + "Plot!3heo336": { "paths": [ { - "value": "/annotations.texts[1].text", - "timestampMethod": "receiveTime", - "label": "detection1" + "timestampMethod": "publishTime", + "value": "/image.header.stamp.sec", + "enabled": true, + "color": "#4e98e2", + "label": "image", + "showLine": false }, { - "value": "/annotations.texts[3].text", - "timestampMethod": "receiveTime", - "label": "detection2" - } - ], - "isSynced": true, - "showPoints": true, - "timeWindowMode": "automatic" - }, - "Plot!a1gj37": { - "paths": [ + "timestampMethod": "publishTime", + "value": "/map.header.stamp.sec", + "enabled": true, + "color": "#f5774d", + "label": "lidar", + "showLine": false + }, { - "timestampMethod": "receiveTime", - "value": "/annotations.points[0].points[0].x", + "timestampMethod": "publishTime", + "value": "/tf.transforms[0].header.stamp.sec", "enabled": true, - "color": "#4e98e2" + "color": "#f7df71", + "label": "tf", + "showLine": false }, { - "timestampMethod": "receiveTime", - "value": "/annotations.points[0].points[0].y", + "timestampMethod": "publishTime", + "value": "/odom.header.stamp.sec", "enabled": 
true, - "color": "#f5774d" + "color": "#5cd6a9", + "label": "odom", + "showLine": false } ], "showXAxisLabels": true, @@ -349,6 +651,161 @@ "isSynced": true, "xAxisVal": "timestamp", "sidebarDimension": 240 + }, + "Image!47pi3ov": { + "cameraState": { + "distance": 20, + "perspective": true, + "phi": 60, + "target": [ + 0, + 0, + 0 + ], + "targetOffset": [ + 0, + 0, + 0 + ], + "targetOrientation": [ + 0, + 0, + 0, + 1 + ], + "thetaOffset": 45, + "fovy": 45, + "near": 0.5, + "far": 5000 + }, + "followMode": "follow-pose", + "scene": {}, + "transforms": {}, + "topics": {}, + "layers": {}, + "publish": { + "type": "point", + "poseTopic": "/move_base_simple/goal", + "pointTopic": "/clicked_point", + "poseEstimateTopic": "/initialpose", + "poseEstimateXDeviation": 0.5, + "poseEstimateYDeviation": 0.5, + "poseEstimateThetaDeviation": 0.26179939 + }, + "imageMode": { + "imageTopic": "/detected/image/0" + } + }, + "Image!4kk50gw": { + "cameraState": { + "distance": 20, + "perspective": true, + "phi": 60, + "target": [ + 0, + 0, + 0 + ], + "targetOffset": [ + 0, + 0, + 0 + ], + "targetOrientation": [ + 0, + 0, + 0, + 1 + ], + "thetaOffset": 45, + "fovy": 45, + "near": 0.5, + "far": 5000 + }, + "followMode": "follow-pose", + "scene": {}, + "transforms": {}, + "topics": {}, + "layers": {}, + "publish": { + "type": "point", + "poseTopic": "/move_base_simple/goal", + "pointTopic": "/clicked_point", + "poseEstimateTopic": "/initialpose", + "poseEstimateXDeviation": 0.5, + "poseEstimateYDeviation": 0.5, + "poseEstimateThetaDeviation": 0.26179939 + }, + "imageMode": { + "imageTopic": "/detected/image/1" + } + }, + "Image!2348e0b": { + "cameraState": { + "distance": 20, + "perspective": true, + "phi": 60, + "target": [ + 0, + 0, + 0 + ], + "targetOffset": [ + 0, + 0, + 0 + ], + "targetOrientation": [ + 0, + 0, + 0, + 1 + ], + "thetaOffset": 45, + "fovy": 45, + "near": 0.5, + "far": 5000 + }, + "followMode": "follow-pose", + "scene": {}, + "transforms": {}, + "topics": {}, + "layers": {}, + "publish": { + "type": "point", + "poseTopic": "/move_base_simple/goal", + "pointTopic": "/clicked_point", + "poseEstimateTopic": "/initialpose", + "poseEstimateXDeviation": 0.5, + "poseEstimateYDeviation": 0.5, + "poseEstimateThetaDeviation": 0.26179939 + }, + "imageMode": { + "imageTopic": "/detected/image/2", + "synchronize": false + } + }, + "StateTransitions!pu21x4": { + "paths": [ + { + "value": "/annotations.texts[1].text", + "timestampMethod": "receiveTime", + "label": "detection1" + }, + { + "value": "/annotations.texts[3].text", + "timestampMethod": "receiveTime", + "label": "detection2" + }, + { + "value": "/annotations.texts[5].text", + "timestampMethod": "receiveTime", + "label": "detection3" + } + ], + "isSynced": true, + "showPoints": true, + "timeWindowMode": "automatic" } }, "globalVariables": {}, @@ -360,19 +817,33 @@ "tracks": [] }, "layout": { - "first": "3D!18i6zy7", + "first": { + "first": "3D!18i6zy7", + "second": "Image!3mnp456", + "direction": "row", + "splitPercentage": 47.265625 + }, "second": { - "first": "Image!3mnp456", + "first": "Plot!3heo336", "second": { - "first": "StateTransitions!pu21x4", - "second": "Plot!a1gj37", + "first": { + "first": "Image!47pi3ov", + "second": { + "first": "Image!4kk50gw", + "second": "Image!2348e0b", + "direction": "row" + }, + "direction": "row", + "splitPercentage": 33.06523681858802 + }, + "second": "StateTransitions!pu21x4", "direction": "column", - "splitPercentage": 32.661870503597136 + "splitPercentage": 86.63101604278076 }, - "direction": "column", - 
"splitPercentage": 59.451575262543756 + "direction": "row", + "splitPercentage": 46.39139486467731 }, - "direction": "row", - "splitPercentage": 55.3125 + "direction": "column", + "splitPercentage": 81.62970106075217 } } diff --git a/data/.lfs/replay_g1.tar.gz b/data/.lfs/replay_g1.tar.gz new file mode 100644 index 0000000000..67750bd0cf --- /dev/null +++ b/data/.lfs/replay_g1.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ad1c53c4f8f9414c0921b94cd4c87e81bf0ad676881339f15ae2d8a8619311 +size 557410250 diff --git a/data/.lfs/replay_g1_run.tar.gz b/data/.lfs/replay_g1_run.tar.gz new file mode 100644 index 0000000000..86368ec788 --- /dev/null +++ b/data/.lfs/replay_g1_run.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00cf21f65a15994895150f74044f5d00d7aa873d24f071d249ecbd09cb8f2b26 +size 559554274 diff --git a/dimos/agents/memory/spatial_vector_db.py b/dimos/agents/memory/spatial_vector_db.py index afc77df12a..a4eefb792b 100644 --- a/dimos/agents/memory/spatial_vector_db.py +++ b/dimos/agents/memory/spatial_vector_db.py @@ -200,6 +200,9 @@ def _process_query_results(self, results) -> List[Dict]: processed_results = [] for i, vector_id in enumerate(results["ids"]): + if isinstance(vector_id, list) and not vector_id: + continue + lookup_id = vector_id[0] if isinstance(vector_id, list) else vector_id # Create the result dictionary with metadata regardless of image availability diff --git a/dimos/agents2/skills/navigation.py b/dimos/agents2/skills/navigation.py index c20293f60e..7a2dd65edd 100644 --- a/dimos/agents2/skills/navigation.py +++ b/dimos/agents2/skills/navigation.py @@ -14,20 +14,22 @@ import time from typing import Any, Optional + import cv2 from reactivex import Observable +from reactivex.disposable import CompositeDisposable, Disposable +from dimos.models.qwen.video_query import BBox from dimos.models.vl.qwen import QwenVlModel +from dimos.msgs.geometry_msgs import PoseStamped +from dimos.msgs.geometry_msgs.Vector3 import make_vector3 from dimos.msgs.sensor_msgs import Image from dimos.navigation.visual.query import get_object_bbox_from_image from dimos.protocol.skill.skill import SkillContainer, skill from dimos.robot.robot import UnitreeRobot from dimos.types.robot_location import RobotLocation -from dimos.models.qwen.video_query import BBox -from dimos.msgs.geometry_msgs import PoseStamped -from dimos.msgs.geometry_msgs.Vector3 import make_vector3 -from dimos.utils.transform_utils import euler_to_quaternion, quaternion_to_euler from dimos.utils.logging_config import setup_logger +from dimos.utils.transform_utils import euler_to_quaternion, quaternion_to_euler from dimos.navigation.bt_navigator.navigator import NavigatorState from reactivex.disposable import Disposable, CompositeDisposable @@ -203,11 +205,7 @@ def _get_bbox_for_current_frame(self, query: str) -> Optional[BBox]: if self._latest_image is None: return None - frame = cv2.cvtColor(self._latest_image.data, cv2.COLOR_RGB2BGR) - if frame is None: - return None - - return get_object_bbox_from_image(self._vl_model, frame, query) + return get_object_bbox_from_image(self._vl_model, self._latest_image, query) def _navigate_using_semantic_map(self, query: str) -> str: results = self._robot.spatial_memory.query_by_text(query) diff --git a/dimos/agents2/skills/ros_navigation.py b/dimos/agents2/skills/ros_navigation.py new file mode 100644 index 0000000000..d989f95b5d --- /dev/null +++ b/dimos/agents2/skills/ros_navigation.py @@ -0,0 +1,123 @@ +# Copyright 2025 
Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from typing import Any, Optional +from dimos.msgs.geometry_msgs import PoseStamped +from dimos.msgs.geometry_msgs.Vector3 import make_vector3 +from dimos.protocol.skill.skill import SkillContainer, skill +from dimos.utils.logging_config import setup_logger +from dimos.utils.transform_utils import euler_to_quaternion + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from dimos.robot.unitree_webrtc.unitree_g1 import UnitreeG1 + +logger = setup_logger(__file__) + + +class RosNavigation(SkillContainer): + _robot: "UnitreeG1" + _started: bool + + def __init__(self, robot: "UnitreeG1"): + self._robot = robot + self._similarity_threshold = 0.23 + self._started = False + + def __enter__(self) -> "RosNavigation": + self._started = True + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._close_module() + return False + + @skill() + def navigate_with_text(self, query: str) -> str: + """Navigate to a location by querying the existing semantic map using natural language. + + CALL THIS SKILL FOR ONE SUBJECT AT A TIME. For example: "Go to the person wearing a blue shirt in the living room", + you should call this skill twice, once for the person wearing a blue shirt and once for the living room. + + Args: + query: Text query to search for in the semantic map + """ + + print("X" * 10000) + + if not self._started: + raise ValueError(f"{self} has not been started.") + + success_msg = self._navigate_using_semantic_map(query) + if success_msg: + return success_msg + + return "Failed to navigate." + + def _navigate_using_semantic_map(self, query: str) -> str: + results = self._robot.spatial_memory.query_by_text(query) + + if not results: + return f"No matching location found in semantic map for '{query}'" + + best_match = results[0] + + goal_pose = self._get_goal_pose_from_result(best_match) + + if not goal_pose: + return f"Found a result for '{query}' but it didn't have a valid position." 
+ + result = self._robot.nav.go_to(goal_pose) + + if not result: + return f"Failed to navigate for '{query}'" + + return f"Successfuly arrived at '{query}'" + + @skill() + def stop_movement(self) -> str: + """Immediatly stop moving.""" + + if not self._started: + raise ValueError(f"{self} has not been started.") + + self._robot.cancel_navigation() + + return "Stopped" + + def _get_goal_pose_from_result(self, result: dict[str, Any]) -> Optional[PoseStamped]: + similarity = 1.0 - (result.get("distance") or 1) + if similarity < self._similarity_threshold: + logger.warning( + f"Match found but similarity score ({similarity:.4f}) is below threshold ({self._similarity_threshold})" + ) + return None + + metadata = result.get("metadata") + if not metadata: + return None + + first = metadata[0] + pos_x = first.get("pos_x", 0) + pos_y = first.get("pos_y", 0) + theta = first.get("rot_z", 0) + + return PoseStamped( + ts=time.time(), + position=make_vector3(pos_x, pos_y, 0), + orientation=euler_to_quaternion(make_vector3(0, 0, theta)), + frame_id="map", + ) diff --git a/dimos/agents2/temp/webcam_agent.py b/dimos/agents2/temp/webcam_agent.py index 8e2538f832..fed01ed96f 100644 --- a/dimos/agents2/temp/webcam_agent.py +++ b/dimos/agents2/temp/webcam_agent.py @@ -23,24 +23,23 @@ import sys import time from pathlib import Path - -from dotenv import load_dotenv - -from dimos.agents2.cli.human import HumanInput - -# Add parent directories to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) - from threading import Thread import reactivex as rx import reactivex.operators as ops +from dotenv import load_dotenv from dimos.agents2 import Agent, Output, Reducer, Stream, skill +from dimos.agents2.cli.human import HumanInput from dimos.agents2.spec import Model, Provider from dimos.core import LCMTransport, Module, pLCMTransport, start -from dimos.hardware.webcam import ColorCameraModule, Webcam -from dimos.msgs.sensor_msgs import Image +from dimos.hardware.camera import zed +from dimos.hardware.camera.module import CameraModule +from dimos.hardware.camera.webcam import Webcam +from dimos.msgs.geometry_msgs import Quaternion, Transform, Vector3 + +# from dimos.hardware.webcam import ColorCameraModule, Webcam +from dimos.msgs.sensor_msgs import CameraInfo, Image from dimos.protocol.skill.test_coordinator import SkillContainerTest from dimos.robot.unitree_webrtc.unitree_go2 import UnitreeGo2 from dimos.robot.unitree_webrtc.unitree_skill_container import UnitreeSkillContainer @@ -110,7 +109,24 @@ def main(): ) testcontainer = dimos.deploy(SkillContainerTest) - webcam = dimos.deploy(ColorCameraModule, hardware=lambda: Webcam(camera_index=0)) + webcam = dimos.deploy( + CameraModule, + transform=Transform( + translation=Vector3(0.0, 0.0, 0.0), + rotation=Quaternion(0.0, 0.0, 0.0, 1.0), + frame_id="base_link", + child_frame_id="camera_link", + ), + hardware=lambda: Webcam( + camera_index=0, + frequency=15, + stereo_slice="left", + camera_info=zed.CameraInfo.SingleWebcam, + ), + ) + + webcam.camera_info.transport = LCMTransport("/camera_info", CameraInfo) + webcam.image.transport = LCMTransport("/image", Image) webcam.start() diff --git a/dimos/core/module.py b/dimos/core/module.py index e235ba16e0..0385aad041 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -117,7 +117,8 @@ def _close_rpc(self): @property def tf(self): if self._tf is None: - self._tf = self.config.tf_transport() + # self._tf = self.config.tf_transport() + self._tf = LCMTF() return self._tf @tf.setter diff --git 
a/dimos/hardware/camera/module.py b/dimos/hardware/camera/module.py new file mode 100644 index 0000000000..2b2880b80a --- /dev/null +++ b/dimos/hardware/camera/module.py @@ -0,0 +1,127 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import queue +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Generic, Literal, Optional, Protocol, TypeVar + +import reactivex as rx +from dimos_lcm.sensor_msgs import CameraInfo +from reactivex import operators as ops +from reactivex.disposable import Disposable +from reactivex.observable import Observable + +from dimos.agents2 import Output, Reducer, Stream, skill +from dimos.core import Module, Out, rpc +from dimos.core.module import Module, ModuleConfig +from dimos.hardware.camera.spec import ( + CameraHardware, +) +from dimos.hardware.camera.webcam import Webcam, WebcamConfig +from dimos.msgs.geometry_msgs import Quaternion, Transform, Vector3 +from dimos.msgs.sensor_msgs import Image +from dimos.msgs.sensor_msgs.Image import Image, sharpness_barrier + +default_transform = lambda: Transform( + translation=Vector3(0.0, 0.0, 0.0), + rotation=Quaternion(0.0, 0.0, 0.0, 1.0), + frame_id="base_link", + child_frame_id="camera_link", +) + + +@dataclass +class CameraModuleConfig(ModuleConfig): + frame_id: str = "camera_link" + transform: Optional[Transform] = field(default_factory=default_transform) + hardware: Callable[[], CameraHardware] | CameraHardware = Webcam + + +class CameraModule(Module): + image: Out[Image] = None + camera_info: Out[CameraInfo] = None + + hardware: CameraHardware = None + _module_subscription: Optional[Disposable] = None + _camera_info_subscription: Optional[Disposable] = None + _skill_stream: Optional[Observable[Image]] = None + + default_config = CameraModuleConfig + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @rpc + def start(self): + if callable(self.config.hardware): + self.hardware = self.config.hardware() + else: + self.hardware = self.config.hardware + + if self._module_subscription: + return "already started" + + stream = self.hardware.image_stream().pipe(sharpness_barrier(5)) + + # camera_info_stream = self.camera_info_stream(frequency=5.0) + + def publish_info(camera_info: CameraInfo): + self.camera_info.publish(camera_info) + + if self.config.transform is None: + return + + camera_link = self.config.transform + camera_link.ts = camera_info.ts + camera_optical = Transform( + translation=Vector3(0.0, 0.0, 0.0), + rotation=Quaternion(-0.5, 0.5, -0.5, 0.5), + frame_id="camera_link", + child_frame_id="camera_optical", + ts=camera_link.ts, + ) + + self.tf.publish(camera_link, camera_optical) + + self._camera_info_subscription = self.camera_info_stream().subscribe(publish_info) + self._module_subscription = stream.subscribe(self.image.publish) + + @skill(stream=Stream.passive, output=Output.image, reducer=Reducer.latest) + def video_stream(self) -> Image: + """implicit video stream skill""" + _queue = 
queue.Queue(maxsize=1) + self.hardware.image_stream().subscribe(_queue.put) + + for image in iter(_queue.get, None): + yield image + + def camera_info_stream(self, frequency: float = 5.0) -> Observable[CameraInfo]: + def camera_info(_) -> CameraInfo: + self.hardware.camera_info.ts = time.time() + return self.hardware.camera_info + + return rx.interval(1.0 / frequency).pipe(ops.map(camera_info)) + + def stop(self): + if self._module_subscription: + self._module_subscription.dispose() + self._module_subscription = None + if self._camera_info_subscription: + self._camera_info_subscription.dispose() + self._camera_info_subscription = None + # Also stop the hardware if it has a stop method + if self.hardware and hasattr(self.hardware, "stop"): + self.hardware.stop() + super().stop() diff --git a/dimos/hardware/camera/spec.py b/dimos/hardware/camera/spec.py new file mode 100644 index 0000000000..cc69db5d1c --- /dev/null +++ b/dimos/hardware/camera/spec.py @@ -0,0 +1,55 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod, abstractproperty +from typing import Generic, Optional, Protocol, TypeVar + +from dimos_lcm.sensor_msgs import CameraInfo +from reactivex.observable import Observable + +from dimos.msgs.sensor_msgs import Image +from dimos.protocol.service import Configurable + + +class CameraConfig(Protocol): + frame_id_prefix: Optional[str] + + +CameraConfigT = TypeVar("CameraConfigT", bound=CameraConfig) + + +class CameraHardware(ABC, Configurable[CameraConfigT], Generic[CameraConfigT]): + @abstractmethod + def image_stream(self) -> Observable[Image]: + pass + + @abstractproperty + def camera_info(self) -> CameraInfo: + pass + + +# This is an example, feel free to change spec for stereo cameras +# e.g., separate camera_info or streams for left/right, etc. +class StereoCameraHardware(ABC, Configurable[CameraConfigT], Generic[CameraConfigT]): + @abstractmethod + def image_stream(self) -> Observable[Image]: + pass + + @abstractmethod + def depth_stream(self) -> Observable[Image]: + pass + + @abstractproperty + def camera_info(self) -> CameraInfo: + pass diff --git a/dimos/hardware/camera/test_webcam.py b/dimos/hardware/camera/test_webcam.py new file mode 100644 index 0000000000..0f6a509084 --- /dev/null +++ b/dimos/hardware/camera/test_webcam.py @@ -0,0 +1,108 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import time + +import pytest + +from dimos import core +from dimos.hardware.camera import zed +from dimos.hardware.camera.module import CameraModule +from dimos.hardware.camera.webcam import Webcam +from dimos.msgs.geometry_msgs import Quaternion, Transform, Vector3 +from dimos.msgs.sensor_msgs import CameraInfo, Image + + +@pytest.mark.tool +def test_streaming_single(): + dimos = core.start(1) + + camera = dimos.deploy( + CameraModule, + transform=Transform( + translation=Vector3(0.05, 0.0, 0.0), + rotation=Quaternion(0.0, 0.0, 0.0, 1.0), + frame_id="sensor", + child_frame_id="camera_link", + ), + hardware=lambda: Webcam( + stereo_slice="left", + camera_index=0, + frequency=15, + camera_info=zed.CameraInfo.SingleWebcam, + ), + ) + + camera.image.transport = core.LCMTransport("/image1", Image) + camera.camera_info.transport = core.LCMTransport("/image1/camera_info", CameraInfo) + camera.start() + + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + camera.stop() + dimos.stop() + + +@pytest.mark.tool +def test_streaming_double(): + dimos = core.start(2) + + camera1 = dimos.deploy( + CameraModule, + transform=Transform( + translation=Vector3(0.05, 0.0, 0.0), + rotation=Quaternion(0.0, 0.0, 0.0, 1.0), + frame_id="sensor", + child_frame_id="camera_link", + ), + hardware=lambda: Webcam( + stereo_slice="left", + camera_index=0, + frequency=15, + camera_info=zed.CameraInfo.SingleWebcam, + ), + ) + + camera2 = dimos.deploy( + CameraModule, + transform=Transform( + translation=Vector3(0.05, 0.0, 0.0), + rotation=Quaternion(0.0, 0.0, 0.0, 1.0), + frame_id="sensor", + child_frame_id="camera_link", + ), + hardware=lambda: Webcam( + camera_index=4, + frequency=15, + stereo_slice="left", + camera_info=zed.CameraInfo.SingleWebcam, + ), + ) + + camera1.image.transport = core.LCMTransport("/image1", Image) + camera1.camera_info.transport = core.LCMTransport("/image1/camera_info", CameraInfo) + camera1.start() + camera2.image.transport = core.LCMTransport("/image2", Image) + camera2.camera_info.transport = core.LCMTransport("/image2/camera_info", CameraInfo) + camera2.start() + + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + camera1.stop() + camera2.stop() + dimos.stop() diff --git a/dimos/hardware/webcam.py b/dimos/hardware/camera/webcam.py similarity index 69% rename from dimos/hardware/webcam.py rename to dimos/hardware/camera/webcam.py index d7413ccb1b..87ba492d6e 100644 --- a/dimos/hardware/webcam.py +++ b/dimos/hardware/camera/webcam.py @@ -15,57 +15,36 @@ import queue import threading import time -from abc import ABC, abstractmethod, abstractproperty from dataclasses import dataclass, field from functools import cache from typing import Any, Callable, Generic, Literal, Optional, Protocol, TypeVar import cv2 -import numpy as np from dimos_lcm.sensor_msgs import CameraInfo from reactivex import create from reactivex.observable import Observable -from dimos.agents2 import Output, Reducer, Stream, skill -from dimos.core import Module, Out, rpc -from dimos.core.module import DaskModule, ModuleConfig +from dimos.hardware.camera.spec import ( + CameraConfig, + CameraHardware, +) from dimos.msgs.sensor_msgs import Image from dimos.msgs.sensor_msgs.Image import ImageFormat -from dimos.protocol.service import Configurable, Service from dimos.utils.reactive import backpressure -class CameraConfig(Protocol): - frame_id_prefix: Optional[str] - - -CameraConfigT = TypeVar("CameraConfigT", bound=CameraConfig) - - -# StereoCamera interface, for cameras that provide standard -# 
color, depth, pointcloud, and pose messages -class ColorCameraHardware(Configurable[CameraConfigT], Generic[CameraConfigT]): - @abstractmethod - def color_stream(self) -> Observable[Image]: - pass - - @abstractproperty - def camera_info(self) -> CameraInfo: - pass - - @dataclass class WebcamConfig(CameraConfig): - camera_index: int = 0 + camera_index: int = 0 # /dev/videoN frame_width: int = 640 frame_height: int = 480 - frequency: int = 10 + frequency: int = 15 camera_info: CameraInfo = field(default_factory=CameraInfo) frame_id_prefix: Optional[str] = None stereo_slice: Optional[Literal["left", "right"]] = None # For stereo cameras -class Webcam(ColorCameraHardware[WebcamConfig]): +class Webcam(CameraHardware[WebcamConfig]): default_config = WebcamConfig def __init__(self, *args, **kwargs): @@ -76,7 +55,7 @@ def __init__(self, *args, **kwargs): self._observer = None @cache - def color_stream(self) -> Observable[Image]: + def image_stream(self) -> Observable[Image]: """Create an observable that starts/stops camera on subscription""" def subscribe(observer, scheduler=None): @@ -117,7 +96,6 @@ def start(self): self._capture_thread = threading.Thread(target=self._capture_loop, daemon=True) self._capture_thread.start() - @rpc def stop(self): """Stop capturing frames""" # Signal thread to stop @@ -191,56 +169,6 @@ def _capture_loop(self): @property def camera_info(self) -> CameraInfo: - """Return the camera info from config""" return self.config.camera_info def emit(self, image: Image): ... - - def image_stream(self): - return self.image.observable() - - -@dataclass -class ColorCameraModuleConfig(ModuleConfig): - hardware: Callable[[], ColorCameraHardware] | ColorCameraHardware = Webcam - - -class ColorCameraModule(DaskModule): - image: Out[Image] = None - hardware: ColorCameraHardware = None - _module_subscription: Optional[Any] = None # Subscription disposable - _skill_stream: Optional[Observable[Image]] = None - default_config = ColorCameraModuleConfig - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - @rpc - def start(self): - if callable(self.config.hardware): - self.hardware = self.config.hardware() - else: - self.hardware = self.config.hardware - - if self._module_subscription: - return "already started" - stream = self.hardware.color_stream() - self._module_subscription = stream.subscribe(self.image.publish) - - @skill(stream=Stream.passive, output=Output.image, reducer=Reducer.latest) - def video_stream(self) -> Image: - """implicit video stream skill""" - _queue = queue.Queue(maxsize=1) - self.hardware.color_stream().subscribe(_queue.put) - - for image in iter(_queue.get, None): - yield image - - def stop(self): - if self._module_subscription: - self._module_subscription.dispose() - self._module_subscription = None - # Also stop the hardware if it has a stop method - if self.hardware and hasattr(self.hardware, "stop"): - self.hardware.stop() - super().stop() diff --git a/dimos/hardware/test_webcam.py b/dimos/hardware/test_webcam.py deleted file mode 100644 index e6d26d2b8d..0000000000 --- a/dimos/hardware/test_webcam.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time - -import pytest - -from dimos.core import LCMTransport, start -from dimos.hardware.webcam import ColorCameraModule, Webcam -from dimos.msgs.sensor_msgs import Image - - -@pytest.mark.tool -def test_basic(): - webcam = Webcam() - subscription = webcam.color_stream().subscribe( - on_next=lambda img: print(f"Got image: {img.width}x{img.height}"), - on_error=lambda e: print(f"Error: {e}"), - on_completed=lambda: print("Stream completed"), - ) - - # Keep the subscription alive for a few seconds - try: - time.sleep(3) - finally: - # Clean disposal - subscription.dispose() - print("Test completed") - - -@pytest.mark.tool -def test_module(): - dimos = start(1) - # Deploy ColorCameraModule, not Webcam directly - camera_module = dimos.deploy( - ColorCameraModule, - hardware=lambda: Webcam(camera_index=4, frequency=30, stereo_slice="left"), - ) - camera_module.image.transport = LCMTransport("/image", Image) - camera_module.start() - - test_transport = LCMTransport("/image", Image) - test_transport.subscribe(print) - - time.sleep(60) - - print("shutting down") - camera_module.stop() - time.sleep(1.0) - dimos.stop() - - -if __name__ == "__main__": - test_module() diff --git a/dimos/mapping/types.py b/dimos/mapping/types.py index 1ee2263e21..3ceb64c56b 100644 --- a/dimos/mapping/types.py +++ b/dimos/mapping/types.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from dataclasses import dataclass from typing import Optional, TypeAlias diff --git a/dimos/models/vl/__init__.py b/dimos/models/vl/__init__.py new file mode 100644 index 0000000000..8cb0a7944b --- /dev/null +++ b/dimos/models/vl/__init__.py @@ -0,0 +1,2 @@ +from dimos.models.vl.base import VlModel +from dimos.models.vl.qwen import QwenVlModel diff --git a/dimos/models/vl/base.py b/dimos/models/vl/base.py index c9df67b5fd..faab96363d 100644 --- a/dimos/models/vl/base.py +++ b/dimos/models/vl/base.py @@ -2,7 +2,9 @@ import numpy as np +from dimos.msgs.sensor_msgs import Image + class VlModel(ABC): @abstractmethod - def query(self, image: np.ndarray, query: str) -> str: ... + def query(self, image: Image | np.ndarray, query: str) -> str: ... 
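The vl/base.py hunk above widens VlModel.query to accept either the dimos Image type or a raw numpy array, and the qwen.py hunk that follows implements that contract. A minimal call-pattern sketch under stated assumptions — the import paths shown elsewhere in this diff (dimos.models.vl.QwenVlModel, dimos.msgs.sensor_msgs.Image, Image.from_numpy), an API key for the OpenAI-compatible Qwen endpoint already configured in the environment, and a placeholder uint8 frame — not part of the patch itself:

import numpy as np

from dimos.models.vl import QwenVlModel
from dimos.msgs.sensor_msgs import Image

# Assumption: with api_key=None the client falls back to an environment variable.
model = QwenVlModel()

# Preferred path after this change: pass the dimos Image straight through;
# the model encodes it internally via Image.to_base64().
frame = Image.from_numpy(np.zeros((480, 640, 3), dtype=np.uint8))  # placeholder frame
print(model.query(frame, "What objects are visible?"))

# Legacy path: a raw numpy array still works, but qwen.py now emits a
# DeprecationWarning and wraps it with Image.from_numpy() before encoding.
print(model.query(np.zeros((480, 640, 3), dtype=np.uint8), "What objects are visible?"))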
diff --git a/dimos/models/vl/qwen.py b/dimos/models/vl/qwen.py index 4944f081ae..05ad4715c5 100644 --- a/dimos/models/vl/qwen.py +++ b/dimos/models/vl/qwen.py @@ -1,12 +1,11 @@ import os -import base64 -import io from typing import Optional + import numpy as np from openai import OpenAI -from PIL import Image from dimos.models.vl.base import VlModel +from dimos.msgs.sensor_msgs import Image class QwenVlModel(VlModel): @@ -27,11 +26,19 @@ def __init__(self, api_key: Optional[str] = None, model_name: str = "qwen2.5-vl- api_key=api_key, ) - def query(self, image: np.ndarray, query: str) -> str: - pil_image = Image.fromarray(image.astype("uint8")) - buffered = io.BytesIO() - pil_image.save(buffered, format="PNG") - img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + def query(self, image: Image | np.ndarray, query: str) -> str: + if isinstance(image, np.ndarray): + import warnings + + warnings.warn( + "QwenVlModel.query should receive standard dimos Image type, not a numpy array", + DeprecationWarning, + stacklevel=2, + ) + + image = Image.from_numpy(image) + + img_base64 = image.to_base64() response = self._client.chat.completions.create( model=self._model_name, diff --git a/dimos/msgs/foxglove_msgs/Color.py b/dimos/msgs/foxglove_msgs/Color.py new file mode 100644 index 0000000000..30362f837a --- /dev/null +++ b/dimos/msgs/foxglove_msgs/Color.py @@ -0,0 +1,50 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import hashlib +from dimos_lcm.foxglove_msgs import Color as LCMColor + + +class Color(LCMColor): + """Color with convenience methods.""" + + @classmethod + def from_string(cls, name: str, alpha: float = 0.2) -> Color: + """Generate a consistent color from a string using hash function. + + Args: + name: String to generate color from + alpha: Transparency value (0.0-1.0) + + Returns: + Color instance with deterministic RGB values + """ + # Hash the string to get consistent values + hash_obj = hashlib.md5(name.encode()) + hash_bytes = hash_obj.digest() + + # Use first 3 bytes for RGB (0-255) + r = hash_bytes[0] / 255.0 + g = hash_bytes[1] / 255.0 + b = hash_bytes[2] / 255.0 + + # Create and return color instance + color = cls() + color.r = r + color.g = g + color.b = b + color.a = alpha + return color diff --git a/dimos/msgs/geometry_msgs/Transform.py b/dimos/msgs/geometry_msgs/Transform.py index 468d009519..4db4c929a7 100644 --- a/dimos/msgs/geometry_msgs/Transform.py +++ b/dimos/msgs/geometry_msgs/Transform.py @@ -87,6 +87,9 @@ def lcm_transform(self) -> LCMTransformStamped: ), ) + def apply(self, other: "Transform") -> "Transform": + return self.__add__(other) + def __add__(self, other: "Transform") -> "Transform": """Compose two transforms (transform composition). 
@@ -248,7 +251,7 @@ def from_pose(cls, frame_id: str, pose: "Pose | PoseStamped") -> "Transform": else: raise TypeError(f"Expected Pose or PoseStamped, got {type(pose).__name__}") - def to_pose(self) -> "PoseStamped": + def to_pose(self, **kwargs) -> "PoseStamped": """Create a Transform from a Pose or PoseStamped. Args: @@ -262,9 +265,12 @@ def to_pose(self) -> "PoseStamped": # Handle both Pose and PoseStamped return PoseStamped( - position=self.translation, - orientation=self.rotation, - frame_id=self.frame_id, + **{ + "position": self.translation, + "orientation": self.rotation, + "frame_id": self.frame_id, + }, + **kwargs, ) def to_matrix(self) -> "np.ndarray": diff --git a/dimos/msgs/geometry_msgs/Vector3.py b/dimos/msgs/geometry_msgs/Vector3.py index 72e2fcef36..2eb204693b 100644 --- a/dimos/msgs/geometry_msgs/Vector3.py +++ b/dimos/msgs/geometry_msgs/Vector3.py @@ -155,6 +155,10 @@ def getArrow(): return f"{getArrow()} Vector {self.__repr__()}" + def agent_encode(self) -> dict: + """Encode the vector for agent communication.""" + return {"x": self.x, "y": self.y, "z": self.z} + def serialize(self) -> dict: """Serialize the vector to a tuple.""" return {"type": "vector", "c": (self.x, self.y, self.z)} diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py index b79e07639f..30c74fd243 100644 --- a/dimos/msgs/sensor_msgs/Image.py +++ b/dimos/msgs/sensor_msgs/Image.py @@ -13,17 +13,15 @@ # limitations under the License. import base64 +import functools import time from dataclasses import dataclass, field -from datetime import timedelta from enum import Enum from typing import Literal, Optional, Tuple, TypedDict import cv2 import numpy as np import reactivex as rx - -# Import LCM types from dimos_lcm.sensor_msgs.Image import Image as LCMImage from dimos_lcm.std_msgs.Header import Header from reactivex import operators as ops @@ -31,6 +29,12 @@ from reactivex.scheduler import ThreadPoolScheduler from dimos.types.timestamped import Timestamped, TimestampedBufferCollection, to_human_readable +from dimos.utils.reactive import quality_barrier + +try: + from sensor_msgs.msg import Image as ROSImage +except ImportError: + ROSImage = None class ImageFormat(Enum): @@ -297,6 +301,7 @@ def crop(self, x: int, y: int, width: int, height: int) -> "Image": ts=self.ts, ) + @functools.cached_property def sharpness(self) -> float: """ Compute the Tenengrad focus measure for an image. @@ -497,6 +502,67 @@ def _parse_encoding(encoding: str) -> dict: return encoding_map[encoding] + @classmethod + def from_ros_msg(cls, ros_msg: ROSImage) -> "Image": + """Create an Image from a ROS sensor_msgs/Image message. 
+ + Args: + ros_msg: ROS Image message + + Returns: + Image instance + """ + # Convert timestamp from ROS header + ts = ros_msg.header.stamp.sec + (ros_msg.header.stamp.nanosec / 1_000_000_000) + + # Parse encoding to determine format and data type + format_info = cls._parse_encoding(ros_msg.encoding) + + # Convert data from ROS message (array.array) to numpy array + data_array = np.frombuffer(ros_msg.data, dtype=format_info["dtype"]) + + # Reshape to image dimensions + if format_info["channels"] == 1: + data_array = data_array.reshape((ros_msg.height, ros_msg.width)) + else: + data_array = data_array.reshape( + (ros_msg.height, ros_msg.width, format_info["channels"]) + ) + + # Crop to center 1/3 of the image (simulate 120-degree FOV from 360-degree) + original_width = data_array.shape[1] + crop_width = original_width // 3 + start_x = (original_width - crop_width) // 2 + end_x = start_x + crop_width + + # Crop the image horizontally to center 1/3 + if len(data_array.shape) == 2: + # Grayscale image + data_array = data_array[:, start_x:end_x] + else: + # Color image + data_array = data_array[:, start_x:end_x, :] + + # Fix color channel order: if ROS sends RGB but we expect BGR, swap channels + # ROS typically uses rgb8 encoding, but OpenCV/our system expects BGR + if format_info["format"] == ImageFormat.RGB: + # Convert RGB to BGR by swapping channels + if len(data_array.shape) == 3 and data_array.shape[2] == 3: + data_array = data_array[:, :, [2, 1, 0]] # RGB -> BGR + format_info["format"] = ImageFormat.BGR + elif format_info["format"] == ImageFormat.RGBA: + # Convert RGBA to BGRA by swapping channels + if len(data_array.shape) == 3 and data_array.shape[2] == 4: + data_array = data_array[:, :, [2, 1, 0, 3]] # RGBA -> BGRA + format_info["format"] = ImageFormat.BGRA + + return cls( + data=data_array, + format=format_info["format"], + frame_id=ros_msg.header.frame_id, + ts=ts, + ) + def __repr__(self) -> str: """String representation.""" return ( @@ -522,18 +588,8 @@ def __len__(self) -> int: def sharpness_window(target_frequency: float, source: Observable[Image]) -> Observable[Image]: - window = TimestampedBufferCollection(1.0 / target_frequency) - source.subscribe(window.add) - - thread_scheduler = ThreadPoolScheduler(max_workers=1) - - def find_best(*argv): - if not window._items: - return None + raise NotImplementedError("use sharpness_barrier instead") - found = max(window._items, key=lambda x: x.sharpness()) - return found - return rx.interval(1.0 / target_frequency).pipe( - ops.observe_on(thread_scheduler), ops.map(find_best), ops.filter(lambda x: x is not None) - ) +def sharpness_barrier(target_frequency: float): + return quality_barrier(lambda x: x.sharpness, target_frequency) diff --git a/dimos/msgs/sensor_msgs/Joy.py b/dimos/msgs/sensor_msgs/Joy.py new file mode 100644 index 0000000000..e528b304b6 --- /dev/null +++ b/dimos/msgs/sensor_msgs/Joy.py @@ -0,0 +1,181 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import time +from typing import List, TypeAlias + +from dimos_lcm.sensor_msgs import Joy as LCMJoy + +try: + from sensor_msgs.msg import Joy as ROSJoy +except ImportError: + ROSJoy = None + +from plum import dispatch + +from dimos.types.timestamped import Timestamped + +# Types that can be converted to/from Joy +JoyConvertable: TypeAlias = ( + tuple[List[float], List[int]] | dict[str, List[float] | List[int]] | LCMJoy +) + + +def sec_nsec(ts): + s = int(ts) + return [s, int((ts - s) * 1_000_000_000)] + + +class Joy(Timestamped): + msg_name = "sensor_msgs.Joy" + ts: float + frame_id: str + axes: List[float] + buttons: List[int] + + @dispatch + def __init__( + self, + ts: float = 0.0, + frame_id: str = "", + axes: List[float] | None = None, + buttons: List[int] | None = None, + ) -> None: + """Initialize a Joy message. + + Args: + ts: Timestamp in seconds + frame_id: Frame ID for the message + axes: List of axis values (typically -1.0 to 1.0) + buttons: List of button states (0 or 1) + """ + self.ts = ts if ts != 0 else time.time() + self.frame_id = frame_id + self.axes = axes if axes is not None else [] + self.buttons = buttons if buttons is not None else [] + + @dispatch + def __init__(self, joy_tuple: tuple[List[float], List[int]]) -> None: + """Initialize from a tuple of (axes, buttons).""" + self.ts = time.time() + self.frame_id = "" + self.axes = list(joy_tuple[0]) + self.buttons = list(joy_tuple[1]) + + @dispatch + def __init__(self, joy_dict: dict[str, List[float] | List[int]]) -> None: + """Initialize from a dictionary with 'axes' and 'buttons' keys.""" + self.ts = joy_dict.get("ts", time.time()) + self.frame_id = joy_dict.get("frame_id", "") + self.axes = list(joy_dict.get("axes", [])) + self.buttons = list(joy_dict.get("buttons", [])) + + @dispatch + def __init__(self, joy: Joy) -> None: + """Initialize from another Joy (copy constructor).""" + self.ts = joy.ts + self.frame_id = joy.frame_id + self.axes = list(joy.axes) + self.buttons = list(joy.buttons) + + @dispatch + def __init__(self, lcm_joy: LCMJoy) -> None: + """Initialize from an LCM Joy message.""" + self.ts = lcm_joy.header.stamp.sec + (lcm_joy.header.stamp.nsec / 1_000_000_000) + self.frame_id = lcm_joy.header.frame_id + self.axes = list(lcm_joy.axes) + self.buttons = list(lcm_joy.buttons) + + def lcm_encode(self) -> bytes: + lcm_msg = LCMJoy() + [lcm_msg.header.stamp.sec, lcm_msg.header.stamp.nsec] = sec_nsec(self.ts) + lcm_msg.header.frame_id = self.frame_id + lcm_msg.axes_length = len(self.axes) + lcm_msg.axes = self.axes + lcm_msg.buttons_length = len(self.buttons) + lcm_msg.buttons = self.buttons + return lcm_msg.lcm_encode() + + @classmethod + def lcm_decode(cls, data: bytes) -> Joy: + lcm_msg = LCMJoy.lcm_decode(data) + return cls( + ts=lcm_msg.header.stamp.sec + (lcm_msg.header.stamp.nsec / 1_000_000_000), + frame_id=lcm_msg.header.frame_id, + axes=list(lcm_msg.axes) if lcm_msg.axes else [], + buttons=list(lcm_msg.buttons) if lcm_msg.buttons else [], + ) + + def __str__(self) -> str: + return ( + f"Joy(axes={len(self.axes)} values, buttons={len(self.buttons)} values, " + f"frame_id='{self.frame_id}')" + ) + + def __repr__(self) -> str: + return ( + f"Joy(ts={self.ts}, frame_id='{self.frame_id}', " + f"axes={self.axes}, buttons={self.buttons})" + ) + + def __eq__(self, other) -> bool: + """Check if two Joy messages are equal.""" + if not isinstance(other, Joy): + return False + return ( + self.axes == other.axes + and self.buttons == other.buttons + and self.frame_id == 
other.frame_id + ) + + @classmethod + def from_ros_msg(cls, ros_msg: ROSJoy) -> "Joy": + """Create a Joy from a ROS sensor_msgs/Joy message. + + Args: + ros_msg: ROS Joy message + + Returns: + Joy instance + """ + # Convert timestamp from ROS header + ts = ros_msg.header.stamp.sec + (ros_msg.header.stamp.nanosec / 1_000_000_000) + + return cls( + ts=ts, + frame_id=ros_msg.header.frame_id, + axes=list(ros_msg.axes), + buttons=list(ros_msg.buttons), + ) + + def to_ros_msg(self) -> ROSJoy: + """Convert to a ROS sensor_msgs/Joy message. + + Returns: + ROS Joy message + """ + ros_msg = ROSJoy() + + # Set header + ros_msg.header.frame_id = self.frame_id + ros_msg.header.stamp.sec = int(self.ts) + ros_msg.header.stamp.nanosec = int((self.ts - int(self.ts)) * 1_000_000_000) + + # Set axes and buttons + ros_msg.axes = self.axes + ros_msg.buttons = self.buttons + + return ros_msg diff --git a/dimos/msgs/sensor_msgs/PointCloud2.py b/dimos/msgs/sensor_msgs/PointCloud2.py index 8004032ceb..d81c8d0198 100644 --- a/dimos/msgs/sensor_msgs/PointCloud2.py +++ b/dimos/msgs/sensor_msgs/PointCloud2.py @@ -14,6 +14,7 @@ from __future__ import annotations +import functools import struct import time from typing import Optional @@ -28,6 +29,8 @@ from dimos_lcm.sensor_msgs.PointField import PointField from dimos_lcm.std_msgs.Header import Header +from dimos.msgs.geometry_msgs import Vector3 + # Import ROS types try: from sensor_msgs.msg import PointCloud2 as ROSPointCloud2 @@ -73,14 +76,83 @@ def from_numpy( pcd.points = o3d.utility.Vector3dVector(points) return cls(pointcloud=pcd, ts=timestamp, frame_id=frame_id) + def __str__(self) -> str: + return f"PointCloud2(frame_id='{self.frame_id}', num_points={len(self.pointcloud.points)})" + + @functools.cached_property + def center(self) -> Vector3: + """Calculate the center of the pointcloud in world frame.""" + center = np.asarray(self.pointcloud.points).mean(axis=0) + return Vector3(*center) + def points(self): return self.pointcloud.points + def __add__(self, other: PointCloud2) -> PointCloud2: + """Combine two PointCloud2 instances into one. + + The resulting point cloud contains points from both inputs. + The frame_id and timestamp are taken from the first point cloud. + + Args: + other: Another PointCloud2 instance to combine with + + Returns: + New PointCloud2 instance containing combined points + """ + if not isinstance(other, PointCloud2): + raise ValueError("Can only add PointCloud2 to another PointCloud2") + + return PointCloud2( + pointcloud=self.pointcloud + other.pointcloud, + frame_id=self.frame_id, + ts=max(self.ts, other.ts), + ) + # TODO what's the usual storage here? is it already numpy? 
def as_numpy(self) -> np.ndarray: """Get points as numpy array.""" return np.asarray(self.pointcloud.points) + @functools.cache + def get_axis_aligned_bounding_box(self) -> o3d.geometry.AxisAlignedBoundingBox: + """Get axis-aligned bounding box of the point cloud.""" + return self.pointcloud.get_axis_aligned_bounding_box() + + @functools.cache + def get_oriented_bounding_box(self) -> o3d.geometry.OrientedBoundingBox: + """Get oriented bounding box of the point cloud.""" + return self.pointcloud.get_oriented_bounding_box() + + @functools.cache + def get_bounding_box_dimensions(self) -> tuple[float, float, float]: + """Get dimensions (width, height, depth) of axis-aligned bounding box.""" + bbox = self.get_axis_aligned_bounding_box() + extent = bbox.get_extent() + return tuple(extent) + + def bounding_box_intersects(self, other: "PointCloud2") -> bool: + # Get axis-aligned bounding boxes + bbox1 = self.get_axis_aligned_bounding_box() + bbox2 = other.get_axis_aligned_bounding_box() + + # Get min and max bounds + min1 = bbox1.get_min_bound() + max1 = bbox1.get_max_bound() + min2 = bbox2.get_min_bound() + max2 = bbox2.get_max_bound() + + # Check overlap in all three dimensions + # Boxes intersect if they overlap in ALL dimensions + return ( + min1[0] <= max2[0] + and max1[0] >= min2[0] + and min1[1] <= max2[1] + and max1[1] >= min2[1] + and min1[2] <= max2[2] + and max1[2] >= min2[2] + ) + def lcm_encode(self, frame_id: Optional[str] = None) -> bytes: """Convert to LCM PointCloud2 message.""" msg = LCMPointCloud2() diff --git a/dimos/msgs/sensor_msgs/__init__.py b/dimos/msgs/sensor_msgs/__init__.py index a7afafe2f2..9a8a7b54fe 100644 --- a/dimos/msgs/sensor_msgs/__init__.py +++ b/dimos/msgs/sensor_msgs/__init__.py @@ -1,3 +1,4 @@ from dimos.msgs.sensor_msgs.Image import Image, ImageFormat from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Joy import Joy diff --git a/dimos/msgs/sensor_msgs/test_Joy.py b/dimos/msgs/sensor_msgs/test_Joy.py new file mode 100644 index 0000000000..fd11624b08 --- /dev/null +++ b/dimos/msgs/sensor_msgs/test_Joy.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import time + +try: + from sensor_msgs.msg import Joy as ROSJoy + from std_msgs.msg import Header as ROSHeader + + ROS_AVAILABLE = True +except ImportError: + ROSJoy = None + ROSHeader = None + ROS_AVAILABLE = False + +from dimos.msgs.sensor_msgs.Joy import Joy + + +def test_lcm_encode_decode(): + """Test LCM encode/decode preserves Joy data.""" + print("Testing Joy LCM encode/decode...") + + # Create test joy message with sample gamepad data + original = Joy( + ts=1234567890.123456789, + frame_id="gamepad", + axes=[0.5, -0.25, 1.0, -1.0, 0.0, 0.75], # 6 axes (e.g., left/right sticks + triggers) + buttons=[1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], # 12 buttons + ) + + # Encode to LCM bytes + encoded = original.lcm_encode() + assert isinstance(encoded, bytes) + assert len(encoded) > 0 + + # Decode back + decoded = Joy.lcm_decode(encoded) + + # Verify all fields match + assert abs(decoded.ts - original.ts) < 1e-9 + assert decoded.frame_id == original.frame_id + assert decoded.axes == original.axes + assert decoded.buttons == original.buttons + + print("✓ Joy LCM encode/decode test passed") + + +def test_initialization_methods(): + """Test various initialization methods for Joy.""" + print("Testing Joy initialization methods...") + + # Test default initialization + joy1 = Joy() + assert joy1.axes == [] + assert joy1.buttons == [] + assert joy1.frame_id == "" + assert joy1.ts > 0 # Should have current time + + # Test full initialization + joy2 = Joy(ts=1234567890.0, frame_id="xbox_controller", axes=[0.1, 0.2, 0.3], buttons=[1, 0, 1]) + assert joy2.ts == 1234567890.0 + assert joy2.frame_id == "xbox_controller" + assert joy2.axes == [0.1, 0.2, 0.3] + assert joy2.buttons == [1, 0, 1] + + # Test tuple initialization + joy3 = Joy(([0.5, -0.5], [1, 1, 0])) + assert joy3.axes == [0.5, -0.5] + assert joy3.buttons == [1, 1, 0] + + # Test dict initialization + joy4 = Joy({"axes": [0.7, 0.8], "buttons": [0, 1], "frame_id": "ps4_controller"}) + assert joy4.axes == [0.7, 0.8] + assert joy4.buttons == [0, 1] + assert joy4.frame_id == "ps4_controller" + + # Test copy constructor + joy5 = Joy(joy2) + assert joy5.ts == joy2.ts + assert joy5.frame_id == joy2.frame_id + assert joy5.axes == joy2.axes + assert joy5.buttons == joy2.buttons + assert joy5 is not joy2 # Different objects + + print("✓ Joy initialization methods test passed") + + +def test_equality(): + """Test Joy equality comparison.""" + print("Testing Joy equality...") + + joy1 = Joy(ts=1000.0, frame_id="controller1", axes=[0.5, -0.5], buttons=[1, 0, 1]) + + joy2 = Joy(ts=1000.0, frame_id="controller1", axes=[0.5, -0.5], buttons=[1, 0, 1]) + + joy3 = Joy( + ts=1000.0, + frame_id="controller2", # Different frame_id + axes=[0.5, -0.5], + buttons=[1, 0, 1], + ) + + joy4 = Joy( + ts=1000.0, + frame_id="controller1", + axes=[0.6, -0.5], # Different axes + buttons=[1, 0, 1], + ) + + # Same content should be equal + assert joy1 == joy2 + + # Different frame_id should not be equal + assert joy1 != joy3 + + # Different axes should not be equal + assert joy1 != joy4 + + # Different type should not be equal + assert joy1 != "not a joy" + assert joy1 != 42 + + print("✓ Joy equality test passed") + + +def test_string_representation(): + """Test Joy string representations.""" + print("Testing Joy string representations...") + + joy = Joy( + ts=1234567890.123, + frame_id="test_controller", + axes=[0.1, -0.2, 0.3, 0.4], + buttons=[1, 0, 1, 0, 0, 1], + ) + + # Test __str__ + str_repr = str(joy) + assert "Joy" in str_repr + assert "axes=4 values" in 
str_repr + assert "buttons=6 values" in str_repr + assert "test_controller" in str_repr + + # Test __repr__ + repr_str = repr(joy) + assert "Joy" in repr_str + assert "1234567890.123" in repr_str + assert "test_controller" in repr_str + assert "[0.1, -0.2, 0.3, 0.4]" in repr_str + assert "[1, 0, 1, 0, 0, 1]" in repr_str + + print("✓ Joy string representation test passed") + + +@pytest.mark.skipif(not ROS_AVAILABLE, reason="ROS not available") +def test_ros_conversion(): + """Test conversion to/from ROS Joy messages.""" + print("Testing Joy ROS conversion...") + + # Create a ROS Joy message + ros_msg = ROSJoy() + ros_msg.header = ROSHeader() + ros_msg.header.stamp.sec = 1234567890 + ros_msg.header.stamp.nanosec = 123456789 + ros_msg.header.frame_id = "ros_gamepad" + ros_msg.axes = [0.25, -0.75, 0.0, 1.0, -1.0] + ros_msg.buttons = [1, 1, 0, 0, 1, 0, 1, 0] + + # Convert from ROS + joy = Joy.from_ros_msg(ros_msg) + assert abs(joy.ts - 1234567890.123456789) < 1e-9 + assert joy.frame_id == "ros_gamepad" + assert joy.axes == [0.25, -0.75, 0.0, 1.0, -1.0] + assert joy.buttons == [1, 1, 0, 0, 1, 0, 1, 0] + + # Convert back to ROS + ros_msg2 = joy.to_ros_msg() + assert ros_msg2.header.frame_id == "ros_gamepad" + assert ros_msg2.header.stamp.sec == 1234567890 + assert abs(ros_msg2.header.stamp.nanosec - 123456789) < 100 # Allow small rounding + assert list(ros_msg2.axes) == [0.25, -0.75, 0.0, 1.0, -1.0] + assert list(ros_msg2.buttons) == [1, 1, 0, 0, 1, 0, 1, 0] + + print("✓ Joy ROS conversion test passed") + + +def test_edge_cases(): + """Test Joy with edge cases.""" + print("Testing Joy edge cases...") + + # Empty axes and buttons + joy1 = Joy(axes=[], buttons=[]) + assert joy1.axes == [] + assert joy1.buttons == [] + encoded = joy1.lcm_encode() + decoded = Joy.lcm_decode(encoded) + assert decoded.axes == [] + assert decoded.buttons == [] + + # Large number of axes and buttons + many_axes = [float(i) / 100.0 for i in range(20)] + many_buttons = [i % 2 for i in range(32)] + joy2 = Joy(axes=many_axes, buttons=many_buttons) + assert len(joy2.axes) == 20 + assert len(joy2.buttons) == 32 + encoded = joy2.lcm_encode() + decoded = Joy.lcm_decode(encoded) + # Check axes with floating point tolerance + assert len(decoded.axes) == len(many_axes) + for i, (a, b) in enumerate(zip(decoded.axes, many_axes)): + assert abs(a - b) < 1e-6, f"Axis {i}: {a} != {b}" + assert decoded.buttons == many_buttons + + # Extreme axis values + extreme_axes = [-1.0, 1.0, 0.0, -0.999999, 0.999999] + joy3 = Joy(axes=extreme_axes) + assert joy3.axes == extreme_axes + + print("✓ Joy edge cases test passed") + + +if __name__ == "__main__": + test_lcm_encode_decode() + test_initialization_methods() + test_equality() + test_string_representation() + if ROS_AVAILABLE: + test_ros_conversion() + test_edge_cases() + print("\nAll Joy tests passed! 
✓") diff --git a/dimos/msgs/sensor_msgs/test_PointCloud2.py b/dimos/msgs/sensor_msgs/test_PointCloud2.py index ac115462be..cb18d6fd9d 100644 --- a/dimos/msgs/sensor_msgs/test_PointCloud2.py +++ b/dimos/msgs/sensor_msgs/test_PointCloud2.py @@ -234,6 +234,80 @@ def test_ros_conversion(): print("\n✓ All ROS conversion tests passed!") +def test_bounding_box_intersects(): + """Test bounding_box_intersects method with various scenarios.""" + # Test 1: Overlapping boxes + pc1 = PointCloud2.from_numpy(np.array([[0, 0, 0], [2, 2, 2]])) + pc2 = PointCloud2.from_numpy(np.array([[1, 1, 1], [3, 3, 3]])) + assert pc1.bounding_box_intersects(pc2) == True + assert pc2.bounding_box_intersects(pc1) == True # Should be symmetric + + # Test 2: Non-overlapping boxes + pc3 = PointCloud2.from_numpy(np.array([[0, 0, 0], [1, 1, 1]])) + pc4 = PointCloud2.from_numpy(np.array([[2, 2, 2], [3, 3, 3]])) + assert pc3.bounding_box_intersects(pc4) == False + assert pc4.bounding_box_intersects(pc3) == False + + # Test 3: Touching boxes (edge case - should be True) + pc5 = PointCloud2.from_numpy(np.array([[0, 0, 0], [1, 1, 1]])) + pc6 = PointCloud2.from_numpy(np.array([[1, 1, 1], [2, 2, 2]])) + assert pc5.bounding_box_intersects(pc6) == True + assert pc6.bounding_box_intersects(pc5) == True + + # Test 4: One box completely inside another + pc7 = PointCloud2.from_numpy(np.array([[0, 0, 0], [3, 3, 3]])) + pc8 = PointCloud2.from_numpy(np.array([[1, 1, 1], [2, 2, 2]])) + assert pc7.bounding_box_intersects(pc8) == True + assert pc8.bounding_box_intersects(pc7) == True + + # Test 5: Boxes overlapping only in 2 dimensions (not all 3) + pc9 = PointCloud2.from_numpy(np.array([[0, 0, 0], [2, 2, 1]])) + pc10 = PointCloud2.from_numpy(np.array([[1, 1, 2], [3, 3, 3]])) + assert pc9.bounding_box_intersects(pc10) == False + assert pc10.bounding_box_intersects(pc9) == False + + # Test 6: Real-world detection scenario with floating point coordinates + detection1_points = np.array( + [[-3.5, -0.3, 0.1], [-3.3, -0.2, 0.1], [-3.5, -0.3, 0.3], [-3.3, -0.2, 0.3]] + ) + pc_det1 = PointCloud2.from_numpy(detection1_points) + + detection2_points = np.array( + [[-3.4, -0.25, 0.15], [-3.2, -0.15, 0.15], [-3.4, -0.25, 0.35], [-3.2, -0.15, 0.35]] + ) + pc_det2 = PointCloud2.from_numpy(detection2_points) + + assert pc_det1.bounding_box_intersects(pc_det2) == True + + # Test 7: Single point clouds + pc_single1 = PointCloud2.from_numpy(np.array([[1.0, 1.0, 1.0]])) + pc_single2 = PointCloud2.from_numpy(np.array([[1.0, 1.0, 1.0]])) + pc_single3 = PointCloud2.from_numpy(np.array([[2.0, 2.0, 2.0]])) + + # Same point should intersect + assert pc_single1.bounding_box_intersects(pc_single2) == True + # Different points should not intersect + assert pc_single1.bounding_box_intersects(pc_single3) == False + + # Test 8: Empty point clouds + pc_empty1 = PointCloud2.from_numpy(np.array([]).reshape(0, 3)) + pc_empty2 = PointCloud2.from_numpy(np.array([]).reshape(0, 3)) + pc_nonempty = PointCloud2.from_numpy(np.array([[1.0, 1.0, 1.0]])) + + # Empty clouds should handle gracefully (Open3D returns inf bounds) + # This might raise an exception or return False - we should handle gracefully + try: + result = pc_empty1.bounding_box_intersects(pc_empty2) + # If no exception, verify behavior is consistent + assert isinstance(result, bool) + except: + # If it raises an exception, that's also acceptable for empty clouds + pass + + print("✓ All bounding box intersection tests passed!") + + if __name__ == "__main__": test_lcm_encode_decode() test_ros_conversion() + 
test_bounding_box_intersects() diff --git a/dimos/msgs/sensor_msgs/test_image.py b/dimos/msgs/sensor_msgs/test_image.py index 7a3c84a8b6..0af559904a 100644 --- a/dimos/msgs/sensor_msgs/test_image.py +++ b/dimos/msgs/sensor_msgs/test_image.py @@ -14,8 +14,9 @@ import numpy as np import pytest +from reactivex import operators as ops -from dimos.msgs.sensor_msgs.Image import Image, ImageFormat, sharpness_window +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat, sharpness_barrier, sharpness_window from dimos.utils.data import get_data from dimos.utils.testing import TimedSensorReplay @@ -65,7 +66,7 @@ def test_opencv_conversion(img: Image): @pytest.mark.tool -def test_sharpness_detector(): +def test_sharpness_stream(): get_data("unitree_office_walk") # Preload data for testing video_store = TimedSensorReplay( "unitree_office_walk/video", autocast=lambda x: Image.from_numpy(x).to_rgb() @@ -74,55 +75,85 @@ def test_sharpness_detector(): cnt = 0 for image in video_store.iterate(): cnt = cnt + 1 - print(image.sharpness()) + print(image.sharpness) if cnt > 30: return -@pytest.mark.tool -def test_sharpness_sliding_window_foxglove(): +def test_sharpness_barrier(): import time + from unittest.mock import MagicMock + + # Create mock images with known sharpness values + # This avoids loading real data from disk + mock_images = [] + sharpness_values = [0.3711, 0.3241, 0.3067, 0.2583, 0.3665] # Just 5 images for 1 window + + for i, sharp in enumerate(sharpness_values): + img = MagicMock() + img.sharpness = sharp + img.ts = 1758912038.208 + i * 0.01 # Simulate timestamps + mock_images.append(img) + + # Track what goes into windows and what comes out + start_wall_time = None + window_contents = [] # List of (wall_time, image) + emitted_images = [] + + def track_input(img): + """Track all images going into sharpness_barrier with wall-clock time""" + nonlocal start_wall_time + wall_time = time.time() + if start_wall_time is None: + start_wall_time = wall_time + relative_time = wall_time - start_wall_time + window_contents.append((relative_time, img)) + return img + + def track_output(img): + """Track what sharpness_barrier emits""" + emitted_images.append(img) + + # Use 20Hz frequency (0.05s windows) for faster test + # Emit images at 100Hz to get ~5 per window + from reactivex import from_iterable, interval + + source = from_iterable(mock_images).pipe( + ops.zip(interval(0.01)), # 100Hz emission rate + ops.map(lambda x: x[0]), # Extract just the image + ) - from dimos.msgs.geometry_msgs import Vector3 - from dimos.protocol.pubsub.lcmpubsub import LCM, Topic - - lcm = LCM() - lcm.start() - - ping = 0 - sharp_topic = Topic("/sharp", Image) - all_topic = Topic("/all", Image) - sharpness_topic = Topic("/sharpness", Vector3) - - get_data("unitree_office_walk") # Preload data for testing - video_stream = TimedSensorReplay( - "unitree_office_walk/video", autocast=lambda x: Image.from_numpy(x).to_rgb() - ).stream() + source.pipe( + ops.do_action(track_input), # Track inputs + sharpness_barrier(20), # 20Hz = 0.05s windows + ops.do_action(track_output), # Track outputs + ).run() - # Publish all images to all_topic - video_stream.subscribe(lambda x: lcm.publish(all_topic, x)) + # Only need 0.08s for 1 full window at 20Hz plus buffer + time.sleep(0.08) - def sharpness_vector(x: Image): - nonlocal ping - sharpness = x.sharpness() - if ping: - y = 1 - ping = ping - 1 - else: - y = 0 + # Verify we got correct emissions + assert len(emitted_images) >= 1, f"Expected at least 1 emission, got {len(emitted_images)}" 
- return Vector3([sharpness, y, 0]) + # Group inputs by wall-clock windows and verify we got the sharpest + window_duration = 0.05 # 20Hz - video_stream.subscribe(lambda x: lcm.publish(sharpness_topic, sharpness_vector(x))) + # Test just the first window + for window_idx in range(min(1, len(emitted_images))): + window_start = window_idx * window_duration + window_end = window_start + window_duration - def pub_sharp(x: Image): - nonlocal ping - ping = 3 - lcm.publish(sharp_topic, x) + # Get all images that arrived during this wall-clock window + window_imgs = [ + img for wall_time, img in window_contents if window_start <= wall_time < window_end + ] - sharpness_window( - 1, - source=video_stream, - ).subscribe(pub_sharp) + if window_imgs: + max_sharp = max(img.sharpness for img in window_imgs) + emitted_sharp = emitted_images[window_idx].sharpness - time.sleep(120) + # Verify we emitted the sharpest + assert abs(emitted_sharp - max_sharp) < 0.0001, ( + f"Window {window_idx}: Emitted image (sharp={emitted_sharp}) " + f"is not the sharpest (max={max_sharp})" + ) diff --git a/dimos/msgs/std_msgs/Bool.py b/dimos/msgs/std_msgs/Bool.py new file mode 100644 index 0000000000..6af250277e --- /dev/null +++ b/dimos/msgs/std_msgs/Bool.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Bool message type.""" + +from typing import ClassVar + +from dimos_lcm.std_msgs import Bool as LCMBool + +try: + from std_msgs.msg import Bool as ROSBool +except ImportError: + ROSBool = None + + +class Bool(LCMBool): + """ROS-compatible Bool message.""" + + msg_name = "std_msgs.Bool" + + def __init__(self, data: bool = False): + """Initialize Bool with data value.""" + self.data = data + + @classmethod + def from_ros_msg(cls, ros_msg: ROSBool) -> "Bool": + """Create a Bool from a ROS std_msgs/Bool message. + + Args: + ros_msg: ROS Bool message + + Returns: + Bool instance + """ + return cls(data=ros_msg.data) + + def to_ros_msg(self) -> ROSBool: + """Convert to a ROS std_msgs/Bool message. + + Returns: + ROS Bool message + """ + if ROSBool is None: + raise ImportError("ROS std_msgs not available") + ros_msg = ROSBool() + ros_msg.data = bool(self.data) + return ros_msg diff --git a/dimos/msgs/std_msgs/__init__.py b/dimos/msgs/std_msgs/__init__.py index 3ed93fa77d..898b1035b5 100644 --- a/dimos/msgs/std_msgs/__init__.py +++ b/dimos/msgs/std_msgs/__init__.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .Bool import Bool from .Header import Header from .Int32 import Int32 -__all__ = ["Header", "Int32"] +__all__ = ["Bool", "Header", "Int32"] diff --git a/dimos/navigation/visual/query.py b/dimos/navigation/visual/query.py index 42ec9623ef..7f54664c31 100644 --- a/dimos/navigation/visual/query.py +++ b/dimos/navigation/visual/query.py @@ -13,25 +13,23 @@ # limitations under the License. 
from typing import Optional -import numpy as np + from dimos.models.qwen.video_query import BBox from dimos.models.vl.base import VlModel +from dimos.msgs.sensor_msgs import Image from dimos.utils.generic import extract_json_from_llm_response def get_object_bbox_from_image( - vl_model: VlModel, image_data: np.ndarray, object_description: str + vl_model: VlModel, image: Image, object_description: str ) -> Optional[BBox]: - if not isinstance(image_data, np.ndarray): - raise ValueError("Frame must be a numpy array") - prompt = ( f"Look at this image and find the '{object_description}'. " "Return ONLY a JSON object with format: {'name': 'object_name', 'bbox': [x1, y1, x2, y2]} " "where x1,y1 is the top-left and x2,y2 is the bottom-right corner of the bounding box. If not found, return None." ) - response = vl_model.query(image_data, prompt) + response = vl_model.query(image, prompt) result = extract_json_from_llm_response(response) if not result: diff --git a/dimos/perception/detection2d/conftest.py b/dimos/perception/detection2d/conftest.py index c471965287..b19cdcbd00 100644 --- a/dimos/perception/detection2d/conftest.py +++ b/dimos/perception/detection2d/conftest.py @@ -12,21 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import functools -from typing import Optional, TypedDict +from typing import Callable, Optional, TypedDict, Union import pytest from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations -from dimos_lcm.sensor_msgs import CameraInfo, PointCloud2 +from dimos_lcm.foxglove_msgs.SceneUpdate import SceneUpdate +from dimos_lcm.visualization_msgs.MarkerArray import MarkerArray -from dimos.core import start -from dimos.core.transport import LCMTransport from dimos.msgs.geometry_msgs import Transform +from dimos.msgs.sensor_msgs import CameraInfo from dimos.msgs.sensor_msgs.Image import Image from dimos.perception.detection2d.module2D import Detection2DModule from dimos.perception.detection2d.module3D import Detection3DModule -from dimos.perception.detection2d.type import ImageDetections3D -from dimos.protocol.service import lcmservice as lcm +from dimos.perception.detection2d.moduleDB import ObjectDBModule +from dimos.perception.detection2d.type import ( + Detection2D, + Detection3D, + ImageDetections2D, + ImageDetections3D, +) from dimos.protocol.tf import TF from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule from dimos.robot.unitree_webrtc.type.lidar import LidarMessage @@ -44,123 +48,162 @@ class Moment(TypedDict, total=False): tf: TF annotations: Optional[ImageAnnotations] detections: Optional[ImageDetections3D] + markers: Optional[MarkerArray] + scene_update: Optional[SceneUpdate] + + +class Moment2D(Moment): + detections2d: ImageDetections2D + + +class Moment3D(Moment): + detections3d: ImageDetections3D + + +@pytest.fixture +def tf(): + t = TF() + yield t + t.stop() @pytest.fixture -def dimos_cluster(): - dimos = start(5) - yield dimos - dimos.stop() +def get_moment(tf): + def moment_provider(**kwargs) -> Moment: + seek = kwargs.get("seek", 10.0) + data_dir = "unitree_go2_lidar_corrected" + get_data(data_dir) -@pytest.fixture(scope="function") -def moment(): - data_dir = "unitree_go2_lidar_corrected" - get_data(data_dir) + lidar_frame = TimedSensorReplay(f"{data_dir}/lidar").find_closest_seek(seek) - seek = 10 + image_frame = TimedSensorReplay( + f"{data_dir}/video", + ).find_closest(lidar_frame.ts) - lidar_frame = 
TimedSensorReplay(f"{data_dir}/lidar").find_closest_seek(seek) + image_frame.frame_id = "camera_optical" - image_frame = TimedSensorReplay( - f"{data_dir}/video", - ).find_closest(lidar_frame.ts) + odom_frame = TimedSensorReplay(f"{data_dir}/odom", autocast=Odometry.from_msg).find_closest( + lidar_frame.ts + ) - image_frame.frame_id = "camera_optical" + transforms = ConnectionModule._odom_to_tf(odom_frame) + + tf.receive_transform(*transforms) + return { + "odom_frame": odom_frame, + "lidar_frame": lidar_frame, + "image_frame": image_frame, + "camera_info": ConnectionModule._camera_info(), + "transforms": transforms, + "tf": tf, + } + + return moment_provider - odom_frame = TimedSensorReplay(f"{data_dir}/odom", autocast=Odometry.from_msg).find_closest( - lidar_frame.ts - ) - transforms = ConnectionModule._odom_to_tf(odom_frame) - - tf = TF() - tf.publish(*transforms) - - yield { - "odom_frame": odom_frame, - "lidar_frame": lidar_frame, - "image_frame": image_frame, - "camera_info": ConnectionModule._camera_info(), - "transforms": transforms, - "tf": tf, - } - - # Cleanup - tf.stop() - - -@pytest.fixture(scope="function") -def publish_lcm(): - def publish(moment: Moment): - lcm.autoconf() - - transports = [] - - try: - lidar_frame_transport: LCMTransport = LCMTransport("/lidar", LidarMessage) - lidar_frame_transport.publish(moment.get("lidar_frame")) - transports.append(lidar_frame_transport) - - image_frame_transport: LCMTransport = LCMTransport("/image", Image) - image_frame_transport.publish(moment.get("image_frame")) - transports.append(image_frame_transport) - - odom_frame_transport: LCMTransport = LCMTransport("/odom", Odometry) - odom_frame_transport.publish(moment.get("odom_frame")) - transports.append(odom_frame_transport) - - camera_info_transport: LCMTransport = LCMTransport("/camera_info", CameraInfo) - camera_info_transport.publish(moment.get("camera_info")) - transports.append(camera_info_transport) - - annotations = moment.get("annotations") - if annotations: - annotations_transport: LCMTransport = LCMTransport("/annotations", ImageAnnotations) - annotations_transport.publish(annotations) - transports.append(annotations_transport) - - detections = moment.get("detections") - if detections: - for i, detection in enumerate(detections): - detections_transport: LCMTransport = LCMTransport( - f"/detected/pointcloud/{i}", PointCloud2 - ) - detections_transport.publish(detection.pointcloud) - transports.append(detections_transport) - - detections_image_transport: LCMTransport = LCMTransport( - f"/detected/image/{i}", Image - ) - detections_image_transport.publish(detection.cropped_image()) - transports.append(detections_image_transport) - finally: - # Cleanup all transports immediately after publishing - for transport in transports: - if transport._started: - transport.lcm.stop() - - return publish - - -@pytest.fixture(scope="function") -def detections2d(moment: Moment): +@pytest.fixture +def detection2d(get_moment_2d) -> Detection2D: + moment = get_moment_2d(seek=10.0) + assert len(moment["detections2d"]) > 0, "No detections found in the moment" + return moment["detections2d"][0] + + +@pytest.fixture +def detection3d(get_moment_3d) -> Detection3D: + moment = get_moment_3d(seek=10.0) + assert len(moment["detections3d"]) > 0, "No detections found in the moment" + print(moment["detections3d"]) + return moment["detections3d"][0] + + +@pytest.fixture +def get_moment_2d(get_moment) -> Callable[[], Moment2D]: module = Detection2DModule() - yield 
module.process_image_frame(moment["image_frame"]) + + def moment_provider(**kwargs) -> Moment2D: + moment = get_moment(**kwargs) + detections = module.process_image_frame(moment.get("image_frame")) + + return { + **moment, + "detections2d": detections, + } + + yield moment_provider module._close_module() -@pytest.fixture(scope="function") -def detections3d(moment: Moment): +@pytest.fixture +def get_moment_3d(get_moment_2d) -> Callable[[], Moment2D]: + module = None + + def moment_provider(**kwargs) -> Moment2D: + nonlocal module + moment = get_moment_2d(**kwargs) + + module = Detection3DModule(camera_info=moment["camera_info"]) + + camera_transform = moment["tf"].get("camera_optical", moment.get("lidar_frame").frame_id) + if camera_transform is None: + raise ValueError("No camera_optical transform in tf") + + return { + **moment, + "detections3d": module.process_frame( + moment["detections2d"], moment["lidar_frame"], camera_transform + ), + } + + yield moment_provider + print("Closing 3D detection module", module) + module._close_module() + + +@pytest.fixture +def object_db_module(get_moment): + """Create and populate an ObjectDBModule with detections from multiple frames.""" module2d = Detection2DModule() - detections2d = module2d.process_image_frame(moment["image_frame"]) - camera_transform = moment["tf"].get("camera_optical", "world") - if camera_transform is None: - raise ValueError("No camera_optical transform in tf") + module3d = Detection3DModule(camera_info=ConnectionModule._camera_info()) + moduleDB = ObjectDBModule( + camera_info=ConnectionModule._camera_info(), + goto=lambda obj_id: None, # No-op for testing + ) + + # Process 5 frames to build up object history + for i in range(5): + seek_value = 10.0 + (i * 2) + moment = get_moment(seek=seek_value) + + # Process 2D detections + imageDetections2d = module2d.process_image_frame(moment["image_frame"]) - module3d = Detection3DModule(camera_info=moment["camera_info"]) + # Get camera transform + camera_transform = moment["tf"].get("camera_optical", moment.get("lidar_frame").frame_id) - yield module3d.process_frame(detections2d, moment["lidar_frame"], camera_transform) + # Process 3D detections + imageDetections3d = module3d.process_frame( + imageDetections2d, moment["lidar_frame"], camera_transform + ) + # Add to database + moduleDB.add_detections(imageDetections3d) + + yield moduleDB module2d._close_module() module3d._close_module() + moduleDB._close_module() + + +@pytest.fixture +def first_object(object_db_module): + """Get the first object from the database.""" + objects = list(object_db_module.objects.values()) + assert len(objects) > 0, "No objects found in database" + return objects[0] + + +@pytest.fixture +def all_objects(object_db_module): + """Get all objects from the database.""" + return list(object_db_module.objects.values()) diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection2d/module2D.py index 92d1c0612b..82fb181be9 100644 --- a/dimos/perception/detection2d/module2D.py +++ b/dimos/perception/detection2d/module2D.py @@ -12,55 +12,158 @@ # See the License for the specific language governing permissions and # limitations under the License. 
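+"""Detection2DModule: runs a pluggable 2D detector (YOLO by default) plus optional VLM bounding-box queries on a sharpness-gated image stream."""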
import functools -from typing import Any, Callable, List, Optional +import json +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Callable, Optional +import numpy as np from dimos_lcm.foxglove_msgs.ImageAnnotations import ( ImageAnnotations, ) from reactivex import operators as ops from reactivex.observable import Observable +from reactivex.subject import Subject from dimos.core import In, Module, Out, rpc +from dimos.models.vl import QwenVlModel, VlModel from dimos.msgs.sensor_msgs import Image +from dimos.msgs.sensor_msgs.Image import sharpness_barrier from dimos.msgs.vision_msgs import Detection2DArray -from dimos.perception.detection2d.type import Detection2D, ImageDetections2D +from dimos.perception.detection2d.type import ( + Detection2D, + ImageDetections2D, + InconvinientDetectionFormat, +) from dimos.perception.detection2d.yolo_2d_det import Yolo2DDetector from dimos.utils.reactive import backpressure +class Detector(ABC): + @abstractmethod + def process_image(self, image: np.ndarray) -> InconvinientDetectionFormat: ... + + +@dataclass +class Config: + detector: Optional[Callable[[Any], Detector]] = Yolo2DDetector + max_freq: float = 0.5 # hz + vlmodel: VlModel = QwenVlModel + + class Detection2DModule(Module): + config: Config + detector: Detector + image: In[Image] = None # type: ignore detections: Out[Detection2DArray] = None # type: ignore annotations: Out[ImageAnnotations] = None # type: ignore - _initDetector = Yolo2DDetector + detected_image_0: Out[Image] = None # type: ignore + detected_image_1: Out[Image] = None # type: ignore + detected_image_2: Out[Image] = None # type: ignore - def __init__(self, *args, detector=Optional[Callable[[Any], Any]], **kwargs): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if detector: - self._detectorClass = detector - self.detector = self._initDetector() + self.config: Config = Config(**kwargs) + self.detector = self.config.detector() + self.vlmodel = self.config.vlmodel() + self.vlm_detections_subject = Subject() + + def vlm_query(self, query: str) -> ImageDetections2D: + image = self.sharp_image_stream().pipe(ops.take(1)).run() + + full_query = f"""show me bounding boxes in pixels for this query: `{query}` + + format should be: + `[ + [label, x1, y1, x2, y2] + ... + ]` + + (etc, multiple matches are possible) + + If there's no match return `[]`. 
Label is whatever you think is appropriate + + Only respond with the coordinates, no other text.""" + + response = self.vlmodel.query(image, full_query) + coords = json.loads(response) + + imageDetections = ImageDetections2D(image) + + for track_id, detection_list in enumerate(coords): + if len(detection_list) != 5: + continue + name = detection_list[0] + bbox = list(map(float, detection_list[1:])) + imageDetections.detections.append( + Detection2D( + bbox=bbox, + track_id=track_id, + class_id=-100, + confidence=1.0, + name=name, + ts=time.time(), + image=image, + ) + ) + + print("vlm detected", imageDetections) + # Emit the VLM detections to the subject + self.vlm_detections_subject.on_next(imageDetections) + + return imageDetections def process_image_frame(self, image: Image) -> ImageDetections2D: - detections = ImageDetections2D.from_detector( + print("Processing image frame for detections", image) + return ImageDetections2D.from_detector( image, self.detector.process_image(image.to_opencv()) ) - return detections @functools.cache - def detection_stream(self) -> Observable[ImageDetections2D]: - return backpressure(self.image.observable().pipe(ops.map(self.process_image_frame))) + def sharp_image_stream(self) -> Observable[Image]: + return backpressure( + self.image.observable().pipe( + sharpness_barrier(self.config.max_freq), + ) + ) + + @functools.cache + def detection_stream_2d(self) -> Observable[ImageDetections2D]: + # self.vlm_detections_subject + # Regular detection stream from the detector + regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame)) + # Merge with VL model detections + return backpressure(regular_detections.pipe(ops.merge(self.vlm_detections_subject))) @rpc def start(self): - self.detection_stream().subscribe( - lambda det: self.detections.publish(det.to_ros_detection2d_array()) - ) + # self.detection_stream_2d().subscribe( + # lambda det: self.detections.publish(det.to_ros_detection2d_array()) + # ) - self.detection_stream().subscribe( + def publish_cropped_images(detections: ImageDetections2D): + for index, detection in enumerate(detections[:3]): + image_topic = getattr(self, "detected_image_" + str(index)) + image_topic.publish(detection.cropped_image()) + + self.detection_stream_2d().subscribe( lambda det: self.annotations.publish(det.to_foxglove_annotations()) ) + def publish_cropped(detections: ImageDetections2D): + for index, detection in enumerate(detections[:3]): + image_topic = getattr(self, "detected_image_" + str(index)) + image_topic.publish(detection.cropped_image()) + + self.detection_stream_2d().subscribe(publish_cropped) + @rpc def stop(self): ... diff --git a/dimos/perception/detection2d/module3D.py b/dimos/perception/detection2d/module3D.py index 98a1dc14af..0ad3517bf5 100644 --- a/dimos/perception/detection2d/module3D.py +++ b/dimos/perception/detection2d/module3D.py @@ -12,32 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
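+"""Detection3DModule: lifts 2D detections into 3D by pairing them with a time-aligned point cloud and the camera model."""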
-import time -from typing import List, Optional, Tuple -import numpy as np -from dimos_lcm.foxglove_msgs.ImageAnnotations import ( - ImageAnnotations, -) from dimos_lcm.sensor_msgs import CameraInfo from reactivex import operators as ops +from reactivex.observable import Observable from dimos.core import In, Out, rpc from dimos.msgs.geometry_msgs import Transform from dimos.msgs.sensor_msgs import Image, PointCloud2 -from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection2d.module2D import Detection2DModule - -# from dimos.perception.detection2d.detic import Detic2DDetector from dimos.perception.detection2d.type import ( - Detection2D, ImageDetections2D, ImageDetections3D, ) - -# Type aliases for clarity -ImageDetections = Tuple[Image, List[Detection2D]] -ImageDetection = Tuple[Image, Detection2D] +from dimos.perception.detection2d.type.detection3d import Detection3D +from dimos.types.timestamped import align_timestamped +from dimos.utils.reactive import backpressure class Detection3DModule(Detection2DModule): @@ -45,164 +35,19 @@ class Detection3DModule(Detection2DModule): image: In[Image] = None # type: ignore pointcloud: In[PointCloud2] = None # type: ignore - # type: ignore - detections: Out[Detection2DArray] = None # type: ignore - annotations: Out[ImageAnnotations] = None # type: ignore detected_pointcloud_0: Out[PointCloud2] = None # type: ignore detected_pointcloud_1: Out[PointCloud2] = None # type: ignore detected_pointcloud_2: Out[PointCloud2] = None # type: ignore - detected_image_0: Out[Image] = None # type: ignore - detected_image_1: Out[Image] = None # type: ignore - detected_image_2: Out[Image] = None # type: ignore + + detection_3d_stream: Observable[ImageDetections3D] = None def __init__(self, camera_info: CameraInfo, *args, **kwargs): - self.camera_info = camera_info super().__init__(*args, **kwargs) - - def detect(self, image: Image) -> ImageDetections: - detections = Detection2D.from_detector( - self.detector.process_image(image.to_opencv()), ts=image.ts - ) - return (image, detections) - - def project_points_to_camera( - self, - points_3d: np.ndarray, - camera_matrix: np.ndarray, - extrinsics: Transform, - ) -> Tuple[np.ndarray, np.ndarray]: - """Project 3D points to 2D camera coordinates.""" - # Transform points from world to camera_optical frame - points_homogeneous = np.hstack([points_3d, np.ones((points_3d.shape[0], 1))]) - extrinsics_matrix = extrinsics.to_matrix() - points_camera = (extrinsics_matrix @ points_homogeneous.T).T - - # Filter out points behind the camera - valid_mask = points_camera[:, 2] > 0 - points_camera = points_camera[valid_mask] - - # Project to 2D - points_2d_homogeneous = (camera_matrix @ points_camera[:, :3].T).T - points_2d = points_2d_homogeneous[:, :2] / points_2d_homogeneous[:, 2:3] - - return points_2d, valid_mask - - def filter_points_in_detections( - self, - detections: ImageDetections2D, - pointcloud: PointCloud2, - world_to_camera_transform: Transform, - ) -> List[Optional[PointCloud2]]: - """Filter lidar points that fall within detection bounding boxes.""" - # Extract camera parameters - camera_info = self.camera_info - fx, fy, cx = camera_info.K[0], camera_info.K[4], camera_info.K[2] - cy = camera_info.K[5] - image_width = camera_info.width - image_height = camera_info.height - - camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]) - - # Convert pointcloud to numpy array - lidar_points = pointcloud.as_numpy() - - # Project all points to camera frame - points_2d_all, valid_mask = 
self.project_points_to_camera( - lidar_points, camera_matrix, world_to_camera_transform - ) - valid_3d_points = lidar_points[valid_mask] - points_2d = points_2d_all.copy() - - # Filter points within image bounds - in_image_mask = ( - (points_2d[:, 0] >= 0) - & (points_2d[:, 0] < image_width) - & (points_2d[:, 1] >= 0) - & (points_2d[:, 1] < image_height) - ) - points_2d = points_2d[in_image_mask] - valid_3d_points = valid_3d_points[in_image_mask] - - filtered_pointclouds: List[Optional[PointCloud2]] = [] - - for detection in detections: - # Extract bbox from Detection2D object - bbox = detection.bbox - x_min, y_min, x_max, y_max = bbox - - # Find points within this detection box (with small margin) - margin = 5 # pixels - in_box_mask = ( - (points_2d[:, 0] >= x_min - margin) - & (points_2d[:, 0] <= x_max + margin) - & (points_2d[:, 1] >= y_min - margin) - & (points_2d[:, 1] <= y_max + margin) - ) - - detection_points = valid_3d_points[in_box_mask] - - # Create PointCloud2 message for this detection - if detection_points.shape[0] > 0: - detection_pointcloud = PointCloud2.from_numpy( - detection_points, - frame_id=pointcloud.frame_id, - timestamp=pointcloud.ts, - ) - filtered_pointclouds.append(detection_pointcloud) - else: - filtered_pointclouds.append(None) - - return filtered_pointclouds - - def combine_pointclouds(self, pointcloud_list: List[PointCloud2]) -> PointCloud2: - """Combine multiple pointclouds into a single one.""" - # Filter out None values - valid_pointclouds = [pc for pc in pointcloud_list if pc is not None] - - if not valid_pointclouds: - # Return empty pointcloud if no valid pointclouds - return PointCloud2.from_numpy( - np.array([]).reshape(0, 3), frame_id="world", timestamp=time.time() - ) - - # Combine all point arrays - all_points = np.vstack([pc.as_numpy() for pc in valid_pointclouds]) - - # Use frame_id and timestamp from first pointcloud - combined_pointcloud = PointCloud2.from_numpy( - all_points, - frame_id=valid_pointclouds[0].frame_id, - timestamp=valid_pointclouds[0].ts, - ) - - return combined_pointcloud - - def hidden_point_removal( - self, camera_transform: Transform, pc: PointCloud2, radius: float = 100.0 - ): - camera_position = camera_transform.inverse().translation - camera_pos_np = camera_position.to_numpy().reshape(3, 1) - - pcd = pc.pointcloud - try: - _, visible_indices = pcd.hidden_point_removal(camera_pos_np, radius) - visible_pcd = pcd.select_by_index(visible_indices) - - return PointCloud2(visible_pcd, frame_id=pc.frame_id, ts=pc.ts) - except Exception as e: - return pc - - def cleanup_pointcloud(self, pc: PointCloud2) -> PointCloud2: - height = pc.filter_by_height(-0.05) - statistical, _ = height.pointcloud.remove_statistical_outlier( - nb_neighbors=20, std_ratio=2.0 - ) - return PointCloud2(statistical, pc.frame_id, pc.ts) + self.camera_info = camera_info def process_frame( self, - # these have to be timestamp aligned detections: ImageDetections2D, pointcloud: PointCloud2, transform: Transform, @@ -210,57 +55,48 @@ def process_frame( if not transform: return ImageDetections3D(detections.image, []) - pointcloud_list = self.filter_points_in_detections(detections, pointcloud, transform) - + print("3d projection", detections, pointcloud, transform) detection3d_list = [] - for detection, pc in zip(detections, pointcloud_list): - if pc is None: - continue - pc = self.hidden_point_removal(transform, self.cleanup_pointcloud(pc)) - if pc is None: - continue - - detection3d_list.append(detection.to_3d(pointcloud=pc, transform=transform)) + for detection 
in detections: + detection3d = Detection3D.from_2d( + detection, + world_pointcloud=pointcloud, + camera_info=self.camera_info, + world_to_optical_transform=transform, + ) + if detection3d is not None: + detection3d_list.append(detection3d) - return ImageDetections3D(detections.image, detection3d_list) + ret = ImageDetections3D(detections.image, detection3d_list) + print("3d projection finished", ret) + return ret @rpc def start(self): - time_tolerance = 5.0 # seconds + super().start() def detection2d_to_3d(args): detections, pc = args - transform = self.tf.get("camera_optical", "world", detections.image.ts, time_tolerance) + transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0) return self.process_frame(detections, pc, transform) - combined_stream = self.detection_stream().pipe( - ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d) - ) - - self.detection_stream().subscribe( - lambda det: self.detections.publish(det.to_ros_detection2d_array()) - ) + self.detection_stream_3d = align_timestamped( + backpressure(self.detection_stream_2d()), + self.pointcloud.observable(), + match_tolerance=0.25, + buffer_size=20.0, + ).pipe(ops.map(detection2d_to_3d)) - self.detection_stream().subscribe( - lambda det: self.annotations.publish(det.to_image_annotations()) - ) + # self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe( + # ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d) + # ) - combined_stream.subscribe(self._handle_combined_detections) + self.detection_stream_3d.subscribe(self._publish_detections) - def _handle_combined_detections(self, detections: ImageDetections3D): + def _publish_detections(self, detections: ImageDetections3D): if not detections: return - print(detections) - - if len(detections) > 0: - self.detected_pointcloud_0.publish(detections[0].pointcloud) - self.detected_image_0.publish(detections[0].cropped_image()) - - if len(detections) > 1: - self.detected_pointcloud_1.publish(detections[1].pointcloud) - self.detected_image_1.publish(detections[1].cropped_image()) - - if len(detections) > 3: - self.detected_pointcloud_2.publish(detections[2].pointcloud) - self.detected_image_2.publish(detections[2].cropped_image()) + for index, detection in enumerate(detections[:3]): + pointcloud_topic = getattr(self, "detected_pointcloud_" + str(index)) + pointcloud_topic.publish(detection.pointcloud) diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection2d/moduleDB.py index 8e9205c4f3..052b65d6c7 100644 --- a/dimos/perception/detection2d/moduleDB.py +++ b/dimos/perception/detection2d/moduleDB.py @@ -11,46 +11,353 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
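+"""ObjectDBModule: accumulates 3D detections into persistent Object3D tracks and exposes navigation and location-memory skills."""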
-import functools +import threading import time -from typing import List, Optional, Tuple +from copy import copy +from typing import Any, Callable, Dict, List, Optional -import numpy as np -from dimos_lcm.foxglove_msgs.ImageAnnotations import ( - ImageAnnotations, -) -from dimos_lcm.sensor_msgs import CameraInfo -from dimos_lcm.vision_msgs import Detection2D as ROSDetection2D -from reactivex import operators as ops +from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations +from lcm_msgs.foxglove_msgs import SceneUpdate +from reactivex.observable import Observable +from dimos.agents2 import Agent, Output, Reducer, Stream, skill from dimos.core import In, Out, rpc -from dimos.msgs.geometry_msgs import Transform +from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import Image, PointCloud2 -from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray -from dimos.perception.detection2d.module2D import Detection2DModule +from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection2d.module3D import Detection3DModule -from dimos.perception.detection2d.type import ( - Detection2D, - Detection3D, - ImageDetections2D, - ImageDetections3D, -) -from dimos.protocol.skill import skill +from dimos.perception.detection2d.type import Detection3D, ImageDetections3D, TableStr +from dimos.protocol.skill.skill import skill +from dimos.protocol.skill.type import Output, Reducer, Stream +from dimos.types.timestamped import to_datetime -class DetectionDBModule(Detection3DModule): - @rpc - def start(self): - super().start() - self.pointcloud_stream().subscribe(self.add_detections) +# Represents an object in space, as collection of 3d detections over time +class Object3D(Detection3D): + best_detection: Detection3D = None + center: Vector3 = None + track_id: str = None + detections: List[Detection3D] + + def to_repr_dict(self) -> Dict[str, Any]: + return { + "object_id": self.track_id, + "detections": len(self.detections), + "center": "[" + ", ".join(list(map(lambda n: f"{n:1f}", self.center.to_list()))) + "]", + } + + def __init__(self, track_id: str, detection: Optional[Detection3D] = None, *args, **kwargs): + if detection is None: + self.detections = [] + return + self.ts = detection.ts + self.track_id = track_id + self.class_id = detection.class_id + self.name = detection.name + self.confidence = detection.confidence + self.pointcloud = detection.pointcloud + self.bbox = detection.bbox + self.transform = detection.transform + self.center = detection.center + self.frame_id = detection.frame_id + self.detections = [detection] + self.best_detection = detection + + def __add__(self, detection: Detection3D) -> "Object3D": + new_object = Object3D(self.track_id) + new_object.bbox = detection.bbox + new_object.confidence = max(self.confidence, detection.confidence) + new_object.ts = max(self.ts, detection.ts) + new_object.track_id = self.track_id + new_object.class_id = self.class_id + new_object.name = self.name + new_object.transform = self.transform + new_object.pointcloud = self.pointcloud + detection.pointcloud + new_object.frame_id = self.frame_id + new_object.center = (self.center + detection.center) / 2 + new_object.detections = self.detections + [detection] + + if detection.bbox_2d_volume() > self.bbox_2d_volume(): + new_object.best_detection = detection + else: + new_object.best_detection = self.best_detection + + return new_object + + @property + def image(self) -> Image: + return self.best_detection.image + 
+ def scene_entity_label(self) -> str: + return f"{self.name} ({len(self.detections)})" + + def agent_encode(self): + return { + "id": self.track_id, + "name": self.name, + "detections": len(self.detections), + "last_seen": f"{round((time.time() - self.ts))}s ago", + # "position": self.to_pose().position.agent_encode(), + } + + def to_pose(self) -> PoseStamped: + optical_inverse = Transform( + translation=Vector3(0.0, 0.0, 0.0), + rotation=Quaternion(-0.5, 0.5, -0.5, 0.5), + frame_id="camera_link", + child_frame_id="camera_optical", + ).inverse() + + print("transform is", self.best_detection.transform) + + global_transform = optical_inverse + self.best_detection.transform + + print("inverse optical is", global_transform) + + print("obj center is", self.center) + global_pose = global_transform.to_pose() + print("Global pose:", global_pose) + global_pose.frame_id = self.best_detection.frame_id + print("remap to", self.best_detection.frame_id) + return PoseStamped( + position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id + ) - def add_detections(self, detections: List[Detection3DArray]): - for det in detections: - self.add_detection(det) + +class ObjectDBModule(Detection3DModule, TableStr): + cnt: int = 0 + objects: dict[str, Object3D] + object_stream: Observable[Object3D] = None + + goto: Callable[[PoseStamped], Any] = None + + image: In[Image] = None # type: ignore + pointcloud: In[PointCloud2] = None # type: ignore + + detections: Out[Detection2DArray] = None # type: ignore + annotations: Out[ImageAnnotations] = None # type: ignore + + detected_pointcloud_0: Out[PointCloud2] = None # type: ignore + detected_pointcloud_1: Out[PointCloud2] = None # type: ignore + detected_pointcloud_2: Out[PointCloud2] = None # type: ignore + + detected_image_0: Out[Image] = None # type: ignore + detected_image_1: Out[Image] = None # type: ignore + detected_image_2: Out[Image] = None # type: ignore + + scene_update: Out[SceneUpdate] = None # type: ignore + + target: Out[PoseStamped] = None # type: ignore + + remembered_locations: Dict[str, PoseStamped] + + def __init__(self, goto: Callable[[PoseStamped], Any], *args, **kwargs): + super().__init__(*args, **kwargs) + self.goto = goto + self.objects = {} + self.remembered_locations = {} + + def closest_object(self, detection: Detection3D) -> Optional[Object3D]: + # Filter objects to only those with matching names + matching_objects = [obj for obj in self.objects.values() if obj.name == detection.name] + + if not matching_objects: + return None + + # Sort by distance + distances = sorted(matching_objects, key=lambda obj: detection.center.distance(obj.center)) + + return distances[0] + + def add_detections(self, detections: List[Detection3D]) -> List[Object3D]: + return [ + detection for detection in map(self.add_detection, detections) if detection is not None + ] def add_detection(self, detection: Detection3D): - print("DETECTION", detection) + """Add detection to existing object or create new one.""" + closest = self.closest_object(detection) + if closest and closest.bounding_box_intersects(detection): + return self.add_to_object(closest, detection) + else: + return self.create_new_object(detection) + + def add_to_object(self, closest: Object3D, detection: Detection3D): + new_object = closest + detection + self.objects[closest.track_id] = new_object + return new_object + + def create_new_object(self, detection: Detection3D): + new_object = Object3D(f"obj_{self.cnt}", detection) + self.objects[new_object.track_id] = new_object + 
self.cnt += 1 + return new_object + + def agent_encode(self) -> str: + ret = [] + for obj in copy(self.objects).values(): + # we need at least 3 detections to consider it a valid object + # to make this robust we would need a ratio of detections within the observation window + # if len(obj.detections) < 3: + # continue + ret.append(str(obj.agent_encode())) + if not ret: + return "No objects detected yet." + return "\n".join(ret) + + def vlm_query(self, description: str) -> Optional[Object3D]: + imageDetections2D = super().vlm_query(description) + print("VLM query found", imageDetections2D, "detections") + time.sleep(3) + + if not imageDetections2D.detections: + return None + + ret = [] + for obj in self.objects.values(): + if obj.ts != imageDetections2D.ts: + print( + "Skipping", + obj.track_id, + "ts", + obj.ts, + "!=", + imageDetections2D.ts, + ) + continue + if obj.class_id != -100: + continue + if obj.name != imageDetections2D.detections[0].name: + print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name) + continue + ret.append(obj) + ret.sort(key=lambda x: x.ts) + + return ret[0] if ret else None + + @skill() + def remember_location(self, name: str) -> str: + """Remember the current location with a name.""" + transform = self.tf.get("map", "sensor", time_point=time.time(), time_tolerance=1.0) + if not transform: + return "Could not get current location transform from map to sensor" + + pose = transform.to_pose() + pose.frame_id = "map" + self.remembered_locations[name] = pose + return f"Location '{name}' saved at position: {pose.position}" + + @skill() + def goto_remembered_location(self, name: str) -> str: + """Go to a remembered location by name.""" + pose = self.remembered_locations.get(name, None) + if not pose: + return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}" + self.goto(pose) + return f"Navigating to remembered location {name} and pose {pose}" + + @skill() + def list_remembered_locations(self) -> str: + """List all remembered locations.""" + return str(list(self.remembered_locations.keys())) + + def nav_to(self, target_pose: PoseStamped) -> None: + target_pose.orientation = Quaternion(0.0, 0.0, 0.0, 0.0) + self.target.publish(target_pose) + time.sleep(0.1) + self.target.publish(target_pose) + self.goto(target_pose) + + @skill() + def navigate_to_object_in_view(self, query: str) -> str: + """Navigate to an object in the current image view via a natural-language query, using a vision-language model to find it.""" + target_obj = self.vlm_query(query) + if not target_obj: + return f"No objects found matching '{query}'" + return self.navigate_to_object_by_id(target_obj.track_id) + + @skill(reducer=Reducer.all) + def list_objects(self): + """List all detected objects that the system remembers and can navigate to.""" + data = self.agent_encode() + return data + + @skill() + def navigate_to_object_by_id(self, object_id: str): + """Navigate to an object by its object id.""" + target_obj = self.objects.get(object_id, None) + if not target_obj: + return f"Object {object_id} not found\nHere are the known objects:\n{str(self.agent_encode())}" + target_pose = target_obj.to_pose() + target_pose.frame_id = "map" + self.target.publish(target_pose) + time.sleep(0.1) + self.target.publish(target_pose) + self.nav_to(target_pose) + return f"Navigating to {object_id} ({target_obj.name})" def lookup(self, label: str) -> List[Detection3D]: """Look up a detection by label.""" return [] + + @rpc + def start(self): + super().start() + + def update_objects(imageDetections: ImageDetections3D): + for detection in imageDetections.detections: + # print(detection) + self.add_detection(detection) + + def scene_thread(): + while True: + scene_update = self.to_foxglove_scene_update() + self.scene_update.publish(scene_update) + time.sleep(1.0) + + threading.Thread(target=scene_thread, daemon=True).start() + + self.detection_stream_3d.subscribe(update_objects) + + def goto_object(self, object_id: str) -> Optional[Object3D]: + """Go to object by id.""" + return self.objects.get(object_id, None) + + def to_foxglove_scene_update(self) -> "SceneUpdate": + """Convert all detections to a Foxglove SceneUpdate message. 
+ + Returns: + SceneUpdate containing SceneEntity objects for all detections + """ + + # Create SceneUpdate message with all detections + scene_update = SceneUpdate() + scene_update.deletions_length = 0 + scene_update.deletions = [] + scene_update.entities = [] + + for obj in copy(self.objects).values(): + # we need at least 3 detections to consider it a valid object + # to make this robust we would need a ratio of detections within the observation window + if obj.class_id != -100 and len(obj.detections) < 3: + continue + + # print( + # f"Object {obj.track_id}: {len(obj.detections)} detections, confidence {obj.confidence}" + # ) + # print(obj.to_pose()) + + scene_update.entities.append( + obj.to_foxglove_scene_entity( + entity_id=f"object_{obj.name}_{obj.track_id}_{len(obj.detections)}" + ) + ) + + scene_update.entities_length = len(scene_update.entities) + return scene_update + + def __len__(self): + return len(self.objects) + + def __iter__(self): + return iter(self.objects.values()) diff --git a/dimos/perception/detection2d/test_module.py b/dimos/perception/detection2d/test_module.py deleted file mode 100644 index 09ce3d1769..0000000000 --- a/dimos/perception/detection2d/test_module.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pytest -from dimos_lcm.foxglove_msgs.ImageAnnotations import ( - ImageAnnotations, -) -from dimos_lcm.sensor_msgs import Image, PointCloud2 - -from dimos.core import LCMTransport -from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3 -from dimos.msgs.nav_msgs import OccupancyGrid -from dimos.msgs.sensor_msgs import PointCloud2 as PointCloud2Msg -from dimos.msgs.vision_msgs import Detection2DArray -from dimos.perception.detection2d.conftest import Moment -from dimos.perception.detection2d.module2D import Detection2DModule -from dimos.perception.detection2d.module3D import Detection3DModule -from dimos.perception.detection2d.type import ( - Detection2D, - Detection3D, - ImageDetections2D, - ImageDetections3D, -) -from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation -from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule -from dimos.robot.unitree_webrtc.type.lidar import LidarMessage -from dimos.robot.unitree_webrtc.type.map import Map - - -def test_module2d(moment: Moment, publish_lcm): - module = Detection2DModule() - detections2d = module.process_image_frame(moment["image_frame"]) - - print(detections2d) - - # Assertions for test_module2d - assert isinstance(detections2d, ImageDetections2D) - assert len(detections2d) == 1 - assert detections2d.image.ts == pytest.approx(1757960670.490248) - assert detections2d.image.shape == (720, 1280, 3) - assert detections2d.image.frame_id == "camera_optical" - - # Check first detection - det = detections2d.detections[0] - assert isinstance(det, Detection2D) - assert det.name == "suitcase" - assert det.class_id == 28 - assert det.track_id == 1 - assert det.confidence == pytest.approx(0.8145349025726318) - - # Check bbox values - assert det.bbox == pytest.approx( - [503.437255859375, 249.89385986328125, 655.950439453125, 469.82879638671875] - ) - - annotations = detections2d.to_image_annotations() - publish_lcm({"annotations": annotations, **moment}) - - module._close_module() - - -def test_module3d(moment: Moment, publish_lcm): - module2d = Detection2DModule() - detections2d = module2d.process_image_frame(moment["image_frame"]) - pointcloud = moment["lidar_frame"] - camera_transform = moment["tf"].get("camera_optical", "world") - if camera_transform is None: - raise ValueError("No camera_optical transform in tf") - annotations = detections2d.to_image_annotations() - - module3d = Detection3DModule(camera_info=moment["camera_info"]) - detections3d = module3d.process_frame(detections2d, pointcloud, camera_transform) - - publish_lcm( - { - **moment, - "annotations": annotations, - "detections": detections3d, - } - ) - - print(detections3d) - - # Assertions for test_module3d - assert isinstance(detections3d, ImageDetections3D) - assert len(detections3d) == 1 - assert detections3d.image.ts == pytest.approx(1757960670.490248) - assert detections3d.image.shape == (720, 1280, 3) - assert detections3d.image.frame_id == "camera_optical" - - # Check first 3D detection - det = detections3d.detections[0] - assert isinstance(det, Detection3D) - assert det.name == "suitcase" - assert det.class_id == 28 - assert det.track_id == 1 - - assert det.confidence == pytest.approx(0.8145349025726318) - - # Check bbox values (should match 2D) - assert det.bbox == pytest.approx( - [503.437255859375, 249.89385986328125, 655.950439453125, 469.82879638671875] - ) - - # 3D-specific assertions - assert isinstance(det.pointcloud, PointCloud2Msg) - assert len(det.pointcloud) == 81 - assert 
det.pointcloud.frame_id == "world" - assert isinstance(det.transform, Transform) - - # Check center - center = det.center - assert isinstance(center, Vector3) - # Values from output: Vector([ -3.3565 -0.26265 0.18549]) - assert center.x == pytest.approx(-3.3565, abs=1e-4) - assert center.y == pytest.approx(-0.26265, abs=1e-4) - assert center.z == pytest.approx(0.18549, abs=1e-4) - - # Check pose - pose = det.pose - assert isinstance(pose, PoseStamped) - assert pose.frame_id == "world" - assert pose.ts == det.ts - - # Check repr dict values - repr_dict = det.to_repr_dict() - assert repr_dict["dist"] == "0.88m" - assert repr_dict["points"] == "81" - - module2d._close_module() - module3d._close_module() - - -@pytest.mark.tool -def test_module3d_replay(dimos_cluster): - connection = deploy_connection(dimos_cluster, loop=False, speed=1.0) - # mapper = deploy_navigation(dimos_cluster, connection) - mapper = dimos_cluster.deploy( - Map, voxel_size=0.5, cost_resolution=0.05, global_publish_interval=1.0 - ) - mapper.lidar.connect(connection.lidar) - mapper.global_map.transport = LCMTransport("/global_map", LidarMessage) - mapper.global_costmap.transport = LCMTransport("/global_costmap", OccupancyGrid) - mapper.local_costmap.transport = LCMTransport("/local_costmap", OccupancyGrid) - - mapper.start() - - module3D = dimos_cluster.deploy(Detection3DModule, camera_info=ConnectionModule._camera_info()) - - module3D.image.connect(connection.video) - module3D.pointcloud.connect(connection.lidar) - - module3D.annotations.transport = LCMTransport("/annotations", ImageAnnotations) - module3D.detections.transport = LCMTransport("/detections", Detection2DArray) - - module3D.detected_pointcloud_0.transport = LCMTransport("/detected/pointcloud/0", PointCloud2) - module3D.detected_pointcloud_1.transport = LCMTransport("/detected/pointcloud/1", PointCloud2) - module3D.detected_pointcloud_2.transport = LCMTransport("/detected/pointcloud/2", PointCloud2) - - module3D.detected_image_0.transport = LCMTransport("/detected/image/0", Image) - module3D.detected_image_1.transport = LCMTransport("/detected/image/1", Image) - module3D.detected_image_2.transport = LCMTransport("/detected/image/2", Image) - - module3D.start() - connection.start() - import time - - while True: - time.sleep(1) diff --git a/dimos/perception/detection2d/test_moduleDB.py b/dimos/perception/detection2d/test_moduleDB.py new file mode 100644 index 0000000000..a3a1b003fd --- /dev/null +++ b/dimos/perception/detection2d/test_moduleDB.py @@ -0,0 +1,62 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
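In the test below, the `goto` callback handed to ObjectDBModule only prints the object id. An illustrative callback that actually navigates could forward the stored object's pose to the NavigationModule introduced later in this changeset; direct access to the DB's `objects` dict and the `nav` module handle are assumptions made only for this sketch:

def goto(obj_id: str) -> bool:
    obj = moduleDB.objects.get(obj_id)  # assumes local access to the objects dict
    if obj is None:
        return False
    return nav.go_to(obj.best_detection.pose, timeout=60.0)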
+import time + +import pytest +from lcm_msgs.foxglove_msgs import SceneUpdate + +from dimos.core import LCMTransport +from dimos.msgs.foxglove_msgs import ImageAnnotations +from dimos.msgs.geometry_msgs import PoseStamped +from dimos.msgs.sensor_msgs import Image, PointCloud2 +from dimos.msgs.vision_msgs import Detection2DArray +from dimos.perception.detection2d.moduleDB import ObjectDBModule +from dimos.protocol.service import lcmservice as lcm +from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation +from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule + + +@pytest.mark.module +def test_moduleDB(dimos_cluster): + connection = deploy_connection(dimos_cluster) + + moduleDB = dimos_cluster.deploy( + ObjectDBModule, + camera_info=ConnectionModule._camera_info(), + goto=lambda obj_id: print(f"Going to {obj_id}"), + ) + moduleDB.image.connect(connection.video) + moduleDB.pointcloud.connect(connection.lidar) + + moduleDB.annotations.transport = LCMTransport("/annotations", ImageAnnotations) + moduleDB.detections.transport = LCMTransport("/detections", Detection2DArray) + + moduleDB.detected_pointcloud_0.transport = LCMTransport("/detected/pointcloud/0", PointCloud2) + moduleDB.detected_pointcloud_1.transport = LCMTransport("/detected/pointcloud/1", PointCloud2) + moduleDB.detected_pointcloud_2.transport = LCMTransport("/detected/pointcloud/2", PointCloud2) + + moduleDB.detected_image_0.transport = LCMTransport("/detected/image/0", Image) + moduleDB.detected_image_1.transport = LCMTransport("/detected/image/1", Image) + moduleDB.detected_image_2.transport = LCMTransport("/detected/image/2", Image) + + moduleDB.scene_update.transport = LCMTransport("/scene_update", SceneUpdate) + moduleDB.target.transport = LCMTransport("/target", PoseStamped) + + connection.start() + moduleDB.start() + + time.sleep(4) + print("STARTING QUERY!!") + print("VLM RES", moduleDB.navigate_to_object_in_view("white floor")) + time.sleep(30) diff --git a/dimos/perception/detection2d/test_type.py b/dimos/perception/detection2d/test_type.py deleted file mode 100644 index 25264ef727..0000000000 --- a/dimos/perception/detection2d/test_type.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dimos.perception.detection2d.conftest import detections2d, detections3d -from dimos.perception.detection2d.type import ( - Detection2D, - Detection3D, - ImageDetections2D, - ImageDetections3D, -) -from dimos.msgs.sensor_msgs import PointCloud2 -from dimos.msgs.geometry_msgs import Transform, Vector3, PoseStamped - - -def test_detections2d(detections2d): - print(f"\n=== ImageDetections2D Test ===") - print(f"Type: {type(detections2d)}") - print(f"Number of detections: {len(detections2d)}") - print(f"Image timestamp: {detections2d.image.ts}") - print(f"Image shape: {detections2d.image.shape}") - print(f"Image frame_id: {detections2d.image.frame_id}") - - print(f"\nFull detections object:") - print(detections2d) - - # Basic type assertions - assert isinstance(detections2d, ImageDetections2D) - assert hasattr(detections2d, "image") - assert hasattr(detections2d, "detections") - - # Image assertions - assert detections2d.image is not None - assert detections2d.image.ts == 1757960670.490248 - assert detections2d.image.shape == (720, 1280, 3) - assert detections2d.image.frame_id == "camera_optical" - - # Detection count assertions - assert len(detections2d) == 1 - assert isinstance(detections2d.detections, list) - assert len(detections2d.detections) == 1 - - # Test first detection with literal checks - det = detections2d.detections[0] - print(f"\n--- Detection 0 (literal checks) ---") - print(f"Type: {type(det)}") - print(f"Name: {det.name}") - print(f"Class ID: {det.class_id}") - print(f"Track ID: {det.track_id}") - print(f"Confidence: {det.confidence}") - print(f"Bbox: {det.bbox}") - print(f"Timestamp: {det.ts}") - - # Detection type assertions - assert isinstance(det, Detection2D) - - # Literal value assertions - assert det.name == "suitcase" - assert det.class_id == 28 # COCO class 28 is suitcase - assert det.track_id == 1 - assert 0.814 < det.confidence < 0.815 # Allow small floating point variance - - # Data type assertions - assert isinstance(det.name, str) - assert isinstance(det.class_id, int) - assert isinstance(det.track_id, int) - assert isinstance(det.confidence, float) - assert isinstance(det.bbox, (tuple, list)) and len(det.bbox) == 4 - assert isinstance(det.ts, float) - - # Bbox literal checks (with tolerance for float precision) - x1, y1, x2, y2 = det.bbox - assert 503.4 < x1 < 503.5 - assert 249.8 < y1 < 250.0 - assert 655.9 < x2 < 656.0 - assert 469.8 < y2 < 470.0 - - # Bbox format assertions - assert all(isinstance(coord, (int, float)) for coord in det.bbox) - assert x2 > x1, f"x2 ({x2}) should be greater than x1 ({x1})" - assert y2 > y1, f"y2 ({y2}) should be greater than y1 ({y1})" - assert x1 >= 0 and y1 >= 0, "Bbox coordinates should be non-negative" - - # Bbox width/height checks - width = x2 - x1 - height = y2 - y1 - assert 152.0 < width < 153.0 # Expected width ~152.5 - assert 219.0 < height < 221.0 # Expected height ~219.9 - - # Confidence assertions - assert 0.0 <= det.confidence <= 1.0, ( - f"Confidence should be between 0 and 1, got {det.confidence}" - ) - - # Image reference assertion - assert det.image is detections2d.image, "Detection should reference the same image" - - # Timestamp consistency - assert det.ts == detections2d.image.ts - assert det.ts == 1757960670.490248 - - -def test_detections3d(detections3d): - print(f"\n=== ImageDetections3D Test ===") - print(f"Type: {type(detections3d)}") - print(f"Number of detections: {len(detections3d)}") - print(f"Image timestamp: {detections3d.image.ts}") - print(f"Image shape: {detections3d.image.shape}") - 
print(f"Image frame_id: {detections3d.image.frame_id}") - - print(f"\nFull detections object:") - print(detections3d) - - # Basic type assertions - assert isinstance(detections3d, ImageDetections3D) - assert hasattr(detections3d, "image") - assert hasattr(detections3d, "detections") - - # Image assertions - assert detections3d.image is not None - assert detections3d.image.ts == 1757960670.490248 - assert detections3d.image.shape == (720, 1280, 3) - assert detections3d.image.frame_id == "camera_optical" - - # Detection count assertions - assert len(detections3d) == 1 - assert isinstance(detections3d.detections, list) - assert len(detections3d.detections) == 1 - - # Test first 3D detection with literal checks - det = detections3d.detections[0] - print(f"\n--- Detection3D 0 (literal checks) ---") - print(f"Type: {type(det)}") - print(f"Name: {det.name}") - print(f"Class ID: {det.class_id}") - print(f"Track ID: {det.track_id}") - print(f"Confidence: {det.confidence}") - print(f"Bbox: {det.bbox}") - print(f"Timestamp: {det.ts}") - print(f"Has pointcloud: {hasattr(det, 'pointcloud')}") - print(f"Has transform: {hasattr(det, 'transform')}") - if hasattr(det, "pointcloud"): - print(f"Pointcloud points: {len(det.pointcloud)}") - print(f"Pointcloud frame_id: {det.pointcloud.frame_id}") - - # Detection type assertions - assert isinstance(det, Detection3D) - - # Detection3D should have all Detection2D fields plus pointcloud and transform - assert hasattr(det, "bbox") - assert hasattr(det, "track_id") - assert hasattr(det, "class_id") - assert hasattr(det, "confidence") - assert hasattr(det, "name") - assert hasattr(det, "ts") - assert hasattr(det, "image") - assert hasattr(det, "pointcloud") - assert hasattr(det, "transform") - - # Literal value assertions (should match Detection2D) - assert det.name == "suitcase" - assert det.class_id == 28 # COCO class 28 is suitcase - assert det.track_id == 1 - assert 0.814 < det.confidence < 0.815 # Allow small floating point variance - - # Data type assertions - assert isinstance(det.name, str) - assert isinstance(det.class_id, int) - assert isinstance(det.track_id, int) - assert isinstance(det.confidence, float) - assert isinstance(det.bbox, (tuple, list)) and len(det.bbox) == 4 - assert isinstance(det.ts, float) - - # Bbox literal checks (should match Detection2D) - x1, y1, x2, y2 = det.bbox - assert 503.4 < x1 < 503.5 - assert 249.8 < y1 < 250.0 - assert 655.9 < x2 < 656.0 - assert 469.8 < y2 < 470.0 - - # 3D-specific assertions - assert isinstance(det.pointcloud, PointCloud2) - assert isinstance(det.transform, Transform) - - # Pointcloud assertions - assert len(det.pointcloud) == 81 # Based on the output we saw - assert det.pointcloud.frame_id == "world" # Pointcloud should be in world frame - - # Test center calculation - center = det.center - print(f"\nDetection center: {center}") - assert isinstance(center, Vector3) - assert hasattr(center, "x") - assert hasattr(center, "y") - assert hasattr(center, "z") - - # Test pose property - pose = det.pose - print(f"Detection pose: {pose}") - assert isinstance(pose, PoseStamped) - assert pose.frame_id == "world" - assert pose.ts == det.ts - assert pose.position == center # Pose position should match center - - # Check distance calculation (from to_repr_dict) - repr_dict = det.to_repr_dict() - print(f"\nRepr dict: {repr_dict}") - assert "dist" in repr_dict - assert repr_dict["dist"] == "0.88m" # Based on the output - assert repr_dict["points"] == "81" - assert repr_dict["name"] == "suitcase" - assert 
repr_dict["class"] == "28" - assert repr_dict["track"] == "1" - - # Image reference assertion - assert det.image is detections3d.image, "Detection should reference the same image" - - # Timestamp consistency - assert det.ts == detections3d.image.ts - assert det.ts == 1757960670.490248 - - -def test_detection3d_to_pose(detections3d): - det = detections3d[0] - pose = det.pose - - # Check that pose is valid - assert pose.frame_id == "world" - assert pose.ts == det.ts diff --git a/dimos/perception/detection2d/type/__init__.py b/dimos/perception/detection2d/type/__init__.py new file mode 100644 index 0000000000..fb7c435c0c --- /dev/null +++ b/dimos/perception/detection2d/type/__init__.py @@ -0,0 +1,7 @@ +from dimos.perception.detection2d.type.detection2d import ( + Detection2D, + ImageDetections2D, + InconvinientDetectionFormat, +) +from dimos.perception.detection2d.type.detection3d import Detection3D, ImageDetections3D +from dimos.perception.detection2d.type.imageDetections import ImageDetections, TableStr diff --git a/dimos/perception/detection2d/type.py b/dimos/perception/detection2d/type/detection2d.py similarity index 61% rename from dimos/perception/detection2d/type.py rename to dimos/perception/detection2d/type/detection2d.py index b37828e8e8..5bbedfb6ae 100644 --- a/dimos/perception/detection2d/type.py +++ b/dimos/perception/detection2d/type/detection2d.py @@ -14,13 +14,10 @@ from __future__ import annotations -import functools import hashlib from dataclasses import dataclass -from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar +from typing import Any, Dict, List, Tuple -import numpy as np -from dimos_lcm.foxglove_msgs.Color import Color from dimos_lcm.foxglove_msgs.ImageAnnotations import ( PointsAnnotation, TextAnnotation, @@ -37,14 +34,13 @@ Detection2D as ROSDetection2D, ) from rich.console import Console -from rich.table import Table from rich.text import Text from dimos.msgs.foxglove_msgs import ImageAnnotations -from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3 -from dimos.msgs.sensor_msgs import Image, PointCloud2 +from dimos.msgs.foxglove_msgs.Color import Color +from dimos.msgs.sensor_msgs import Image from dimos.msgs.std_msgs import Header -from dimos.msgs.vision_msgs import Detection2DArray +from dimos.perception.detection2d.type.imageDetections import ImageDetections from dimos.types.timestamped import Timestamped, to_ros_stamp, to_timestamp Bbox = Tuple[float, float, float, float] @@ -56,6 +52,32 @@ Detections = List[Detection] +def _hash_to_color(name: str) -> str: + """Generate a consistent color for a given name using hash.""" + # List of rich colors to choose from + colors = [ + "cyan", + "magenta", + "yellow", + "blue", + "green", + "red", + "bright_cyan", + "bright_magenta", + "bright_yellow", + "bright_blue", + "bright_green", + "bright_red", + "purple", + "white", + "pink", + ] + + # Hash the name and pick a color + hash_value = hashlib.md5(name.encode()).digest()[0] + return colors[hash_value % len(colors)] + + # yolo and detic have bad formats this translates into list of detections def better_detection_format(inconvinient_detections: InconvinientDetectionFormat) -> Detections: bboxes, track_ids, class_ids, confidences, names = inconvinient_detections @@ -84,13 +106,10 @@ def to_repr_dict(self) -> Dict[str, Any]: "name": self.name, "class": str(self.class_id), "track": str(self.track_id), - "conf": self.confidence, + "conf": f"{self.confidence:.2f}", "bbox": f"[{x1:.0f},{y1:.0f},{x2:.0f},{y2:.0f}]", } - def to_image(self) -> 
Image: - return self.image - # return focused image, only on the bbox def cropped_image(self, padding: int = 20) -> Image: """Return a cropped version of the image focused on the bounding box. @@ -110,26 +129,18 @@ def __str__(self): console = Console(force_terminal=True, legacy_windows=False) d = self.to_repr_dict() - # Create confidence text with color based on value - conf_color = "green" if d["conf"] > 0.8 else "yellow" if d["conf"] > 0.5 else "red" - conf_text = Text(f"{d['conf']:.1%}", style=conf_color) - # Build the string representation parts = [ Text(f"{self.__class__.__name__}("), - Text(d["name"], style="bold cyan"), - Text(f" cls={d['class']} trk={d['track']} "), - conf_text, - Text(f" {d['bbox']}"), ] # Add any extra fields (e.g., points for Detection3D) - extra_keys = [k for k in d.keys() if k not in ["name", "class", "track", "conf", "bbox"]] + extra_keys = [k for k in d.keys() if k not in ["class"]] for key in extra_keys: if d[key] == "None": - parts.append(Text(f" {key}={d[key]}", style="dim")) + parts.append(Text(f"{key}={d[key]}", style="dim")) else: - parts.append(Text(f" {key}={d[key]}", style="blue")) + parts.append(Text(f"{key}={d[key]}", style=_hash_to_color(key))) parts.append(Text(")")) @@ -138,6 +149,12 @@ def __str__(self): console.print(*parts, end="") return capture.get().strip() + def bbox_2d_volume(self) -> float: + x1, y1, x2, y2 = self.bbox + width = max(0.0, x2 - x1) + height = max(0.0, y2 - y1) + return width * height + @classmethod def from_detector( cls, raw_detections: InconvinientDetectionFormat, **kwargs @@ -179,7 +196,7 @@ def to_ros_bbox(self) -> BoundingBox2D: ) def lcm_encode(self): - return self.to_imageannotations().lcm_encode() + return self.to_image_annotations().lcm_encode() def to_text_annotation(self) -> List[TextAnnotation]: x1, y1, x2, y2 = self.bbox @@ -198,7 +215,7 @@ def to_text_annotation(self) -> List[TextAnnotation]: TextAnnotation( timestamp=to_ros_stamp(self.ts), position=Point2(x=x1, y=y1), - text=f"{self.name}_{self.class_id}_{self.track_id}", + text=f"{self.name}_{self.track_id}", font_size=font_size, text_color=Color(r=1.0, g=1.0, b=1.0, a=1), background_color=Color(r=0, g=0, b=0, a=1), @@ -214,7 +231,7 @@ def to_points_annotation(self) -> List[PointsAnnotation]: PointsAnnotation( timestamp=to_ros_stamp(self.ts), outline_color=Color(r=0.0, g=0.0, b=0.0, a=1.0), - fill_color=Color(r=1.0, g=0.0, b=0.0, a=0.15), + fill_color=Color.from_string(self.name, alpha=0.15), thickness=thickness, points_length=4, points=[ @@ -230,7 +247,7 @@ def to_points_annotation(self) -> List[PointsAnnotation]: # this is almost never called directly since this is a single detection # and ImageAnnotations message normally contains multiple detections annotations # so ImageDetections2D and ImageDetections3D normally implements this for whole image - def to_annotations(self) -> ImageAnnotations: + def to_image_annotations(self) -> ImageAnnotations: points = self.to_points_annotation() texts = self.to_text_annotation() @@ -300,179 +317,6 @@ def to_ros_detection2d(self) -> ROSDetection2D: id=str(self.track_id), ) - def to_3d(self, **kwargs) -> "Detection3D": - return Detection3D( - image=self.image, - bbox=self.bbox, - track_id=self.track_id, - class_id=self.class_id, - confidence=self.confidence, - name=self.name, - ts=self.ts, - **kwargs, - ) - - -@dataclass -class Detection3D(Detection2D): - pointcloud: PointCloud2 - transform: Transform - - def localize(self, pointcloud: PointCloud2) -> Detection3D: - self.pointcloud = pointcloud - return self - - 
@functools.cached_property - def center(self) -> Vector3: - """Calculate the center of the pointcloud in world frame.""" - points = np.asarray(self.pointcloud.pointcloud.points) - center = points.mean(axis=0) - return Vector3(*center) - - @functools.cached_property - def pose(self) -> PoseStamped: - """Convert detection to a PoseStamped using pointcloud center. - - Returns pose in world frame with identity rotation. - The pointcloud is already in world frame. - """ - return PoseStamped( - ts=self.ts, - frame_id="world", - position=self.center, - orientation=(0.0, 0.0, 0.0, 1.0), # Identity quaternion - ) - - def to_repr_dict(self) -> Dict[str, Any]: - d = super().to_repr_dict() - - # Add pointcloud info - d["points"] = str(len(self.pointcloud)) - - # Calculate distance from camera - # The pointcloud is in world frame, and transform gives camera position in world - center_world = self.center - # Camera position in world frame is the translation part of the transform - camera_pos = self.transform.translation - # Use Vector3 subtraction and magnitude - distance = (center_world - camera_pos).magnitude() - d["dist"] = f"{distance:.2f}m" - - return d - - -T = TypeVar("T", bound="Detection2D") - - -def _hash_to_color(name: str) -> str: - """Generate a consistent color for a given name using hash.""" - # List of rich colors to choose from - colors = [ - "cyan", - "magenta", - "yellow", - "blue", - "green", - "red", - "bright_cyan", - "bright_magenta", - "bright_yellow", - "bright_blue", - "bright_green", - "bright_red", - "purple", - "white", - "pink", - ] - - # Hash the name and pick a color - hash_value = hashlib.md5(name.encode()).digest()[0] - return colors[hash_value % len(colors)] - - -class ImageDetections(Generic[T]): - image: Image - detections: List[T] - - def __init__(self, image: Image, detections: List[T]): - self.image = image - self.detections = detections - for det in self.detections: - if not det.ts: - det.ts = image.ts - - def __str__(self): - console = Console(force_terminal=True, legacy_windows=False) - - # Dynamically build columns based on the first detection's dict keys - if not self.detections: - return "Empty ImageDetections" - - # Create a table for detections - table = Table( - title=f"{self.__class__.__name__} [{len(self.detections)} detections @ {to_timestamp(self.image.ts):.3f}]", - show_header=True, - show_edge=True, - ) - - # Cache all repr_dicts to avoid double computation - detection_dicts = [det.to_repr_dict() for det in self.detections] - - first_dict = detection_dicts[0] - table.add_column("#", style="dim") - for col in first_dict.keys(): - color = _hash_to_color(col) - table.add_column(col.title(), style=color) - - # Add each detection to the table - for i, d in enumerate(detection_dicts): - row = [str(i)] - - for key in first_dict.keys(): - if key == "conf": - # Color-code confidence - conf_color = "green" if d[key] > 0.8 else "yellow" if d[key] > 0.5 else "red" - row.append(Text(f"{d[key]:.1%}", style=conf_color)) - elif key == "points" and d.get(key) == "None": - row.append(Text(d.get(key, ""), style="dim")) - else: - row.append(str(d.get(key, ""))) - table.add_row(*row) - - with console.capture() as capture: - console.print(table) - return capture.get().strip() - - def __len__(self): - return len(self.detections) - - def __iter__(self): - return iter(self.detections) - - def __getitem__(self, index): - return self.detections[index] - - def to_ros_detection2d_array(self) -> Detection2DArray: - return Detection2DArray( - 
detections_length=len(self.detections), - header=Header(self.image.ts, "camera_optical"), - detections=[det.to_ros_detection2d() for det in self.detections], - ) - - def to_image_annotations(self) -> ImageAnnotations: - def flatten(xss): - return [x for xs in xss for x in xs] - - texts = flatten(det.to_text_annotation() for det in self.detections) - points = flatten(det.to_points_annotation() for det in self.detections) - - return ImageAnnotations( - texts=texts, - texts_length=len(texts), - points=points, - points_length=len(points), - ) - class ImageDetections2D(ImageDetections[Detection2D]): @classmethod @@ -483,9 +327,3 @@ def from_detector( image=image, detections=Detection2D.from_detector(raw_detections, image=image, ts=image.ts), ) - - -class ImageDetections3D(ImageDetections[Detection3D]): - """Specialized class for 3D detections in an image.""" - - ... diff --git a/dimos/perception/detection2d/type/detection3d.py b/dimos/perception/detection2d/type/detection3d.py new file mode 100644 index 0000000000..8c0919b700 --- /dev/null +++ b/dimos/perception/detection2d/type/detection3d.py @@ -0,0 +1,403 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import functools +from dataclasses import dataclass +from typing import Any, Callable, Dict, Optional, TypeVar + +import numpy as np +from dimos_lcm.sensor_msgs import CameraInfo +from lcm_msgs.builtin_interfaces import Duration +from lcm_msgs.foxglove_msgs import CubePrimitive, SceneEntity, SceneUpdate, TextPrimitive +from lcm_msgs.geometry_msgs import Point, Pose, Quaternion +from lcm_msgs.geometry_msgs import Vector3 as LCMVector3 + +from dimos.msgs.foxglove_msgs.Color import Color +from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3 +from dimos.msgs.sensor_msgs import PointCloud2 +from dimos.perception.detection2d.type.detection2d import Detection2D +from dimos.perception.detection2d.type.imageDetections import ImageDetections +from dimos.types.timestamped import to_ros_stamp + +Detection3DFilter = Callable[ + [Detection2D, PointCloud2, CameraInfo, Transform], Optional["Detection3D"] +] + + +def height_filter(height=0.1) -> Detection3DFilter: + return lambda det, pc, ci, tf: pc.filter_by_height(height) + + +def statistical(nb_neighbors=40, std_ratio=0.5) -> Detection3DFilter: + def filter_func( + det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform + ) -> Optional[PointCloud2]: + try: + statistical, removed = pc.pointcloud.remove_statistical_outlier( + nb_neighbors=nb_neighbors, std_ratio=std_ratio + ) + return PointCloud2(statistical, pc.frame_id, pc.ts) + except Exception as e: + # print("statistical filter failed:", e) + return None + + return filter_func + + +def raycast() -> Detection3DFilter: + def filter_func( + det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform + ) -> Optional[PointCloud2]: + try: + camera_pos = tf.inverse().translation + camera_pos_np = camera_pos.to_numpy() + _, visible_indices = 
pc.pointcloud.hidden_point_removal(camera_pos_np, radius=100.0)
+            visible_pcd = pc.pointcloud.select_by_index(visible_indices)
+            return PointCloud2(visible_pcd, pc.frame_id, pc.ts)
+        except Exception as e:
+            # print("raycast filter failed:", e)
+            return None
+
+    return filter_func
+
+
+def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> Detection3DFilter:
+    """
+    Remove isolated points: keep only points that have at least `min_neighbors`
+    neighbors within `radius` meters (same units as your point cloud).
+    """
+
+    def filter_func(
+        det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform
+    ) -> Optional[PointCloud2]:
+        filtered_pcd, removed = pc.pointcloud.remove_radius_outlier(
+            nb_points=min_neighbors, radius=radius
+        )
+        return PointCloud2(filtered_pcd, pc.frame_id, pc.ts)
+
+    return filter_func
+
+
+@dataclass
+class Detection3D(Detection2D):
+    pointcloud: PointCloud2
+    transform: Transform
+    frame_id: str = "unknown"
+
+    @classmethod
+    def from_2d(
+        cls,
+        det: Detection2D,
+        world_pointcloud: PointCloud2,
+        camera_info: CameraInfo,
+        world_to_optical_transform: Transform,
+        # filters should be adjusted to the sensor noise characteristics when feeding
+        # sensor data directly
+        filters: list[Detection3DFilter] = [
+            # height_filter(0.1),
+            raycast(),
+            radius_outlier(),
+            statistical(),
+        ],
+    ) -> Optional["Detection3D"]:
+        """Create a Detection3D from a 2D detection by projecting world pointcloud.
+
+        This method handles:
+        1. Projecting world pointcloud to camera frame
+        2. Filtering points within the 2D detection bounding box
+        3. Cleaning up the pointcloud (height filter, outlier removal)
+        4. Hidden point removal from camera perspective
+
+        Args:
+            det: The 2D detection
+            world_pointcloud: Full pointcloud in world frame
+            camera_info: Camera calibration info
+            world_to_optical_transform: Transform from world to camera optical frame
+            filters: List of functions to apply to the pointcloud for filtering
+        Returns:
+            Detection3D with filtered pointcloud, or None if no valid points
+        """
+        # Extract camera parameters
+        fx, fy = camera_info.K[0], camera_info.K[4]
+        cx, cy = camera_info.K[2], camera_info.K[5]
+        image_width = camera_info.width
+        image_height = camera_info.height
+
+        camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+
+        # Convert pointcloud to numpy array
+        world_points = world_pointcloud.as_numpy()
+
+        # Project points to camera frame
+        points_homogeneous = np.hstack([world_points, np.ones((world_points.shape[0], 1))])
+        extrinsics_matrix = world_to_optical_transform.to_matrix()
+        points_camera = (extrinsics_matrix @ points_homogeneous.T).T
+
+        # Filter out points behind the camera
+        valid_mask = points_camera[:, 2] > 0
+        points_camera = points_camera[valid_mask]
+        world_points = world_points[valid_mask]
+
+        if len(world_points) == 0:
+            return None
+
+        # Project to 2D
+        points_2d_homogeneous = (camera_matrix @ points_camera[:, :3].T).T
+        points_2d = points_2d_homogeneous[:, :2] / points_2d_homogeneous[:, 2:3]
+
+        # Filter points within image bounds
+        in_image_mask = (
+            (points_2d[:, 0] >= 0)
+            & (points_2d[:, 0] < image_width)
+            & (points_2d[:, 1] >= 0)
+            & (points_2d[:, 1] < image_height)
+        )
+        points_2d = points_2d[in_image_mask]
+        world_points = world_points[in_image_mask]
+
+        if len(world_points) == 0:
+            return None
+
+        # Extract bbox from Detection2D
+        x_min, y_min, x_max, y_max = det.bbox
+
+        # Find points within this detection box (with small margin)
+        margin = 5  # pixels
+        in_box_mask = (
+            (points_2d[:, 
0] >= x_min - margin) + & (points_2d[:, 0] <= x_max + margin) + & (points_2d[:, 1] >= y_min - margin) + & (points_2d[:, 1] <= y_max + margin) + ) + + detection_points = world_points[in_box_mask] + + if detection_points.shape[0] == 0: + # print(f"No points found in detection bbox after projection. {det.name}") + return None + + # Create initial pointcloud for this detection + initial_pc = PointCloud2.from_numpy( + detection_points, + frame_id=world_pointcloud.frame_id, + timestamp=world_pointcloud.ts, + ) + + # Apply filters - each filter needs all 4 arguments + detection_pc = initial_pc + for filter_func in filters: + result = filter_func(det, detection_pc, camera_info, world_to_optical_transform) + if result is None: + return None + detection_pc = result + + # Final check for empty pointcloud + if len(detection_pc.pointcloud.points) == 0: + return None + + # Create Detection3D with filtered pointcloud + return Detection3D( + image=det.image, + bbox=det.bbox, + track_id=det.track_id, + class_id=det.class_id, + confidence=det.confidence, + name=det.name, + ts=det.ts, + pointcloud=detection_pc, + transform=world_to_optical_transform, + frame_id=world_pointcloud.frame_id, + ) + + @functools.cached_property + def center(self) -> Vector3: + return Vector3(*self.pointcloud.center) + + @functools.cached_property + def pose(self) -> PoseStamped: + """Convert detection to a PoseStamped using pointcloud center. + + Returns pose in world frame with identity rotation. + The pointcloud is already in world frame. + """ + return PoseStamped( + ts=self.ts, + frame_id=self.frame_id, + position=self.center, + orientation=(0.0, 0.0, 0.0, 1.0), # Identity quaternion + ) + + def get_bounding_box(self): + """Get axis-aligned bounding box of the detection's pointcloud.""" + return self.pointcloud.get_axis_aligned_bounding_box() + + def get_oriented_bounding_box(self): + """Get oriented bounding box of the detection's pointcloud.""" + return self.pointcloud.get_oriented_bounding_box() + + def get_bounding_box_dimensions(self) -> tuple[float, float, float]: + """Get dimensions (width, height, depth) of the detection's bounding box.""" + return self.pointcloud.get_bounding_box_dimensions() + + def bounding_box_intersects(self, other: "Detection3D") -> bool: + """Check if this detection's bounding box intersects with another's.""" + return self.pointcloud.bounding_box_intersects(other.pointcloud) + + def to_repr_dict(self) -> Dict[str, Any]: + # Calculate distance from camera + # The pointcloud is in world frame, and transform gives camera position in world + center_world = self.center + # Camera position in world frame is the translation part of the transform + camera_pos = self.transform.translation + # Use Vector3 subtraction and magnitude + distance = (center_world - camera_pos).magnitude() + + parent_dict = super().to_repr_dict() + # Remove bbox key if present + parent_dict.pop("bbox", None) + + return { + **parent_dict, + "dist": f"{distance:.2f}m", + "points": str(len(self.pointcloud)), + } + + def to_foxglove_scene_entity(self, entity_id: str = None) -> "SceneEntity": + """Convert detection to a Foxglove SceneEntity with cube primitive and text label. + + Args: + entity_id: Optional custom entity ID. If None, generates one from name and hash. 
+ + Returns: + SceneEntity with cube bounding box and text label + """ + + # Create a cube primitive for the bounding box + cube = CubePrimitive() + + # Get the axis-aligned bounding box + aabb = self.get_bounding_box() + + # Set pose from axis-aligned bounding box + cube.pose = Pose() + cube.pose.position = Point() + # Get center of the axis-aligned bounding box + aabb_center = aabb.get_center() + cube.pose.position.x = aabb_center[0] + cube.pose.position.y = aabb_center[1] + cube.pose.position.z = aabb_center[2] + + # For axis-aligned box, use identity quaternion (no rotation) + cube.pose.orientation = Quaternion() + cube.pose.orientation.x = 0 + cube.pose.orientation.y = 0 + cube.pose.orientation.z = 0 + cube.pose.orientation.w = 1 + + # Set size from axis-aligned bounding box + cube.size = LCMVector3() + aabb_extent = aabb.get_extent() + cube.size.x = aabb_extent[0] # width + cube.size.y = aabb_extent[1] # height + cube.size.z = aabb_extent[2] # depth + + # Set color based on name hash + cube.color = Color.from_string(self.name, alpha=0.2) + + # Create text label + text = TextPrimitive() + text.pose = Pose() + text.pose.position = Point() + text.pose.position.x = aabb_center[0] + text.pose.position.y = aabb_center[1] + text.pose.position.z = aabb_center[2] + aabb_extent[2] / 2 + 0.1 # Above the box + text.pose.orientation = Quaternion() + text.pose.orientation.x = 0 + text.pose.orientation.y = 0 + text.pose.orientation.z = 0 + text.pose.orientation.w = 1 + text.billboard = True + text.font_size = 20.0 + text.scale_invariant = True + text.color = Color() + text.color.r = 1.0 + text.color.g = 1.0 + text.color.b = 1.0 + text.color.a = 1.0 + text.text = self.scene_entity_label() + + # Create scene entity + entity = SceneEntity() + entity.timestamp = to_ros_stamp(self.ts) + entity.frame_id = self.frame_id + entity.id = str(self.track_id) + entity.lifetime = Duration() + entity.lifetime.sec = 0 # Persistent + entity.lifetime.nanosec = 0 + entity.frame_locked = False + + # Initialize all primitive arrays + entity.metadata_length = 0 + entity.metadata = [] + entity.arrows_length = 0 + entity.arrows = [] + entity.cubes_length = 1 + entity.cubes = [cube] + entity.spheres_length = 0 + entity.spheres = [] + entity.cylinders_length = 0 + entity.cylinders = [] + entity.lines_length = 0 + entity.lines = [] + entity.triangles_length = 0 + entity.triangles = [] + entity.texts_length = 1 + entity.texts = [text] + entity.models_length = 0 + entity.models = [] + + return entity + + def scene_entity_label(self) -> str: + return f"{self.track_id}/{self.name} ({self.confidence:.0%})" + + +T = TypeVar("T", bound="Detection2D") + + +class ImageDetections3D(ImageDetections[Detection3D]): + """Specialized class for 3D detections in an image.""" + + def to_foxglove_scene_update(self) -> "SceneUpdate": + """Convert all detections to a Foxglove SceneUpdate message. 
+ + Returns: + SceneUpdate containing SceneEntity objects for all detections + """ + + # Create SceneUpdate message with all detections + scene_update = SceneUpdate() + scene_update.deletions_length = 0 + scene_update.deletions = [] + scene_update.entities = [] + + # Process each detection + for i, detection in enumerate(self.detections): + entity = detection.to_foxglove_scene_entity(entity_id=f"detection_{detection.name}_{i}") + scene_update.entities.append(entity) + + scene_update.entities_length = len(scene_update.entities) + return scene_update diff --git a/dimos/perception/detection2d/type/imageDetections.py b/dimos/perception/detection2d/type/imageDetections.py new file mode 100644 index 0000000000..edd8449f06 --- /dev/null +++ b/dimos/perception/detection2d/type/imageDetections.py @@ -0,0 +1,157 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import hashlib +from typing import TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, TypeVar + +from rich.console import Console +from rich.table import Table +from rich.text import Text + +from dimos.msgs.foxglove_msgs import ImageAnnotations +from dimos.msgs.sensor_msgs import Image +from dimos.msgs.std_msgs import Header +from dimos.msgs.vision_msgs import Detection2DArray +from dimos.types.timestamped import to_timestamp + +if TYPE_CHECKING: + from dimos.perception.detection2d.type.detection2d import Detection2D + +T = TypeVar("T", bound="Detection2D") + + +def _hash_to_color(name: str) -> str: + """Generate a consistent color for a given name using hash.""" + # List of rich colors to choose from + colors = [ + "cyan", + "magenta", + "yellow", + "blue", + "green", + "red", + "bright_cyan", + "bright_magenta", + "bright_yellow", + "bright_blue", + "bright_green", + "bright_red", + "purple", + "white", + "pink", + ] + + # Hash the name and pick a color + hash_value = hashlib.md5(name.encode()).digest()[0] + return colors[hash_value % len(colors)] + + +class TableStr: + def __str__(self): + console = Console(force_terminal=True, legacy_windows=False) + + # Create a table for detections + table = Table( + title=f"{self.__class__.__name__} [{len(self.detections)} detections @ {to_timestamp(self.image.ts):.3f}]", + show_header=True, + show_edge=True, + ) + + # Dynamically build columns based on the first detection's dict keys + if not self.detections: + return ( + f" {self.__class__.__name__} [0 detections @ {to_timestamp(self.image.ts):.3f}]" + ) + + # Cache all repr_dicts to avoid double computation + detection_dicts = [det.to_repr_dict() for det in self] + + first_dict = detection_dicts[0] + table.add_column("#", style="dim") + for col in first_dict.keys(): + color = _hash_to_color(col) + table.add_column(col.title(), style=color) + + # Add each detection to the table + for i, d in enumerate(detection_dicts): + row = [str(i)] + + for key in first_dict.keys(): + if key == "conf": + # Color-code confidence + conf_color = ( + "green" + if float(d[key]) > 0.8 + else 
"yellow" + if float(d[key]) > 0.5 + else "red" + ) + row.append(Text(f"{d[key]}", style=conf_color)) + elif key == "points" and d.get(key) == "None": + row.append(Text(d.get(key, ""), style="dim")) + else: + row.append(str(d.get(key, ""))) + table.add_row(*row) + + with console.capture() as capture: + console.print(table) + return capture.get().strip() + + +class ImageDetections(Generic[T], TableStr): + image: Image + detections: List[T] + + @property + def ts(self) -> float: + return self.image.ts + + def __init__(self, image: Image, detections: Optional[List[T]] = None): + self.image = image + self.detections = detections or [] + for det in self.detections: + if not det.ts: + det.ts = image.ts + + def __len__(self): + return len(self.detections) + + def __iter__(self): + return iter(self.detections) + + def __getitem__(self, index): + return self.detections[index] + + def to_ros_detection2d_array(self) -> Detection2DArray: + return Detection2DArray( + detections_length=len(self.detections), + header=Header(self.image.ts, "camera_optical"), + detections=[det.to_ros_detection2d() for det in self.detections], + ) + + def to_foxglove_annotations(self) -> ImageAnnotations: + def flatten(xss): + return [x for xs in xss for x in xs] + + texts = flatten(det.to_text_annotation() for det in self.detections) + points = flatten(det.to_points_annotation() for det in self.detections) + + return ImageAnnotations( + texts=texts, + texts_length=len(texts), + points=points, + points_length=len(points), + ) diff --git a/dimos/perception/detection2d/type/test_detection2d.py b/dimos/perception/detection2d/type/test_detection2d.py new file mode 100644 index 0000000000..515bdee339 --- /dev/null +++ b/dimos/perception/detection2d/type/test_detection2d.py @@ -0,0 +1,91 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest + + +def test_detection_basic_properties(detection2d): + """Test basic detection properties.""" + assert detection2d.track_id >= 0 + assert detection2d.class_id >= 0 + assert 0.0 <= detection2d.confidence <= 1.0 + assert detection2d.name is not None + assert detection2d.ts > 0 + + +def test_bounding_box_format(detection2d): + """Test bounding box format and validity.""" + bbox = detection2d.bbox + assert len(bbox) == 4, "Bounding box should have 4 values" + + x1, y1, x2, y2 = bbox + assert x2 > x1, "x2 should be greater than x1" + assert y2 > y1, "y2 should be greater than y1" + assert x1 >= 0, "x1 should be non-negative" + assert y1 >= 0, "y1 should be non-negative" + + +def test_bbox_2d_volume(detection2d): + """Test bounding box volume calculation.""" + volume = detection2d.bbox_2d_volume() + assert volume > 0, "Bounding box volume should be positive" + + # Calculate expected volume + x1, y1, x2, y2 = detection2d.bbox + expected_volume = (x2 - x1) * (y2 - y1) + assert volume == pytest.approx(expected_volume, abs=0.001) + + +def test_bbox_center_calculation(detection2d): + """Test bounding box center calculation.""" + center_bbox = detection2d.get_bbox_center() + assert len(center_bbox) == 4, "Center bbox should have 4 values" + + center_x, center_y, width, height = center_bbox + x1, y1, x2, y2 = detection2d.bbox + + # Verify center calculations + assert center_x == pytest.approx((x1 + x2) / 2.0, abs=0.001) + assert center_y == pytest.approx((y1 + y2) / 2.0, abs=0.001) + assert width == pytest.approx(x2 - x1, abs=0.001) + assert height == pytest.approx(y2 - y1, abs=0.001) + + +def test_cropped_image(detection2d): + """Test cropped image generation.""" + padding = 20 + cropped = detection2d.cropped_image(padding=padding) + + assert cropped is not None, "Cropped image should not be None" + + # The actual cropped image is (260, 192, 3) + assert cropped.width == 192 + assert cropped.height == 260 + assert cropped.shape == (260, 192, 3) + + +def test_to_ros_bbox(detection2d): + """Test ROS bounding box conversion.""" + ros_bbox = detection2d.to_ros_bbox() + + assert ros_bbox is not None + assert hasattr(ros_bbox, "center") + assert hasattr(ros_bbox, "size_x") + assert hasattr(ros_bbox, "size_y") + + # Verify values match + center_x, center_y, width, height = detection2d.get_bbox_center() + assert ros_bbox.center.position.x == pytest.approx(center_x, abs=0.001) + assert ros_bbox.center.position.y == pytest.approx(center_y, abs=0.001) + assert ros_bbox.size_x == pytest.approx(width, abs=0.001) + assert ros_bbox.size_y == pytest.approx(height, abs=0.001) diff --git a/dimos/perception/detection2d/type/test_detection3d.py b/dimos/perception/detection2d/type/test_detection3d.py new file mode 100644 index 0000000000..c8215b9601 --- /dev/null +++ b/dimos/perception/detection2d/type/test_detection3d.py @@ -0,0 +1,141 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
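For concreteness, plugging the suitcase bounding box used by these fixtures into the helpers tested above gives (values rounded):

x1, y1, x2, y2 = 503.44, 249.89, 655.95, 469.83    # suitcase bbox from the fixtures
width, height = x2 - x1, y2 - y1                   # ~152.51 x ~219.93 pixels
area = width * height                              # bbox_2d_volume() ~= 33_543 px^2
center_x, center_y = (x1 + x2) / 2, (y1 + y2) / 2  # first two values of get_bbox_center() ~= (579.70, 359.86)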
+ +import numpy as np +import pytest + + +def test_oriented_bounding_box(detection3d): + """Test oriented bounding box calculation and values.""" + obb = detection3d.get_oriented_bounding_box() + assert obb is not None, "Oriented bounding box should not be None" + + # Verify OBB center values + assert obb.center[0] == pytest.approx(-3.36002, abs=0.1) + assert obb.center[1] == pytest.approx(-0.196446, abs=0.1) + assert obb.center[2] == pytest.approx(0.220184, abs=0.1) + + # Verify OBB extent values + assert obb.extent[0] == pytest.approx(0.531275, abs=0.1) + assert obb.extent[1] == pytest.approx(0.461054, abs=0.1) + assert obb.extent[2] == pytest.approx(0.155, abs=0.1) + + +def test_bounding_box_dimensions(detection3d): + """Test bounding box dimension calculation.""" + dims = detection3d.get_bounding_box_dimensions() + assert len(dims) == 3, "Bounding box dimensions should have 3 values" + assert dims[0] == pytest.approx(0.350, abs=0.1) + assert dims[1] == pytest.approx(0.250, abs=0.1) + assert dims[2] == pytest.approx(0.550, abs=0.1) + + +def test_axis_aligned_bounding_box(detection3d): + """Test axis-aligned bounding box calculation.""" + aabb = detection3d.get_bounding_box() + assert aabb is not None, "Axis-aligned bounding box should not be None" + + # Verify AABB min values + assert aabb.min_bound[0] == pytest.approx(-3.575, abs=0.1) + assert aabb.min_bound[1] == pytest.approx(-0.375, abs=0.1) + assert aabb.min_bound[2] == pytest.approx(-0.075, abs=0.1) + + # Verify AABB max values + assert aabb.max_bound[0] == pytest.approx(-3.075, abs=0.1) + assert aabb.max_bound[1] == pytest.approx(-0.125, abs=0.1) + assert aabb.max_bound[2] == pytest.approx(0.475, abs=0.1) + + +def test_point_cloud_properties(detection3d): + """Test point cloud data and boundaries.""" + pc_points = detection3d.pointcloud.points() + assert len(pc_points) in [69, 70] + assert detection3d.pointcloud.frame_id == "world", ( + f"Expected frame_id 'world', got '{detection3d.pointcloud.frame_id}'" + ) + + # Extract xyz coordinates from points + points = np.array([[pt[0], pt[1], pt[2]] for pt in pc_points]) + + min_pt = np.min(points, axis=0) + max_pt = np.max(points, axis=0) + center = np.mean(points, axis=0) + + # Verify point cloud boundaries + assert min_pt[0] == pytest.approx(-3.575, abs=0.1) + assert min_pt[1] == pytest.approx(-0.375, abs=0.1) + assert min_pt[2] == pytest.approx(-0.075, abs=0.1) + + assert max_pt[0] == pytest.approx(-3.075, abs=0.1) + assert max_pt[1] == pytest.approx(-0.125, abs=0.1) + assert max_pt[2] == pytest.approx(0.475, abs=0.1) + + assert center[0] == pytest.approx(-3.326, abs=0.1) + assert center[1] == pytest.approx(-0.202, abs=0.1) + assert center[2] == pytest.approx(0.160, abs=0.1) + + +def test_foxglove_scene_entity_generation(detection3d): + """Test Foxglove scene entity creation and structure.""" + entity = detection3d.to_foxglove_scene_entity("test_entity_123") + + # Verify entity metadata + assert entity.id == "1", f"Expected entity ID '1', got '{entity.id}'" + assert entity.frame_id == "world", f"Expected frame_id 'world', got '{entity.frame_id}'" + assert entity.cubes_length == 1, f"Expected 1 cube, got {entity.cubes_length}" + assert entity.texts_length == 1, f"Expected 1 text, got {entity.texts_length}" + + +def test_foxglove_cube_properties(detection3d): + """Test Foxglove cube primitive properties.""" + entity = detection3d.to_foxglove_scene_entity("test_entity_123") + cube = entity.cubes[0] + + # Verify position + assert cube.pose.position.x == pytest.approx(-3.325, abs=0.1) + 
assert cube.pose.position.y == pytest.approx(-0.250, abs=0.1) + assert cube.pose.position.z == pytest.approx(0.200, abs=0.1) + + # Verify size + assert cube.size.x == pytest.approx(0.350, abs=0.1) + assert cube.size.y == pytest.approx(0.250, abs=0.1) + assert cube.size.z == pytest.approx(0.550, abs=0.1) + + # Verify color (green with alpha) + assert cube.color.r == pytest.approx(0.08235294117647059, abs=0.1) + assert cube.color.g == pytest.approx(0.7176470588235294, abs=0.1) + assert cube.color.b == pytest.approx(0.28627450980392155, abs=0.1) + assert cube.color.a == pytest.approx(0.2, abs=0.1) + + +def test_foxglove_text_label(detection3d): + """Test Foxglove text label properties.""" + entity = detection3d.to_foxglove_scene_entity("test_entity_123") + text = entity.texts[0] + + assert text.text == "1/suitcase (81%)", f"Expected text '1/suitcase (81%)', got '{text.text}'" + assert text.pose.position.x == pytest.approx(-3.325, abs=0.1) + assert text.pose.position.y == pytest.approx(-0.250, abs=0.1) + assert text.pose.position.z == pytest.approx(0.575, abs=0.1) + assert text.font_size == 20.0, f"Expected font size 20.0, got {text.font_size}" + + +def test_detection_pose(detection3d): + """Test detection pose and frame information.""" + assert detection3d.pose.x == pytest.approx(-3.327, abs=0.1) + assert detection3d.pose.y == pytest.approx(-0.202, abs=0.1) + assert detection3d.pose.z == pytest.approx(0.160, abs=0.1) + assert detection3d.pose.frame_id == "world", ( + f"Expected frame_id 'world', got '{detection3d.pose.frame_id}'" + ) diff --git a/dimos/perception/detection2d/type/test_object3d.py b/dimos/perception/detection2d/type/test_object3d.py new file mode 100644 index 0000000000..4fac1dcf10 --- /dev/null +++ b/dimos/perception/detection2d/type/test_object3d.py @@ -0,0 +1,188 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
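The label asserted above falls straight out of Detection3D.scene_entity_label() for the suitcase fixture:

track_id, name, confidence = 1, "suitcase", 0.8145349025726318
label = f"{track_id}/{name} ({confidence:.0%})"    # -> "1/suitcase (81%)"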
+ +import pytest + +from dimos.perception.detection2d.module2D import Detection2DModule +from dimos.perception.detection2d.module3D import Detection3DModule +from dimos.perception.detection2d.moduleDB import Object3D, ObjectDBModule +from dimos.perception.detection2d.type.detection3d import ImageDetections3D +from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule + + +def test_object_db_module_populated(object_db_module): + """Test that ObjectDBModule is properly populated.""" + assert len(object_db_module.objects) > 0, "Database should contain objects" + assert object_db_module.cnt > 0, "Object counter should be greater than 0" + + +def test_object_db_module_objects_structure(all_objects): + """Test the structure of objects in the database.""" + for obj in all_objects: + assert isinstance(obj, Object3D) + assert hasattr(obj, "track_id") + assert hasattr(obj, "detections") + assert hasattr(obj, "best_detection") + assert hasattr(obj, "center") + assert len(obj.detections) >= 1 + + +def test_object3d_properties(first_object): + """Test basic properties of an Object3D.""" + assert first_object.track_id is not None + assert isinstance(first_object.track_id, str) + assert first_object.name is not None + assert first_object.class_id >= 0 + assert 0.0 <= first_object.confidence <= 1.0 + assert first_object.ts > 0 + assert first_object.frame_id is not None + assert first_object.best_detection is not None + + +def test_object3d_multiple_detections(all_objects): + """Test objects that have been built from multiple detections.""" + # Find objects with multiple detections + multi_detection_objects = [obj for obj in all_objects if len(obj.detections) > 1] + + if multi_detection_objects: + obj = multi_detection_objects[0] + + # Test that confidence is the max of all detections + max_conf = max(d.confidence for d in obj.detections) + assert obj.confidence == max_conf + + # Test that timestamp is the max (most recent) + max_ts = max(d.ts for d in obj.detections) + assert obj.ts == max_ts + + # Test that best_detection has the largest bbox volume + best_volume = obj.best_detection.bbox_2d_volume() + for det in obj.detections: + assert det.bbox_2d_volume() <= best_volume + + +def test_object3d_center(first_object): + """Test Object3D center calculation.""" + assert first_object.center is not None + assert hasattr(first_object.center, "x") + assert hasattr(first_object.center, "y") + assert hasattr(first_object.center, "z") + + # Center should be within reasonable bounds + assert -10 < first_object.center.x < 10 + assert -10 < first_object.center.y < 10 + assert -10 < first_object.center.z < 10 + + +def test_object3d_repr_dict(first_object): + """Test to_repr_dict method.""" + repr_dict = first_object.to_repr_dict() + + assert "object_id" in repr_dict + assert "detections" in repr_dict + assert "center" in repr_dict + + assert repr_dict["object_id"] == first_object.track_id + assert repr_dict["detections"] == len(first_object.detections) + + # Center should be formatted as string with coordinates + assert isinstance(repr_dict["center"], str) + assert repr_dict["center"].startswith("[") + assert repr_dict["center"].endswith("]") + + +def test_object3d_scene_entity_label(first_object): + """Test scene entity label generation.""" + label = first_object.scene_entity_label() + + assert isinstance(label, str) + assert first_object.name in label + assert f"({len(first_object.detections)})" in label + + +def test_object3d_agent_encode(first_object): + """Test agent encoding.""" + encoded 
= first_object.agent_encode() + + assert isinstance(encoded, dict) + assert "id" in encoded + assert "name" in encoded + assert "detections" in encoded + assert "last_seen" in encoded + + assert encoded["id"] == first_object.track_id + assert encoded["name"] == first_object.name + assert encoded["detections"] == len(first_object.detections) + assert encoded["last_seen"].endswith("s ago") + + +def test_object3d_image_property(first_object): + """Test image property returns best_detection's image.""" + assert first_object.image is not None + assert first_object.image is first_object.best_detection.image + + +def test_object3d_addition(object_db_module): + """Test Object3D addition operator.""" + # Get existing objects from the database + objects = list(object_db_module.objects.values()) + if len(objects) < 2: + pytest.skip("Not enough objects in database") + + # Get detections from two different objects + det1 = objects[0].best_detection + det2 = objects[1].best_detection + + # Create a new object with the first detection + obj = Object3D("test_track_combined", det1) + + # Add the second detection from a different object + combined = obj + det2 + + assert combined.track_id == "test_track_combined" + assert len(combined.detections) == 2 + + # The combined object should have properties from both detections + assert det1 in combined.detections + assert det2 in combined.detections + + # Best detection should be determined by the Object3D logic + assert combined.best_detection is not None + + # Center should be valid (no specific value check since we're using real detections) + assert hasattr(combined, "center") + assert combined.center is not None + + +def test_image_detections3d_scene_update(object_db_module): + """Test ImageDetections3D to Foxglove scene update conversion.""" + # Get some detections + objects = list(object_db_module.objects.values()) + if not objects: + pytest.skip("No objects in database") + + detections = [obj.best_detection for obj in objects[:3]] # Take up to 3 + + image_detections = ImageDetections3D(image=detections[0].image, detections=detections) + + scene_update = image_detections.to_foxglove_scene_update() + + assert scene_update is not None + assert scene_update.entities_length == len(detections) + + for i, entity in enumerate(scene_update.entities): + assert entity.id == str(detections[i].track_id) + assert entity.frame_id == detections[i].frame_id + assert entity.cubes_length == 1 + assert entity.texts_length == 1 diff --git a/dimos/perception/detection2d/yolo_2d_det.py b/dimos/perception/detection2d/yolo_2d_det.py index bc8b0bc577..02c4ee5325 100644 --- a/dimos/perception/detection2d/yolo_2d_det.py +++ b/dimos/perception/detection2d/yolo_2d_det.py @@ -41,7 +41,7 @@ def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device=" device (str): Device to run inference on ('cuda' or 'cpu') """ self.device = device - self.model = YOLO(get_data(model_path) / model_name) + self.model = YOLO(get_data(model_path) / model_name, task="detect") module_dir = os.path.dirname(__file__) self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml") diff --git a/dimos/protocol/tf/tf.py b/dimos/protocol/tf/tf.py index 9d1db5ed16..0052ef4758 100644 --- a/dimos/protocol/tf/tf.py +++ b/dimos/protocol/tf/tf.py @@ -281,7 +281,6 @@ def __init__(self, **kwargs) -> None: TFSpec.__init__(self, **kwargs) MultiTBuffer.__init__(self, self.config.buffer_size) - # Check if pubsub is a class (callable) or an instance pubsub_config = getattr(self.config, 
"pubsub", None) if pubsub_config is not None: if callable(pubsub_config): @@ -291,7 +290,7 @@ def __init__(self, **kwargs) -> None: else: raise ValueError("PubSub configuration is missing") - if getattr(self.config, "autostart", True): + if self.config.autostart: self.start() def start(self, sub=True) -> None: @@ -317,6 +316,18 @@ def publish(self, *args: Transform) -> None: def publish_static(self, *args: Transform) -> None: raise NotImplementedError("Static transforms not implemented in PubSubTF.") + def publish_all(self) -> None: + """Publish all transforms currently stored in all buffers.""" + all_transforms = [] + for buffer in self.buffers.values(): + # Get the latest transform from each buffer + latest = buffer.get() # get() with no args returns latest + if latest: + all_transforms.append(latest) + + if all_transforms: + self.publish(*all_transforms) + def get( self, parent_frame: str, diff --git a/dimos/robot/nav_bot.py b/dimos/robot/nav_bot.py new file mode 100644 index 0000000000..c8359c9a1c --- /dev/null +++ b/dimos/robot/nav_bot.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +NavBot class for navigation-related functionality. +Encapsulates ROS bridge and topic remapping for Unitree robots. +""" + +import logging +import time + +from dimos import core +from dimos.core import Module, In, Out, rpc +from dimos.msgs.geometry_msgs import PoseStamped, TwistStamped, Transform, Vector3 +from dimos.msgs.nav_msgs import Odometry +from dimos.msgs.sensor_msgs import PointCloud2, Joy, Image +from dimos.msgs.std_msgs import Bool +from dimos.msgs.tf2_msgs.TFMessage import TFMessage +from dimos.protocol.tf import TF +from dimos.robot.ros_bridge import ROSBridge, BridgeDirection +from dimos.utils.transform_utils import euler_to_quaternion +from geometry_msgs.msg import TwistStamped as ROSTwistStamped +from geometry_msgs.msg import PoseStamped as ROSPoseStamped +from nav_msgs.msg import Odometry as ROSOdometry +from sensor_msgs.msg import PointCloud2 as ROSPointCloud2, Joy as ROSJoy, Image as ROSImage +from std_msgs.msg import Bool as ROSBool +from tf2_msgs.msg import TFMessage as ROSTFMessage +from dimos.utils.logging_config import setup_logger +from dimos.protocol.pubsub.lcmpubsub import LCM, Topic + +logger = setup_logger("dimos.robot.unitree_webrtc.nav_bot", level=logging.INFO) + +############################################################ +# Navigation Module + +# first run unitree_g1.py to start the ROS bridge and webrtc connection and teleop +# python dimos/robot/unitree_webrtc/unitree_g1.py + + +# then deploy this module in any other run file. 
+############################################################ +class NavigationModule(Module): + goal_reached: In[Bool] = None + + goal_pose: Out[PoseStamped] = None + cancel_goal: Out[Bool] = None + joy: Out[Joy] = None + + def __init__(self, *args, **kwargs): + """Initialize NavigationModule.""" + Module.__init__(self, *args, **kwargs) + self.goal_reach = None + + @rpc + def start(self): + """Start the navigation module.""" + if self.goal_reached: + self.goal_reached.subscribe(self._on_goal_reached) + logger.info("NavigationModule started") + + def _on_goal_reached(self, msg: Bool): + """Handle goal reached status messages.""" + self.goal_reach = msg.data + + def _set_autonomy_mode(self): + """ + Set autonomy mode by publishing a Joy message. + """ + + joy_msg = Joy( + frame_id="dimos", + axes=[ + 0.0, # axis 0 + 0.0, # axis 1 + -1.0, # axis 2 + 0.0, # axis 3 + 1.0, # axis 4 + 1.0, # axis 5 + 0.0, # axis 6 + 0.0, # axis 7 + ], + buttons=[ + 0, # button 0 + 0, # button 1 + 0, # button 2 + 0, # button 3 + 0, # button 4 + 0, # button 5 + 0, # button 6 + 1, # button 7 - controls autonomy mode + 0, # button 8 + 0, # button 9 + 0, # button 10 + ], + ) + + if self.joy: + self.joy.publish(joy_msg) + logger.info("Setting autonomy mode via Joy message") + + @rpc + def go_to(self, pose: PoseStamped, timeout: float = 60.0) -> bool: + """ + Navigate to a target pose by publishing to LCM topics. + + Args: + pose: Target pose to navigate to + timeout: Maximum time to wait for goal (seconds) + + Returns: + True if the goal was reached within the timeout, False otherwise + """ + logger.info( + f"Navigating to goal: ({pose.position.x:.2f}, {pose.position.y:.2f}, {pose.position.z:.2f})" + ) + + self.goal_reach = None + self._set_autonomy_mode() + self.goal_pose.publish(pose) + + start_time = time.time() + while time.time() - start_time < timeout: + if self.goal_reach is not None: + return self.goal_reach + time.sleep(0.1) + + self.stop() + + logger.warning(f"Navigation timed out after {timeout} seconds") + return False + + @rpc + def stop(self) -> bool: + """ + Cancel current navigation by publishing to cancel_goal.
+ + Returns: + True if cancel command was sent successfully + """ + logger.info("Cancelling navigation") + + if self.cancel_goal: + cancel_msg = Bool(data=True) + self.cancel_goal.publish(cancel_msg) + return True + + return False + + +class TopicRemapModule(Module): + """Module that remaps Odometry to PoseStamped and publishes static transforms.""" + + odom: In[Odometry] = None + odom_pose: Out[PoseStamped] = None + + def __init__(self, sensor_to_base_link_transform=None, *args, **kwargs): + Module.__init__(self, *args, **kwargs) + self.tf = TF() + self.sensor_to_base_link_transform = sensor_to_base_link_transform or [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + + @rpc + def start(self): + self.odom.subscribe(self._publish_odom_pose) + logger.info("TopicRemapModule started") + + def _publish_odom_pose(self, msg: Odometry): + pose_msg = PoseStamped( + ts=msg.ts, + frame_id=msg.frame_id, + position=msg.pose.pose.position, + orientation=msg.pose.pose.orientation, + ) + self.odom_pose.publish(pose_msg) + + # Publish static transform from sensor to base_link + translation = Vector3( + self.sensor_to_base_link_transform[0], + self.sensor_to_base_link_transform[1], + self.sensor_to_base_link_transform[2], + ) + euler_angles = Vector3( + self.sensor_to_base_link_transform[3], + self.sensor_to_base_link_transform[4], + self.sensor_to_base_link_transform[5], + ) + rotation = euler_to_quaternion(euler_angles) + + sensor_to_base_link_tf = Transform( + translation=translation, + rotation=rotation, + frame_id="sensor", + child_frame_id="base_link", + ts=msg.ts, + ) + + # map to world static transform + map_to_world_tf = Transform( + translation=Vector3(0.0, 0.0, 0.0), + rotation=euler_to_quaternion(Vector3(0.0, 0.0, 0.0)), + frame_id="map", + child_frame_id="world", + ts=msg.ts, + ) + + self.tf.publish(sensor_to_base_link_tf, map_to_world_tf) + + +class NavBot: + """ + NavBot class for navigation-related functionality. + Manages ROS bridge and topic remapping for navigation. + """ + + def __init__(self, dimos=None, sensor_to_base_link_transform=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]): + """ + Initialize NavBot. 
+ + Args: + dimos: DIMOS instance (creates new one if None) + sensor_to_base_link_transform: Optional [x, y, z, roll, pitch, yaw] transform from sensor to base_link + """ + if dimos is None: + self.dimos = core.start(2) + else: + self.dimos = dimos + + self.sensor_to_base_link_transform = sensor_to_base_link_transform + self.ros_bridge = None + self.topic_remap_module = None + self.tf = TF() + self.lcm = LCM() + + def start(self): + if self.topic_remap_module: + self.topic_remap_module.start() + logger.info("Topic remap module started") + + if self.ros_bridge: + logger.info("ROS bridge started") + + def deploy_navigation_modules(self, bridge_name="nav_bot_ros_bridge"): + # Deploy topic remap module + logger.info("Deploying topic remap module...") + self.topic_remap_module = self.dimos.deploy( + TopicRemapModule, sensor_to_base_link_transform=self.sensor_to_base_link_transform + ) + self.topic_remap_module.odom.transport = core.LCMTransport("/odom", Odometry) + self.topic_remap_module.odom_pose.transport = core.LCMTransport("/odom_pose", PoseStamped) + + # Deploy ROS bridge + logger.info("Deploying ROS bridge...") + self.ros_bridge = ROSBridge(bridge_name) + + # Configure ROS topics + self.ros_bridge.add_topic( + "/cmd_vel", TwistStamped, ROSTwistStamped, direction=BridgeDirection.ROS_TO_DIMOS + ) + self.ros_bridge.add_topic( + "/state_estimation", + Odometry, + ROSOdometry, + direction=BridgeDirection.ROS_TO_DIMOS, + remap_topic="/odom", + ) + self.ros_bridge.add_topic( + "/tf", TFMessage, ROSTFMessage, direction=BridgeDirection.ROS_TO_DIMOS + ) + self.ros_bridge.add_topic( + "/registered_scan", PointCloud2, ROSPointCloud2, direction=BridgeDirection.ROS_TO_DIMOS + ) + self.ros_bridge.add_topic("/joy", Joy, ROSJoy, direction=BridgeDirection.DIMOS_TO_ROS) + # Navigation control topics from autonomy stack + self.ros_bridge.add_topic( + "/goal_pose", PoseStamped, ROSPoseStamped, direction=BridgeDirection.DIMOS_TO_ROS + ) + self.ros_bridge.add_topic( + "/cancel_goal", Bool, ROSBool, direction=BridgeDirection.DIMOS_TO_ROS + ) + self.ros_bridge.add_topic( + "/goal_reached", Bool, ROSBool, direction=BridgeDirection.ROS_TO_DIMOS + ) + + # self.ros_bridge.add_topic( + # "/camera/image", Image, ROSImage, direction=BridgeDirection.ROS_TO_DIMOS + # ) + + def _set_autonomy_mode(self): + """ + Set autonomy mode by publishing Joy message. + """ + + joy_msg = Joy( + frame_id="dimos", + axes=[ + 0.0, # axis 0 + 0.0, # axis 1 + -1.0, # axis 2 + 0.0, # axis 3 + 1.0, # axis 4 + 1.0, # axis 5 + 0.0, # axis 6 + 0.0, # axis 7 + ], + buttons=[ + 0, # button 0 + 0, # button 1 + 0, # button 2 + 0, # button 3 + 0, # button 4 + 0, # button 5 + 0, # button 6 + 1, # button 7 - controls autonomy mode + 0, # button 8 + 0, # button 9 + 0, # button 10 + ], + ) + + self.lcm.publish(Topic("/joy", Joy), joy_msg) + + def navigate_to_goal(self, pose: PoseStamped, blocking: bool = True, timeout: float = 30.0): + """Navigate to a target pose using ROS topics. + + Args: + pose: Target pose to navigate to + blocking: If True, block until goal is reached. If False, return immediately. 
+ timeout: Maximum time to wait for goal to be reached (seconds) + + Returns: + If blocking=True: True if navigation was successful, False otherwise + If blocking=False: True if goal was sent successfully + """ + logger.info( + f"Navigating to goal: ({pose.position.x:.2f}, {pose.position.y:.2f}, {pose.position.z:.2f})" + ) + + # Publish goal to /goal_pose topic + self._set_autonomy_mode() + goal_topic = Topic("/goal_pose", PoseStamped) + self.lcm.publish(goal_topic, pose) + + if not blocking: + return True + + # Wait for goal_reached signal + goal_reached_topic = Topic("/goal_reached", Bool) + start_time = time.time() + + while time.time() - start_time < timeout: + try: + msg = self.lcm.wait_for_message(goal_reached_topic, timeout=0.5) + if msg and msg.data: + logger.info("Navigation goal reached") + return True + elif msg and not msg.data: + logger.info("Navigation was cancelled or failed") + return False + except Exception: + # Timeout on wait_for_message, continue looping + pass + + logger.warning(f"Navigation timed out after {timeout} seconds") + return False + + def cancel_navigation(self) -> bool: + """Cancel the current navigation goal using ROS topics. + + Returns: + True if cancel command was sent successfully + """ + logger.info("Cancelling navigation goal") + + # Publish cancel command to /cancel_goal topic + cancel_topic = Topic("/cancel_goal", Bool) + cancel_msg = Bool(data=True) + self.lcm.publish(cancel_topic, cancel_msg) + + return True + + def shutdown(self): + logger.info("Shutting down navigation modules...") + + if self.ros_bridge is not None: + try: + self.ros_bridge.shutdown() + logger.info("ROS bridge shut down successfully") + except Exception as e: + logger.error(f"Error shutting down ROS bridge: {e}") diff --git a/dimos/robot/unitree_webrtc/modular/connection_module.py b/dimos/robot/unitree_webrtc/modular/connection_module.py index c6214c4f2c..6e13ed938e 100644 --- a/dimos/robot/unitree_webrtc/modular/connection_module.py +++ b/dimos/robot/unitree_webrtc/modular/connection_module.py @@ -18,6 +18,7 @@ import functools import logging import os +import queue import time import warnings from dataclasses import dataclass @@ -28,7 +29,8 @@ from reactivex import operators as ops from reactivex.observable import Observable -from dimos.core import In, LCMTransport, Module, ModuleConfig, Out, rpc, DimosCluster +from dimos.agents2 import Agent, Output, Reducer, Stream, skill +from dimos.core import DimosCluster, In, LCMTransport, Module, ModuleConfig, Out, rpc from dimos.msgs.foxglove_msgs import ImageAnnotations from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Twist, Vector3 from dimos.msgs.sensor_msgs.Image import Image, sharpness_window @@ -103,9 +105,7 @@ def odom_stream(self): @functools.cache def video_stream(self): print("video stream start") - video_store = TimedSensorReplay( - f"{self.dir_name}/video", - ) + video_store = TimedSensorReplay(f"{self.dir_name}/video") return video_store.stream(**self.replay_config) @@ -136,11 +136,27 @@ class ConnectionModule(Module): default_config = ConnectionModuleConfig + # mega temporary, skill should have a limit decorator for number of + # parallel calls + video_running: bool = False + def __init__(self, connection_type: str = "webrtc", *args, **kwargs): self.connection_config = kwargs self.connection_type = connection_type Module.__init__(self, *args, **kwargs) + @skill(stream=Stream.passive, output=Output.image, reducer=Reducer.latest) + def video_stream_tool(self) -> Image: + """implicit video stream 
skill, don't call this directly""" + if self.video_running: + return "video stream already running" + self.video_running = True + _queue = queue.Queue(maxsize=1) + self.connection.video_stream().subscribe(_queue.put) + + for image in iter(_queue.get, None): + yield image + @rpc def record(self, recording_name: str): lidar_store: TimedSensorStorage = TimedSensorStorage(f"{recording_name}/lidar") @@ -183,10 +199,8 @@ def resize(image: Image) -> Image: int(originalwidth / image_resize_factor), int(originalheight / image_resize_factor) ) - sharpness = sharpness_window(10, self.connection.video_stream()) - sharpness.subscribe(self.video.publish) - # self.connection.video_stream().subscribe(self.video.publish) - + self.connection.video_stream().subscribe(self.video.publish) + # sharpness_window(15.0, self.connection.video_stream()).subscribe(self.video.publish) # self.connection.video_stream().pipe(ops.map(resize)).subscribe(self.video.publish) self.camera_info_stream().subscribe(self.camera_info.publish) self.movecmd.subscribe(self.connection.move) @@ -278,8 +292,8 @@ def camera_info_stream(self) -> Observable[CameraInfo]: def deploy_connection(dimos: DimosCluster, **kwargs): - # foxglove_bridge = dimos.deploy(FoxgloveBridge) - # foxglove_bridge.start() + foxglove_bridge = dimos.deploy(FoxgloveBridge) + foxglove_bridge.start() connection = dimos.deploy( ConnectionModule, diff --git a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py index 5893248530..73927cf248 100644 --- a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py +++ b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py @@ -16,7 +16,9 @@ import time from dimos_lcm.sensor_msgs import CameraInfo +from lcm_msgs.foxglove_msgs import SceneUpdate +from dimos.agents2.spec import Model, Provider from dimos.core import LCMTransport, start # from dimos.msgs.detection2d import Detection2DArray @@ -24,6 +26,7 @@ from dimos.msgs.sensor_msgs import Image, PointCloud2 from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection2d import Detection3DModule +from dimos.perception.detection2d.moduleDB import ObjectDBModule from dimos.protocol.pubsub import lcm from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule @@ -34,15 +37,22 @@ def detection_unitree(): dimos = start(6) - connection = deploy_connection(dimos) - connection.start() - # connection.record("unitree_go2_office_walk2") # mapper = deploy_navigation(dimos, connection) + # mapper.start() + + def goto(pose): + print("NAVIGATION REQUESTED:", pose) + return True - module3D = dimos.deploy(Detection3DModule, camera_info=ConnectionModule._camera_info()) + module3D = dimos.deploy( + ObjectDBModule, + goto=goto, + camera_info=ConnectionModule._camera_info(), + ) module3D.image.connect(connection.video) + # module3D.pointcloud.connect(mapper.global_map) module3D.pointcloud.connect(connection.lidar) module3D.annotations.transport = LCMTransport("/annotations", ImageAnnotations) @@ -55,16 +65,37 @@ def detection_unitree(): module3D.detected_image_0.transport = LCMTransport("/detected/image/0", Image) module3D.detected_image_1.transport = LCMTransport("/detected/image/1", Image) module3D.detected_image_2.transport = LCMTransport("/detected/image/2", Image) + + module3D.scene_update.transport = LCMTransport("/scene_update", SceneUpdate) + module3D.start() - # detection.start() + connection.start() + + from dimos.agents2 
import Agent, Output, Reducer, Stream, skill + from dimos.agents2.cli.human import HumanInput + + agent = Agent( + system_prompt="You are a helpful assistant for controlling a Unitree Go2 robot. ", + model=Model.GPT_4O, # Could add CLAUDE models to enum + provider=Provider.OPENAI, # Would need ANTHROPIC provider + ) + + human_input = dimos.deploy(HumanInput) + agent.register_skills(human_input) + # agent.register_skills(connection) + agent.register_skills(module3D) + + # agent.run_implicit_skill("video_stream_tool") + agent.run_implicit_skill("human") + + agent.start() + agent.loop_thread() try: while True: time.sleep(1) except KeyboardInterrupt: connection.stop() - # mapper.stop() - # detection.stop() logger.info("Shutting down...") diff --git a/dimos/robot/unitree_webrtc/modular/navigation.py b/dimos/robot/unitree_webrtc/modular/navigation.py index 8ceaf0e195..c37cac700a 100644 --- a/dimos/robot/unitree_webrtc/modular/navigation.py +++ b/dimos/robot/unitree_webrtc/modular/navigation.py @@ -27,7 +27,7 @@ def deploy_navigation(dimos, connection): - mapper = dimos.deploy(Map, voxel_size=0.5, cost_resolution=0.05, global_publish_interval=1.0) + mapper = dimos.deploy(Map, voxel_size=0.5, cost_resolution=0.05, global_publish_interval=0.5) mapper.lidar.connect(connection.lidar) mapper.global_map.transport = LCMTransport("/global_map", LidarMessage) mapper.global_costmap.transport = LCMTransport("/global_costmap", OccupancyGrid) diff --git a/dimos/robot/unitree_webrtc/rosnav.py b/dimos/robot/unitree_webrtc/rosnav.py new file mode 100644 index 0000000000..969ddad950 --- /dev/null +++ b/dimos/robot/unitree_webrtc/rosnav.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import time + +from dimos import core +from dimos.core import Module, In, Out, rpc +from dimos.msgs.geometry_msgs import PoseStamped, TwistStamped, Transform, Vector3 +from dimos.msgs.nav_msgs import Odometry +from dimos.msgs.sensor_msgs import PointCloud2, Joy +from dimos.msgs.std_msgs.Bool import Bool +from dimos.msgs.std_msgs.Header import Header +from dimos.msgs.tf2_msgs.TFMessage import TFMessage +from dimos.protocol.tf import TF +from dimos.robot.ros_bridge import ROSBridge, BridgeDirection +from dimos.utils.transform_utils import euler_to_quaternion +from geometry_msgs.msg import TwistStamped as ROSTwistStamped +from geometry_msgs.msg import PoseStamped as ROSPoseStamped +from nav_msgs.msg import Odometry as ROSOdometry +from sensor_msgs.msg import PointCloud2 as ROSPointCloud2 +from std_msgs.msg import Bool as ROSBool +from tf2_msgs.msg import TFMessage as ROSTFMessage +from dimos.utils.logging_config import setup_logger +from dimos.protocol.pubsub.lcmpubsub import LCM, Topic + +logger = setup_logger("dimos.robot.unitree_webrtc.nav_bot", level=logging.INFO) + + +class NavigationModule(Module): + goal_pose: Out[PoseStamped] = None + goal_reached: In[Bool] = None + cancel_goal: Out[Bool] = None + joy: Out[Joy] = None + + def __init__(self, *args, **kwargs): + """Initialize NavigationModule.""" + Module.__init__(self, *args, **kwargs) + self.goal_reach = None + + @rpc + def start(self): + """Start the navigation module.""" + if self.goal_reached: + self.goal_reached.subscribe(self._on_goal_reached) + logger.info("NavigationModule started") + + def _on_goal_reached(self, msg: Bool): + """Handle goal reached status messages.""" + self.goal_reach = msg.data + + def _set_autonomy_mode(self): + """ + Set autonomy mode by publishing Joy message. + """ + + joy_msg = Joy( + frame_id="dimos", + axes=[ + 0.0, # axis 0 + 0.0, # axis 1 + -1.0, # axis 2 + 0.0, # axis 3 + 1.0, # axis 4 + 1.0, # axis 5 + 0.0, # axis 6 + 0.0, # axis 7 + ], + buttons=[ + 0, # button 0 + 0, # button 1 + 0, # button 2 + 0, # button 3 + 0, # button 4 + 0, # button 5 + 0, # button 6 + 1, # button 7 - controls autonomy mode + 0, # button 8 + 0, # button 9 + 0, # button 10 + ], + ) + + if self.joy: + self.joy.publish(joy_msg) + logger.info(f"Setting autonomy mode via Joy message") + + @rpc + def go_to(self, pose: PoseStamped, timeout: float = 60.0) -> bool: + """ + Navigate to a target pose by publishing to LCM topics. + + Args: + pose: Target pose to navigate to + blocking: If True, block until goal is reached + timeout: Maximum time to wait for goal (seconds) + + Returns: + True if navigation was successful (or started if non-blocking) + """ + logger.info( + f"Navigating to goal: ({pose.position.x:.2f}, {pose.position.y:.2f}, {pose.position.z:.2f})" + ) + + self.goal_reach = None + self._set_autonomy_mode() + self.goal_pose.publish(pose) + time.sleep(0.2) + self.goal_pose.publish(pose) + + start_time = time.time() + while time.time() - start_time < timeout: + if self.goal_reach is not None: + return self.goal_reach + time.sleep(0.1) + + self.stop() + + logger.warning(f"Navigation timed out after {timeout} seconds") + return False + + @rpc + def stop(self) -> bool: + """ + Cancel current navigation by publishing to cancel_goal. 
+ + Returns: + True if cancel command was sent successfully + """ + logger.info("Cancelling navigation") + + if self.cancel_goal: + cancel_msg = Bool(data=True) + self.cancel_goal.publish(cancel_msg) + return True + + return False diff --git a/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py b/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py index c54f124f02..e595a1adde 100644 --- a/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py +++ b/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py @@ -271,7 +271,7 @@ def test_watchdog_timing_accuracy(self): # Check timing (should be close to 200ms + up to 50ms watchdog interval) elapsed = timeout_time - start_time print(f"\nWatchdog timeout occurred at exactly {elapsed:.3f} seconds") - assert 0.19 <= elapsed <= 0.26, f"Watchdog timed out at {elapsed:.3f}s, expected ~0.2-0.25s" + assert 0.19 <= elapsed <= 0.3, f"Watchdog timed out at {elapsed:.3f}s, expected ~0.2-0.25s" conn.running = False conn.watchdog_running = False diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py index 0011eac760..08a23bc2dc 100644 --- a/dimos/robot/unitree_webrtc/unitree_g1.py +++ b/dimos/robot/unitree_webrtc/unitree_g1.py @@ -18,43 +18,62 @@ Minimal implementation using WebRTC connection for robot control. """ +import logging import os import time -import logging from typing import Optional +from dimos_lcm.foxglove_msgs import SceneUpdate +from geometry_msgs.msg import PoseStamped as ROSPoseStamped +from geometry_msgs.msg import TwistStamped as ROSTwistStamped +from nav_msgs.msg import Odometry as ROSOdometry +from sensor_msgs.msg import Image as ROSImage +from sensor_msgs.msg import Joy as ROSJoy +from sensor_msgs.msg import PointCloud2 as ROSPointCloud2 +from tf2_msgs.msg import TFMessage as ROSTFMessage + from dimos import core -from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE, DEFAULT_CAPACITY_DEPTH_IMAGE -from dimos.core import Module, In, Out, rpc -from dimos.msgs.geometry_msgs import PoseStamped, Twist, TwistStamped +from dimos.agents2 import Agent +from dimos.agents2.cli.human import HumanInput +from dimos.agents2.skills.ros_navigation import RosNavigation +from dimos.agents2.spec import Model, Provider +from dimos.core import In, Module, Out, rpc +from dimos.hardware.camera import zed +from dimos.hardware.camera.module import CameraModule +from dimos.hardware.camera.webcam import Webcam +from dimos.msgs.foxglove_msgs import ImageAnnotations +from dimos.msgs.geometry_msgs import ( + PoseStamped, + Quaternion, + Transform, + Twist, + TwistStamped, + Vector3, +) from dimos.msgs.nav_msgs.Odometry import Odometry -from dimos.msgs.sensor_msgs import CameraInfo, PointCloud2 +from dimos.msgs.sensor_msgs import CameraInfo, Image, Joy, PointCloud2 +from dimos.msgs.std_msgs.Bool import Bool from dimos.msgs.tf2_msgs.TFMessage import TFMessage +from dimos.msgs.vision_msgs import Detection2DArray +from dimos.perception.detection2d import Detection3DModule +from dimos.perception.detection2d.moduleDB import ObjectDBModule +from dimos.perception.spatial_perception import SpatialMemory from dimos.protocol import pubsub from dimos.protocol.pubsub.lcmpubsub import LCM from dimos.robot.foxglove_bridge import FoxgloveBridge -from dimos.web.websocket_vis.websocket_vis_module import WebsocketVisModule +from dimos.robot.robot import Robot +from dimos.robot.ros_bridge import BridgeDirection, ROSBridge from dimos.robot.unitree_webrtc.connection import UnitreeWebRTCConnection +from 
dimos.robot.unitree_webrtc.rosnav import NavigationModule +from dimos.robot.unitree_webrtc.unitree_g1_skill_container import UnitreeG1SkillContainer from dimos.robot.unitree_webrtc.unitree_skills import MyUnitreeSkills -from dimos.robot.ros_bridge import ROSBridge, BridgeDirection -from geometry_msgs.msg import TwistStamped as ROSTwistStamped -from nav_msgs.msg import Odometry as ROSOdometry -from sensor_msgs.msg import PointCloud2 as ROSPointCloud2 -from tf2_msgs.msg import TFMessage as ROSTFMessage from dimos.skills.skills import SkillLibrary -from dimos.robot.robot import Robot - from dimos.types.robot_capabilities import RobotCapability from dimos.utils.logging_config import setup_logger +from dimos.web.websocket_vis.websocket_vis_module import WebsocketVisModule logger = setup_logger("dimos.robot.unitree_webrtc.unitree_g1", level=logging.INFO) -try: - from dimos.hardware.camera.zed import ZEDModule -except ImportError: - logger.warning("ZEDModule not found. Please install pyzed to use ZED camera functionality.") - ZEDModule = None - # Suppress verbose loggers logging.getLogger("aiortc.codecs.h264").setLevel(logging.ERROR) logging.getLogger("lcm_foxglove_bridge").setLevel(logging.ERROR) @@ -123,6 +142,7 @@ def __init__( enable_joystick: bool = False, enable_connection: bool = True, enable_ros_bridge: bool = True, + enable_perception: bool = False, enable_camera: bool = False, ): """Initialize the G1 robot. @@ -137,7 +157,7 @@ def __init__( enable_joystick: Enable pygame joystick control enable_connection: Enable robot connection module enable_ros_bridge: Enable ROS bridge - enable_camera: Enable ZED camera module + enable_camera: Enable web camera module """ super().__init__() self.ip = ip @@ -147,6 +167,7 @@ def __init__( self.enable_joystick = enable_joystick self.enable_connection = enable_connection self.enable_ros_bridge = enable_ros_bridge + self.enable_perception = enable_perception self.enable_camera = enable_camera self.websocket_port = websocket_port self.lcm = LCM() @@ -166,28 +187,71 @@ def __init__( self.connection = None self.websocket_vis = None self.foxglove_bridge = None + self.spatial_memory_module = None self.joystick = None self.ros_bridge = None - self.zed_camera = None - + self.camera = None self._setup_directories() def _setup_directories(self): - """Setup output directories.""" + """Setup directories for spatial memory storage.""" os.makedirs(self.output_dir, exist_ok=True) logger.info(f"Robot outputs will be saved to: {self.output_dir}") + # Initialize memory directories + self.memory_dir = os.path.join(self.output_dir, "memory") + os.makedirs(self.memory_dir, exist_ok=True) + + # Initialize spatial memory properties + self.spatial_memory_dir = os.path.join(self.memory_dir, "spatial_memory") + self.spatial_memory_collection = "spatial_memory" + self.db_path = os.path.join(self.spatial_memory_dir, "chromadb_data") + self.visual_memory_path = os.path.join(self.spatial_memory_dir, "visual_memory.pkl") + + # Create spatial memory directories + os.makedirs(self.spatial_memory_dir, exist_ok=True) + os.makedirs(self.db_path, exist_ok=True) + + def _deploy_detection(self, goto): + detection = self.dimos.deploy( + ObjectDBModule, goto=goto, camera_info=zed.CameraInfo.SingleWebcam + ) + + detection.image.connect(self.camera.image) + detection.pointcloud.transport = core.LCMTransport("/map", PointCloud2) + + detection.annotations.transport = core.LCMTransport("/annotations", ImageAnnotations) + detection.detections.transport = core.LCMTransport("/detections", 
Detection2DArray) + + detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate) + detection.target.transport = core.LCMTransport("/target", PoseStamped) + detection.detected_pointcloud_0.transport = core.LCMTransport( + "/detected/pointcloud/0", PointCloud2 + ) + detection.detected_pointcloud_1.transport = core.LCMTransport( + "/detected/pointcloud/1", PointCloud2 + ) + detection.detected_pointcloud_2.transport = core.LCMTransport( + "/detected/pointcloud/2", PointCloud2 + ) + + detection.detected_image_0.transport = core.LCMTransport("/detected/image/0", Image) + detection.detected_image_1.transport = core.LCMTransport("/detected/image/1", Image) + detection.detected_image_2.transport = core.LCMTransport("/detected/image/2", Image) + + self.detection = detection + def start(self): """Start the robot system with all modules.""" - self.dimos = core.start(4) # 2 workers for connection and visualization + self.dimos = core.start(8) # 2 workers for connection and visualization if self.enable_connection: self._deploy_connection() self._deploy_visualization() - if self.enable_camera: - self._deploy_camera() + if self.enable_perception: + self._deploy_perception() if self.enable_joystick: self._deploy_joystick() @@ -195,12 +259,65 @@ def start(self): if self.enable_ros_bridge: self._deploy_ros_bridge() - self._start_modules() + self.nav = self.dimos.deploy(NavigationModule) + self.nav.goal_reached.transport = core.LCMTransport("/goal_reached", Bool) + self.nav.goal_pose.transport = core.LCMTransport("/goal_pose", PoseStamped) + self.nav.cancel_goal.transport = core.LCMTransport("/cancel_goal", Bool) + self.nav.joy.transport = core.LCMTransport("/joy", Joy) + self.nav.start() + + self._deploy_camera() + self._deploy_detection(self.nav.go_to) self.lcm.start() + # Setup agent with G1 skills + logger.info("Setting up agent with G1 skills...") + + agent = Agent( + system_prompt="You are a helpful assistant controlling a Unitree G1 humanoid robot. 
You can control the robot's arms, movement modes, and navigation.", + model=Model.GPT_4O, + provider=Provider.OPENAI, + ) + + # Register G1-specific skill container + g1_skills = UnitreeG1SkillContainer(robot=self) + agent.register_skills(g1_skills) + + human_input = self.dimos.deploy(HumanInput) + agent.register_skills(human_input) + + if self.enable_perception: + agent.register_skills(self.detection) + + # Register ROS navigation + ros_nav = RosNavigation(self) + ros_nav.__enter__() + agent.register_skills(ros_nav) + + agent.run_implicit_skill("human") + agent.start() + + # For logging + skills = [tool.name for tool in agent.get_tools()] + logger.info(f"Agent configured with {len(skills)} skills: {', '.join(skills)}") + + agent.loop_thread() + logger.info("UnitreeG1 initialized and started") logger.info(f"WebSocket visualization available at http://localhost:{self.websocket_port}") + self._start_modules() + + def stop(self): + self.lcm.stop() + + def __enter__(self) -> "UnitreeG1": + self.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + return False def _deploy_connection(self): """Deploy and configure the connection module.""" @@ -212,46 +329,28 @@ def _deploy_connection(self): self.connection.odom_pose.transport = core.LCMTransport("/odom", PoseStamped) def _deploy_camera(self): - """Deploy and configure the ZED camera module (real or fake based on replay_path).""" - - if self.replay_path: - # Use FakeZEDModule for replay - from dimos.hardware.fake_zed_module import FakeZEDModule - - logger.info(f"Deploying FakeZEDModule for replay from: {self.replay_path}") - self.zed_camera = self.dimos.deploy( - FakeZEDModule, - recording_path=self.replay_path, - frame_id="zed_camera", - ) - else: - # Use real ZEDModule (with optional recording) - logger.info("Deploying ZED camera module...") - self.zed_camera = self.dimos.deploy( - ZEDModule, - camera_id=0, - resolution="HD720", - depth_mode="NEURAL", - fps=30, - enable_tracking=True, # Enable for G1 pose estimation - enable_imu_fusion=True, - set_floor_as_origin=True, - publish_rate=30.0, - frame_id="zed_camera", - recording_path=self.recording_path, # Pass recording path if provided - ) - - # Configure ZED LCM transports (same for both real and fake) - self.zed_camera.color_image.transport = core.pSHMTransport( - "/zed/color_image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE - ) - self.zed_camera.depth_image.transport = core.pSHMTransport( - "/zed/depth_image", default_capacity=DEFAULT_CAPACITY_DEPTH_IMAGE + """Deploy and configure a standard webcam module.""" + logger.info("Deploying standard webcam module...") + + self.camera = self.dimos.deploy( + CameraModule, + transform=Transform( + translation=Vector3(0.05, 0.0, 0.0), + rotation=Quaternion.from_euler(Vector3(0.0, 0.2, 0.0)), + frame_id="sensor", + child_frame_id="camera_link", + ), + hardware=lambda: Webcam( + camera_index=0, + frequency=15, + stereo_slice="left", + camera_info=zed.CameraInfo.SingleWebcam, + ), ) - self.zed_camera.camera_info.transport = core.LCMTransport("/zed/camera_info", CameraInfo) - self.zed_camera.pose.transport = core.LCMTransport("/zed/pose", PoseStamped) - logger.info("ZED camera module configured") + self.camera.image.transport = core.LCMTransport("/image", Image) + self.camera.camera_info.transport = core.LCMTransport("/camera_info", CameraInfo) + logger.info("Webcam module configured") def _deploy_visualization(self): """Deploy and configure visualization modules.""" @@ -269,6 +368,20 @@ def 
_deploy_visualization(self): ] ) + def _deploy_perception(self): + self.spatial_memory_module = self.dimos.deploy( + SpatialMemory, + collection_name=self.spatial_memory_collection, + db_path=self.db_path, + visual_memory_path=self.visual_memory_path, + output_dir=self.spatial_memory_dir, + ) + + self.spatial_memory_module.video.transport = core.LCMTransport("/image", Image) + self.spatial_memory_module.odom.transport = core.LCMTransport("/odom", PoseStamped) + + logger.info("Spatial memory module deployed and connected") + def _deploy_joystick(self): """Deploy joystick control module.""" from dimos.robot.unitree_webrtc.g1_joystick_module import G1JoystickModule @@ -297,9 +410,30 @@ def _deploy_ros_bridge(self): "/tf", TFMessage, ROSTFMessage, direction=BridgeDirection.ROS_TO_DIMOS ) - # Add /registered_scan topic from ROS to DIMOS + from geometry_msgs.msg import PoseStamped as ROSPoseStamped + from std_msgs.msg import Bool as ROSBool + + from dimos.msgs.std_msgs import Bool + + # Navigation control topics from autonomy stack + self.ros_bridge.add_topic( + "/goal_pose", PoseStamped, ROSPoseStamped, direction=BridgeDirection.DIMOS_TO_ROS + ) + self.ros_bridge.add_topic( + "/cancel_goal", Bool, ROSBool, direction=BridgeDirection.DIMOS_TO_ROS + ) + self.ros_bridge.add_topic( + "/goal_reached", Bool, ROSBool, direction=BridgeDirection.ROS_TO_DIMOS + ) + + self.ros_bridge.add_topic("/joy", Joy, ROSJoy, direction=BridgeDirection.DIMOS_TO_ROS) + self.ros_bridge.add_topic( - "/registered_scan", PointCloud2, ROSPointCloud2, direction=BridgeDirection.ROS_TO_DIMOS + "/registered_scan", + PointCloud2, + ROSPointCloud2, + direction=BridgeDirection.ROS_TO_DIMOS, + remap_topic="/map", ) logger.info( @@ -310,11 +444,17 @@ def _start_modules(self): """Start all deployed modules.""" if self.connection: self.connection.start() - self.websocket_vis.start() + # self.websocket_vis.start() self.foxglove_bridge.start() - if self.joystick: - self.joystick.start() + # if self.joystick: + # self.joystick.start() + + self.camera.start() + self.detection.start() + + if self.enable_perception: + self.spatial_memory_module.start() # Initialize skills after connection is established if self.skill_library is not None: @@ -356,11 +496,16 @@ def shutdown(self): logger.info("UnitreeG1 shutdown complete") + @property + def spatial_memory(self) -> Optional[SpatialMemory]: + return self.spatial_memory_module + def main(): """Main entry point for testing.""" - import os import argparse + import os + from dotenv import load_dotenv load_dotenv() @@ -368,7 +513,7 @@ def main(): parser = argparse.ArgumentParser(description="Unitree G1 Humanoid Robot Control") parser.add_argument("--ip", default=os.getenv("ROBOT_IP"), help="Robot IP address") parser.add_argument("--joystick", action="store_true", help="Enable pygame joystick control") - parser.add_argument("--camera", action="store_true", help="Enable ZED camera module") + parser.add_argument("--camera", action="store_true", help="Enable usb camera module") parser.add_argument("--output-dir", help="Output directory for logs/data") parser.add_argument("--record", help="Path to save recording") parser.add_argument("--replay", help="Path to replay recording from") @@ -386,9 +531,23 @@ def main(): enable_camera=args.camera, enable_connection=os.getenv("ROBOT_IP") is not None, enable_ros_bridge=True, + enable_perception=True, ) robot.start() + # time.sleep(7) + # print("Starting navigation...") + # print( + # robot.nav.go_to( + # PoseStamped( + # ts=time.time(), + # frame_id="map", + # 
position=Vector3(0.0, 0.0, 0.03), + # orientation=Quaternion(0, 0, 0, 0), + # ), + # timeout=10, + # ), + # ) try: if args.joystick: print("\n" + "=" * 50) diff --git a/dimos/robot/unitree_webrtc/unitree_g1_skill_container.py b/dimos/robot/unitree_webrtc/unitree_g1_skill_container.py new file mode 100644 index 0000000000..ff1bcfad1f --- /dev/null +++ b/dimos/robot/unitree_webrtc/unitree_g1_skill_container.py @@ -0,0 +1,225 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Unitree G1 skill container for the new agents2 framework. +Dynamically generates skills for G1 humanoid robot including arm controls and movement modes. +""" + +from __future__ import annotations + +import datetime +import time +from typing import TYPE_CHECKING, Optional, Union + +from dimos.msgs.geometry_msgs import Twist, TwistStamped, Vector3 +from dimos.protocol.skill.skill import skill +from dimos.protocol.skill.type import Reducer, Stream +from dimos.robot.unitree_webrtc.unitree_skill_container import UnitreeSkillContainer +from dimos.utils.logging_config import setup_logger + +if TYPE_CHECKING: + from dimos.robot.unitree_webrtc.unitree_g1 import UnitreeG1 + from dimos.robot.unitree_webrtc.unitree_go2 import UnitreeGo2 + +logger = setup_logger("dimos.robot.unitree_webrtc.unitree_g1_skill_container") + +# G1 Arm Actions - all use api_id 7106 on topic "rt/api/arm/request" +G1_ARM_CONTROLS = [ + ("Handshake", 27, "Perform a handshake gesture with the right hand."), + ("HighFive", 18, "Give a high five with the right hand."), + ("Hug", 19, "Perform a hugging gesture with both arms."), + ("HighWave", 26, "Wave with the hand raised high."), + ("Clap", 17, "Clap hands together."), + ("FaceWave", 25, "Wave near the face level."), + ("LeftKiss", 12, "Blow a kiss with the left hand."), + ("ArmHeart", 20, "Make a heart shape with both arms overhead."), + ("RightHeart", 21, "Make a heart gesture with the right hand."), + ("HandsUp", 15, "Raise both hands up in the air."), + ("XRay", 24, "Hold arms in an X-ray pose position."), + ("RightHandUp", 23, "Raise only the right hand up."), + ("Reject", 22, "Make a rejection or 'no' gesture."), + ("CancelAction", 99, "Cancel any current arm action and return hands to neutral position."), +] + +# G1 Movement Modes - all use api_id 7101 on topic "rt/api/sport/request" +G1_MODE_CONTROLS = [ + ("WalkMode", 500, "Switch to normal walking mode."), + ("WalkControlWaist", 501, "Switch to walking mode with waist control."), + ("RunMode", 801, "Switch to running mode."), +] + + +class UnitreeG1SkillContainer(UnitreeSkillContainer): + """Container for Unitree G1 humanoid robot skills. + + Inherits all Go2 skills and adds G1-specific arm controls and movement modes. + """ + + def __init__(self, robot: Optional[Union[UnitreeG1, UnitreeGo2]] = None): + """Initialize the skill container with robot reference. 
+ + Args: + robot: The UnitreeG1 or UnitreeGo2 robot instance + """ + # Initialize parent class to get all base Unitree skills + super().__init__(robot) + + # Add G1-specific skills on top + self._generate_arm_skills() + self._generate_mode_skills() + + def _generate_arm_skills(self): + """Dynamically generate arm control skills from G1_ARM_CONTROLS list.""" + logger.info(f"Generating {len(G1_ARM_CONTROLS)} G1 arm control skills") + + for name, data_value, description in G1_ARM_CONTROLS: + skill_name = self._convert_to_snake_case(name) + self._create_arm_skill(skill_name, data_value, description, name) + + def _generate_mode_skills(self): + """Dynamically generate movement mode skills from G1_MODE_CONTROLS list.""" + logger.info(f"Generating {len(G1_MODE_CONTROLS)} G1 movement mode skills") + + for name, data_value, description in G1_MODE_CONTROLS: + skill_name = self._convert_to_snake_case(name) + self._create_mode_skill(skill_name, data_value, description, name) + + def _create_arm_skill( + self, skill_name: str, data_value: int, description: str, original_name: str + ): + """Create a dynamic arm control skill method with the @skill decorator. + + Args: + skill_name: Snake_case name for the method + data_value: The arm action data value + description: Human-readable description + original_name: Original CamelCase name for display + """ + + def dynamic_skill_func(self) -> str: + """Dynamic arm skill function.""" + return self._execute_arm_command(data_value, original_name) + + # Set the function's metadata + dynamic_skill_func.__name__ = skill_name + dynamic_skill_func.__doc__ = description + + # Apply the @skill decorator + decorated_skill = skill()(dynamic_skill_func) + + # Bind the method to the instance + bound_method = decorated_skill.__get__(self, self.__class__) + + # Add it as an attribute + setattr(self, skill_name, bound_method) + + logger.debug(f"Generated arm skill: {skill_name} (data={data_value})") + + def _create_mode_skill( + self, skill_name: str, data_value: int, description: str, original_name: str + ): + """Create a dynamic movement mode skill method with the @skill decorator. + + Args: + skill_name: Snake_case name for the method + data_value: The mode data value + description: Human-readable description + original_name: Original CamelCase name for display + """ + + def dynamic_skill_func(self) -> str: + """Dynamic mode skill function.""" + return self._execute_mode_command(data_value, original_name) + + # Set the function's metadata + dynamic_skill_func.__name__ = skill_name + dynamic_skill_func.__doc__ = description + + # Apply the @skill decorator + decorated_skill = skill()(dynamic_skill_func) + + # Bind the method to the instance + bound_method = decorated_skill.__get__(self, self.__class__) + + # Add it as an attribute + setattr(self, skill_name, bound_method) + + logger.debug(f"Generated mode skill: {skill_name} (data={data_value})") + + # ========== Override Skills for G1 ========== + + @skill() + def move(self, x: float, y: float = 0.0, yaw: float = 0.0, duration: float = 0.0) -> str: + """Move the robot using direct velocity commands (G1 version with TwistStamped). 
+ + Args: + x: Forward velocity (m/s) + y: Left/right velocity (m/s) + yaw: Rotational velocity (rad/s) + duration: How long to move (seconds) + """ + if self._robot is None: + return "Error: Robot not connected" + + # G1 uses TwistStamped instead of Twist + twist_stamped = TwistStamped(linear=Vector3(x, y, 0), angular=Vector3(0, 0, yaw)) + self._robot.move(twist_stamped, duration=duration) + return f"Started moving with velocity=({x}, {y}, {yaw}) for {duration} seconds" + + # ========== Helper Methods ========== + + def _execute_arm_command(self, data_value: int, name: str) -> str: + """Execute an arm command through WebRTC interface. + + Args: + data_value: The arm action data value + name: Human-readable name of the command + """ + if self._robot is None: + return f"Error: Robot not connected (cannot execute {name})" + + try: + result = self._robot.connection.publish_request( + "rt/api/arm/request", {"api_id": 7106, "parameter": {"data": data_value}} + ) + message = f"G1 arm action {name} executed successfully (data={data_value})" + logger.info(message) + return message + except Exception as e: + error_msg = f"Failed to execute G1 arm action {name}: {e}" + logger.error(error_msg) + return error_msg + + def _execute_mode_command(self, data_value: int, name: str) -> str: + """Execute a movement mode command through WebRTC interface. + + Args: + data_value: The mode data value + name: Human-readable name of the command + """ + if self._robot is None: + return f"Error: Robot not connected (cannot execute {name})" + + try: + result = self._robot.connection.publish_request( + "rt/api/sport/request", {"api_id": 7101, "parameter": {"data": data_value}} + ) + message = f"G1 mode {name} activated successfully (data={data_value})" + logger.info(message) + return message + except Exception as e: + error_msg = f"Failed to execute G1 mode {name}: {e}" + logger.error(error_msg) + return error_msg diff --git a/dimos/types/test_timestamped.py b/dimos/types/test_timestamped.py index 44f359dbc7..a4617774ef 100644 --- a/dimos/types/test_timestamped.py +++ b/dimos/types/test_timestamped.py @@ -268,6 +268,7 @@ def test_time_window_collection(): assert window.end_ts == 5.5 +@pytest.mark.tofix def test_timestamp_alignment(): # Create a dedicated scheduler for this test to avoid thread leaks test_scheduler = ThreadPoolScheduler(max_workers=6) diff --git a/dimos/types/test_weaklist.py b/dimos/types/test_weaklist.py new file mode 100644 index 0000000000..fd0b05d74f --- /dev/null +++ b/dimos/types/test_weaklist.py @@ -0,0 +1,163 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for WeakList implementation.""" + +import gc +import pytest +from dimos.types.weaklist import WeakList + + +class TestObject: + """Simple test object.""" + + def __init__(self, value): + self.value = value + + def __repr__(self): + return f"TestObject({self.value})" + + +def test_weaklist_basic_operations(): + """Test basic append, iterate, and length operations.""" + wl = WeakList() + + # Add objects + obj1 = TestObject(1) + obj2 = TestObject(2) + obj3 = TestObject(3) + + wl.append(obj1) + wl.append(obj2) + wl.append(obj3) + + # Check length and iteration + assert len(wl) == 3 + assert list(wl) == [obj1, obj2, obj3] + + # Check contains + assert obj1 in wl + assert obj2 in wl + assert TestObject(4) not in wl + + +def test_weaklist_auto_removal(): + """Test that objects are automatically removed when garbage collected.""" + wl = WeakList() + + obj1 = TestObject(1) + obj2 = TestObject(2) + obj3 = TestObject(3) + + wl.append(obj1) + wl.append(obj2) + wl.append(obj3) + + assert len(wl) == 3 + + # Delete one object and force garbage collection + del obj2 + gc.collect() + + # Should only have 2 objects now + assert len(wl) == 2 + assert list(wl) == [obj1, obj3] + + +def test_weaklist_explicit_remove(): + """Test explicit removal of objects.""" + wl = WeakList() + + obj1 = TestObject(1) + obj2 = TestObject(2) + + wl.append(obj1) + wl.append(obj2) + + # Remove obj1 + wl.remove(obj1) + assert len(wl) == 1 + assert obj1 not in wl + assert obj2 in wl + + # Try to remove non-existent object + with pytest.raises(ValueError): + wl.remove(TestObject(3)) + + +def test_weaklist_indexing(): + """Test index access.""" + wl = WeakList() + + obj1 = TestObject(1) + obj2 = TestObject(2) + obj3 = TestObject(3) + + wl.append(obj1) + wl.append(obj2) + wl.append(obj3) + + assert wl[0] is obj1 + assert wl[1] is obj2 + assert wl[2] is obj3 + + # Test index out of range + with pytest.raises(IndexError): + _ = wl[3] + + +def test_weaklist_clear(): + """Test clearing the list.""" + wl = WeakList() + + obj1 = TestObject(1) + obj2 = TestObject(2) + + wl.append(obj1) + wl.append(obj2) + + assert len(wl) == 2 + + wl.clear() + assert len(wl) == 0 + assert obj1 not in wl + + +def test_weaklist_iteration_during_modification(): + """Test that iteration works even if objects are deleted during iteration.""" + wl = WeakList() + + objects = [TestObject(i) for i in range(5)] + for obj in objects: + wl.append(obj) + + # Verify initial state + assert len(wl) == 5 + + # Iterate and check that we can safely delete objects + seen_values = [] + for obj in wl: + seen_values.append(obj.value) + if obj.value == 2: + # Delete another object (not the current one) + del objects[3] # Delete TestObject(3) + gc.collect() + + # The object with value 3 gets garbage collected during iteration + # so we might not see it (depends on timing) + assert len(seen_values) in [4, 5] + assert all(v in [0, 1, 2, 3, 4] for v in seen_values) + + # After iteration, the list should have 4 objects (one was deleted) + assert len(wl) == 4 diff --git a/dimos/types/weaklist.py b/dimos/types/weaklist.py new file mode 100644 index 0000000000..8722455c66 --- /dev/null +++ b/dimos/types/weaklist.py @@ -0,0 +1,85 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Weak reference list implementation that automatically removes dead references.""" + +import weakref +from typing import Any, Iterator, Optional + + +class WeakList: + """A list that holds weak references to objects. + + Objects are automatically removed when garbage collected. + Supports iteration, append, remove, and length operations. + """ + + def __init__(self): + self._refs = [] + + def append(self, obj: Any) -> None: + """Add an object to the list (stored as weak reference).""" + + def _cleanup(ref): + try: + self._refs.remove(ref) + except ValueError: + pass + + self._refs.append(weakref.ref(obj, _cleanup)) + + def remove(self, obj: Any) -> None: + """Remove an object from the list.""" + for i, ref in enumerate(self._refs): + if ref() is obj: + del self._refs[i] + return + raise ValueError(f"{obj} not in WeakList") + + def discard(self, obj: Any) -> None: + """Remove an object from the list if present, otherwise do nothing.""" + try: + self.remove(obj) + except ValueError: + pass + + def __iter__(self) -> Iterator[Any]: + """Iterate over live objects, skipping dead references.""" + # Create a copy to avoid modification during iteration + for ref in self._refs[:]: + obj = ref() + if obj is not None: + yield obj + + def __len__(self) -> int: + """Return count of live objects.""" + return sum(1 for _ in self) + + def __contains__(self, obj: Any) -> bool: + """Check if object is in the list.""" + return any(ref() is obj for ref in self._refs) + + def clear(self) -> None: + """Remove all references.""" + self._refs.clear() + + def __getitem__(self, index: int) -> Any: + """Get object at index (only counting live objects).""" + for i, obj in enumerate(self): + if i == index: + return obj + raise IndexError("WeakList index out of range") + + def __repr__(self) -> str: + return f"WeakList({list(self)})" diff --git a/dimos/utils/data.py b/dimos/utils/data.py index 62ef6da851..0a2656ca82 100644 --- a/dimos/utils/data.py +++ b/dimos/utils/data.py @@ -12,18 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import os -import pickle import subprocess import tarfile from functools import cache from pathlib import Path -from typing import Any, Callable, Generic, Iterator, Optional, Type, TypeVar, Union - -from reactivex import from_iterable, interval -from reactivex import operators as ops -from reactivex.observable import Observable +from typing import Optional, Union @cache @@ -38,7 +31,9 @@ def _get_repo_root() -> Path: @cache -def _get_data_dir() -> Path: +def _get_data_dir(extra_path: Optional[str] = None) -> Path: + if extra_path: + return _get_repo_root() / "data" / extra_path return _get_repo_root() / "data" diff --git a/dimos/utils/reactive.py b/dimos/utils/reactive.py index abc8512dfb..8ab8fe66ae 100644 --- a/dimos/utils/reactive.py +++ b/dimos/utils/reactive.py @@ -13,13 +13,13 @@ # limitations under the License. 
import threading -from typing import Optional, TypeVar, Generic, Any, Callable +from typing import Any, Callable, Generic, Optional, TypeVar import reactivex as rx from reactivex import operators as ops -from reactivex.scheduler import ThreadPoolScheduler from reactivex.disposable import Disposable from reactivex.observable import Observable +from reactivex.scheduler import ThreadPoolScheduler from rxpy_backpressure import BackPressure from dimos.utils.threadpool import get_scheduler @@ -197,3 +197,33 @@ def spyfun(x): return x return ops.map(spyfun) + + +def quality_barrier(quality_func: Callable[[T], float], target_frequency: float): + """ + RxPY pipe operator that selects the highest quality item within each time window. + + Args: + quality_func: Function to compute quality score for each item + target_frequency: Output frequency in Hz (e.g., 1.0 for 1 item per second) + + Returns: + A pipe operator that can be used with .pipe() + """ + window_duration = 1.0 / target_frequency # Duration of each window in seconds + + def _quality_barrier(source: Observable[T]) -> Observable[T]: + return source.pipe( + # Create time-based windows + ops.window_with_time(window_duration), + # For each window, find the highest quality item + ops.flat_map( + lambda window: window.pipe( + ops.to_list(), + ops.map(lambda items: max(items, key=quality_func) if items else None), + ops.filter(lambda x: x is not None), + ) + ), + ) + + return _quality_barrier diff --git a/pyproject.toml b/pyproject.toml index 7979293411..670f98a0b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -111,6 +111,7 @@ dependencies = [ [project.scripts] lcmspy = "dimos.utils.cli.lcmspy.run_lcmspy:main" +lcm-recorder = "dimos.utils.cli.recorder.run_recorder:main" foxglove-bridge = "dimos.utils.cli.foxglove_bridge.run_foxglove_bridge:main" skillspy = "dimos.utils.cli.skillspy.skillspy:main" agentspy = "dimos.utils.cli.agentspy.agentspy:main"
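# For the quality_barrier operator added to dimos/utils/reactive.py above, a brief usage sketch;
# the source stream and the scoring function are illustrative, only quality_barrier comes from this patch:
#
#   import time
#   import reactivex as rx
#   from reactivex import operators as ops
#   from dimos.utils.reactive import quality_barrier
#
#   # Ten items at 4 Hz; the "quality" of an item is simply its numeric value.
#   source = rx.interval(0.25).pipe(ops.take(10))
#
#   # Downsample to ~1 Hz, keeping the highest-scoring item from each 1-second window.
#   source.pipe(quality_barrier(lambda x: float(x), target_frequency=1.0)).subscribe(
#       lambda best: print(f"best of window: {best}")
#   )
#
#   time.sleep(4)  # interval() emits on a background scheduler, so keep the process alive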