diff --git a/data/.lfs/models_edgetam.tar.gz b/data/.lfs/models_edgetam.tar.gz new file mode 100644 index 0000000000..64baa5d139 --- /dev/null +++ b/data/.lfs/models_edgetam.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd452096f91415ce7ca90548a06a87354ccdb19a66925c0242413c80b08f5c57 +size 51988780 diff --git a/data/.lfs/models_yoloe.tar.gz b/data/.lfs/models_yoloe.tar.gz new file mode 100644 index 0000000000..a0870d71d2 --- /dev/null +++ b/data/.lfs/models_yoloe.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a78e39477667b25c9454f846cd66dc044dd05981b2f7ebb0d331ef3626de9bc +size 184892540 diff --git a/dimos/hardware/sensors/camera/realsense/README.md b/dimos/hardware/sensors/camera/realsense/README.md new file mode 100644 index 0000000000..665833047e --- /dev/null +++ b/dimos/hardware/sensors/camera/realsense/README.md @@ -0,0 +1,9 @@ +# RealSense SDK Install + +1) Install the Intel RealSense SDK: + - https://github.com/IntelRealSense/librealsense + +2) Install the Python bindings: + ```bash + pip install pyrealsense2 + ``` diff --git a/dimos/manipulation/manip_aio_pipeline.py b/dimos/manipulation/manip_aio_pipeline.py deleted file mode 100644 index fe3598ab1e..0000000000 --- a/dimos/manipulation/manip_aio_pipeline.py +++ /dev/null @@ -1,592 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Asynchronous, reactive manipulation pipeline for realtime detection, filtering, and grasp generation. -""" - -import asyncio -import json -import threading -import time - -import cv2 -import numpy as np -import reactivex as rx -import reactivex.operators as ops -import websockets - -from dimos.perception.common.utils import colorize_depth -from dimos.perception.detection2d.detic_2d_det import ( # type: ignore[import-not-found, import-untyped] - Detic2DDetector, -) -from dimos.perception.grasp_generation.utils import draw_grasps_on_image -from dimos.perception.object_detection_stream import ObjectDetectionStream -from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering -from dimos.perception.pointcloud.utils import create_point_cloud_overlay_visualization -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class ManipulationPipeline: - """ - Clean separated stream pipeline with frame buffering. - - - Object detection runs independently on RGB stream - - Point cloud processing subscribes to both detection and ZED streams separately - - Simple frame buffering to match RGB+depth+objects - """ - - def __init__( - self, - camera_intrinsics: list[float], # [fx, fy, cx, cy] - min_confidence: float = 0.6, - max_objects: int = 10, - vocabulary: str | None = None, - grasp_server_url: str | None = None, - enable_grasp_generation: bool = False, - ) -> None: - """ - Initialize the manipulation pipeline. 
- - Args: - camera_intrinsics: [fx, fy, cx, cy] camera parameters - min_confidence: Minimum detection confidence threshold - max_objects: Maximum number of objects to process - vocabulary: Optional vocabulary for Detic detector - grasp_server_url: Optional WebSocket URL for Dimensional Grasp server - enable_grasp_generation: Whether to enable async grasp generation - """ - self.camera_intrinsics = camera_intrinsics - self.min_confidence = min_confidence - - # Grasp generation settings - self.grasp_server_url = grasp_server_url - self.enable_grasp_generation = enable_grasp_generation - - # Asyncio event loop for WebSocket communication - self.grasp_loop = None - self.grasp_loop_thread = None - - # Storage for grasp results and filtered objects - self.latest_grasps: list[dict] = [] # type: ignore[type-arg] # Simplified: just a list of grasps - self.grasps_consumed = False - self.latest_filtered_objects = [] # type: ignore[var-annotated] - self.latest_rgb_for_grasps = None # Store RGB image for grasp overlay - self.grasp_lock = threading.Lock() - - # Track pending requests - simplified to single task - self.grasp_task: asyncio.Task | None = None # type: ignore[type-arg] - - # Reactive subjects for streaming filtered objects and grasps - self.filtered_objects_subject = rx.subject.Subject() # type: ignore[var-annotated] - self.grasps_subject = rx.subject.Subject() # type: ignore[var-annotated] - self.grasp_overlay_subject = rx.subject.Subject() # type: ignore[var-annotated] # Add grasp overlay subject - - # Initialize grasp client if enabled - if self.enable_grasp_generation and self.grasp_server_url: - self._start_grasp_loop() - - # Initialize object detector - self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) - - # Initialize point cloud processor - self.pointcloud_filter = PointcloudFiltering( - color_intrinsics=camera_intrinsics, - depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics - max_num_objects=max_objects, - ) - - 
logger.info(f"Initialized ManipulationPipeline with confidence={min_confidence}") - - def create_streams(self, zed_stream: rx.Observable) -> dict[str, rx.Observable]: # type: ignore[type-arg] - """ - Create streams using exact old main logic. - """ - # Create ZED streams (from old main) - zed_frame_stream = zed_stream.pipe(ops.share()) - - # RGB stream for object detection (from old main) - video_stream = zed_frame_stream.pipe( - ops.map(lambda x: x.get("rgb") if x is not None else None), # type: ignore[attr-defined] - ops.filter(lambda x: x is not None), - ops.share(), - ) - object_detector = ObjectDetectionStream( - camera_intrinsics=self.camera_intrinsics, - min_confidence=self.min_confidence, - class_filter=None, - detector=self.detector, - video_stream=video_stream, - disable_depth=True, - ) - - # Store latest frames for point cloud processing (from old main) - latest_rgb = None - latest_depth = None - latest_point_cloud_overlay = None - frame_lock = threading.Lock() - - # Subscribe to combined ZED frames (from old main) - def on_zed_frame(zed_data) -> None: # type: ignore[no-untyped-def] - nonlocal latest_rgb, latest_depth - if zed_data is not None: - with frame_lock: - latest_rgb = zed_data.get("rgb") - latest_depth = zed_data.get("depth") - - # Depth stream for point cloud filtering (from old main) - def get_depth_or_overlay(zed_data): # type: ignore[no-untyped-def] - if zed_data is None: - return None - - # Check if we have a point cloud overlay available - with frame_lock: - overlay = latest_point_cloud_overlay - - if overlay is not None: - return overlay - else: - # Return regular colorized depth - return colorize_depth(zed_data.get("depth"), max_depth=10.0) - - depth_stream = zed_frame_stream.pipe( - ops.map(get_depth_or_overlay), ops.filter(lambda x: x is not None), ops.share() - ) - - # Process object detection results with point cloud filtering (from old main) - def on_detection_next(result) -> None: # type: ignore[no-untyped-def] - nonlocal 
latest_point_cloud_overlay - if result.get("objects"): - # Get latest RGB and depth frames - with frame_lock: - rgb = latest_rgb - depth = latest_depth - - if rgb is not None and depth is not None: - try: - filtered_objects = self.pointcloud_filter.process_images( - rgb, depth, result["objects"] - ) - - if filtered_objects: - # Store filtered objects - with self.grasp_lock: - self.latest_filtered_objects = filtered_objects - self.filtered_objects_subject.on_next(filtered_objects) - - # Create base image (colorized depth) - base_image = colorize_depth(depth, max_depth=10.0) - - # Create point cloud overlay visualization - overlay_viz = create_point_cloud_overlay_visualization( - base_image=base_image, # type: ignore[arg-type] - objects=filtered_objects, # type: ignore[arg-type] - intrinsics=self.camera_intrinsics, # type: ignore[arg-type] - ) - - # Store the overlay for the stream - with frame_lock: - latest_point_cloud_overlay = overlay_viz - - # Request grasps if enabled - if self.enable_grasp_generation and len(filtered_objects) > 0: - # Save RGB image for later grasp overlay - with frame_lock: - self.latest_rgb_for_grasps = rgb.copy() - - task = self.request_scene_grasps(filtered_objects) # type: ignore[arg-type] - if task: - # Check for results after a delay - def check_grasps_later() -> None: - time.sleep(2.0) # Wait for grasp processing - # Wait for task to complete - if hasattr(self, "grasp_task") and self.grasp_task: - try: - self.grasp_task.result( # type: ignore[call-arg] - timeout=3.0 - ) # Get result with timeout - except Exception as e: - logger.warning(f"Grasp task failed or timeout: {e}") - - # Try to get latest grasps and create overlay - with self.grasp_lock: - grasps = self.latest_grasps - - if grasps and hasattr(self, "latest_rgb_for_grasps"): - # Create grasp overlay on the saved RGB image - try: - bgr_image = cv2.cvtColor( # type: ignore[call-overload] - self.latest_rgb_for_grasps, cv2.COLOR_RGB2BGR - ) - result_bgr = draw_grasps_on_image( - 
bgr_image, - grasps, - self.camera_intrinsics, - max_grasps=-1, # Show all grasps - ) - result_rgb = cv2.cvtColor( - result_bgr, cv2.COLOR_BGR2RGB - ) - - # Emit grasp overlay immediately - self.grasp_overlay_subject.on_next(result_rgb) - - except Exception as e: - logger.error(f"Error creating grasp overlay: {e}") - - # Emit grasps to stream - self.grasps_subject.on_next(grasps) - - threading.Thread(target=check_grasps_later, daemon=True).start() - else: - logger.warning("Failed to create grasp task") - except Exception as e: - logger.error(f"Error in point cloud filtering: {e}") - with frame_lock: - latest_point_cloud_overlay = None - - def on_error(error) -> None: # type: ignore[no-untyped-def] - logger.error(f"Error in stream: {error}") - - def on_completed() -> None: - logger.info("Stream completed") - - def start_subscriptions() -> None: - """Start subscriptions in background thread (from old main)""" - # Subscribe to combined ZED frames - zed_frame_stream.subscribe(on_next=on_zed_frame) - - # Start subscriptions in background thread (from old main) - subscription_thread = threading.Thread(target=start_subscriptions, daemon=True) - subscription_thread.start() - time.sleep(2) # Give subscriptions time to start - - # Subscribe to object detection stream (from old main) - object_detector.get_stream().subscribe( # type: ignore[no-untyped-call] - on_next=on_detection_next, on_error=on_error, on_completed=on_completed - ) - - # Create visualization stream for web interface (from old main) - viz_stream = object_detector.get_stream().pipe( # type: ignore[no-untyped-call] - ops.map(lambda x: x["viz_frame"] if x is not None else None), # type: ignore[index] - ops.filter(lambda x: x is not None), - ) - - # Create filtered objects stream - filtered_objects_stream = self.filtered_objects_subject - - # Create grasps stream - grasps_stream = self.grasps_subject - - # Create grasp overlay subject for immediate emission - grasp_overlay_stream = self.grasp_overlay_subject - - 
return { - "detection_viz": viz_stream, - "pointcloud_viz": depth_stream, - "objects": object_detector.get_stream().pipe(ops.map(lambda x: x.get("objects", []))), # type: ignore[attr-defined, no-untyped-call] - "filtered_objects": filtered_objects_stream, - "grasps": grasps_stream, - "grasp_overlay": grasp_overlay_stream, - } - - def _start_grasp_loop(self) -> None: - """Start asyncio event loop in a background thread for WebSocket communication.""" - - def run_loop() -> None: - self.grasp_loop = asyncio.new_event_loop() # type: ignore[assignment] - asyncio.set_event_loop(self.grasp_loop) - self.grasp_loop.run_forever() # type: ignore[attr-defined] - - self.grasp_loop_thread = threading.Thread(target=run_loop, daemon=True) # type: ignore[assignment] - self.grasp_loop_thread.start() # type: ignore[attr-defined] - - # Wait for loop to start - while self.grasp_loop is None: - time.sleep(0.01) - - async def _send_grasp_request( - self, - points: np.ndarray, # type: ignore[type-arg] - colors: np.ndarray | None, # type: ignore[type-arg] - ) -> list[dict] | None: # type: ignore[type-arg] - """Send grasp request to Dimensional Grasp server.""" - try: - # Comprehensive client-side validation to prevent server errors - - # Validate points array - if points is None: - logger.error("Points array is None") - return None - if not isinstance(points, np.ndarray): - logger.error(f"Points is not numpy array: {type(points)}") - return None - if points.size == 0: - logger.error("Points array is empty") - return None - if len(points.shape) != 2 or points.shape[1] != 3: - logger.error(f"Points has invalid shape {points.shape}, expected (N, 3)") - return None - if points.shape[0] < 100: # Minimum points for stable grasp detection - logger.error(f"Insufficient points for grasp detection: {points.shape[0]} < 100") - return None - - # Validate and prepare colors - if colors is not None: - if not isinstance(colors, np.ndarray): - colors = None - elif colors.size == 0: - colors = None - elif 
len(colors.shape) != 2 or colors.shape[1] != 3: - colors = None - elif colors.shape[0] != points.shape[0]: - colors = None - - # If no valid colors, create default colors (required by server) - if colors is None: - # Create default white colors for all points - colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 - - # Ensure data types are correct (server expects float32) - points = points.astype(np.float32) - colors = colors.astype(np.float32) - - # Validate ranges (basic sanity checks) - if np.any(np.isnan(points)) or np.any(np.isinf(points)): - logger.error("Points contain NaN or Inf values") - return None - if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): - logger.error("Colors contain NaN or Inf values") - return None - - # Clamp color values to valid range [0, 1] - colors = np.clip(colors, 0.0, 1.0) - - async with websockets.connect(self.grasp_server_url) as websocket: # type: ignore[arg-type] - request = { - "points": points.tolist(), - "colors": colors.tolist(), # Always send colors array - "lims": [-0.19, 0.12, 0.02, 0.15, 0.0, 1.0], # Default workspace limits - } - - await websocket.send(json.dumps(request)) - - response = await websocket.recv() - grasps = json.loads(response) - - # Handle server response validation - if isinstance(grasps, dict) and "error" in grasps: - logger.error(f"Server returned error: {grasps['error']}") - return None - elif isinstance(grasps, int | float) and grasps == 0: - return None - elif not isinstance(grasps, list): - logger.error( - f"Server returned unexpected response type: {type(grasps)}, value: {grasps}" - ) - return None - elif len(grasps) == 0: - return None - - converted_grasps = self._convert_grasp_format(grasps) - with self.grasp_lock: - self.latest_grasps = converted_grasps - self.grasps_consumed = False # Reset consumed flag - - # Emit to reactive stream - self.grasps_subject.on_next(self.latest_grasps) - - return converted_grasps - except websockets.exceptions.ConnectionClosed as e: - 
logger.error(f"WebSocket connection closed: {e}") - except websockets.exceptions.WebSocketException as e: - logger.error(f"WebSocket error: {e}") - except json.JSONDecodeError as e: - logger.error(f"Failed to parse server response as JSON: {e}") - except Exception as e: - logger.error(f"Error requesting grasps: {e}") - - return None - - def request_scene_grasps(self, objects: list[dict]) -> asyncio.Task | None: # type: ignore[type-arg] - """Request grasps for entire scene by combining all object point clouds.""" - if not self.grasp_loop or not objects: - return None - - all_points = [] - all_colors = [] - valid_objects = 0 - - for _i, obj in enumerate(objects): - # Validate point cloud data - if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: - continue - - points = obj["point_cloud_numpy"] - if not isinstance(points, np.ndarray) or points.size == 0: - continue - - # Ensure points have correct shape (N, 3) - if len(points.shape) != 2 or points.shape[1] != 3: - continue - - # Validate colors if present - colors = None - if "colors_numpy" in obj and obj["colors_numpy"] is not None: - colors = obj["colors_numpy"] - if isinstance(colors, np.ndarray) and colors.size > 0: - # Ensure colors match points count and have correct shape - if colors.shape[0] != points.shape[0]: - colors = None # Ignore colors for this object - elif len(colors.shape) != 2 or colors.shape[1] != 3: - colors = None # Ignore colors for this object - - all_points.append(points) - if colors is not None: - all_colors.append(colors) - valid_objects += 1 - - if not all_points: - return None - - try: - combined_points = np.vstack(all_points) - - # Only combine colors if ALL objects have valid colors - combined_colors = None - if len(all_colors) == valid_objects and len(all_colors) > 0: - combined_colors = np.vstack(all_colors) - - # Validate final combined data - if combined_points.size == 0: - logger.warning("Combined point cloud is empty") - return None - - if combined_colors is not 
None and combined_colors.shape[0] != combined_points.shape[0]: - logger.warning( - f"Color/point count mismatch: {combined_colors.shape[0]} colors vs {combined_points.shape[0]} points, dropping colors" - ) - combined_colors = None - - except Exception as e: - logger.error(f"Failed to combine point clouds: {e}") - return None - - try: - # Check if there's already a grasp task running - if hasattr(self, "grasp_task") and self.grasp_task and not self.grasp_task.done(): - return self.grasp_task - - task = asyncio.run_coroutine_threadsafe( - self._send_grasp_request(combined_points, combined_colors), self.grasp_loop - ) - - self.grasp_task = task - return task - except Exception: - logger.warning("Failed to create grasp task") - return None - - def get_latest_grasps(self, timeout: float = 5.0) -> list[dict] | None: # type: ignore[type-arg] - """Get latest grasp results, waiting for new ones if current ones have been consumed.""" - # Mark current grasps as consumed and get a reference - with self.grasp_lock: - current_grasps = self.latest_grasps - self.grasps_consumed = True - - # If we already have grasps and they haven't been consumed, return them - if current_grasps is not None and not getattr(self, "grasps_consumed", False): - return current_grasps - - # Wait for new grasps - start_time = time.time() - while time.time() - start_time < timeout: - with self.grasp_lock: - # Check if we have new grasps (different from what we marked as consumed) - if self.latest_grasps is not None and not getattr(self, "grasps_consumed", False): - return self.latest_grasps - time.sleep(0.1) # Check every 100ms - - return None # Timeout reached - - def clear_grasps(self) -> None: - """Clear all stored grasp results.""" - with self.grasp_lock: - self.latest_grasps = [] - - def _prepare_colors(self, colors: np.ndarray | None) -> np.ndarray | None: # type: ignore[type-arg] - """Prepare colors array, converting from various formats if needed.""" - if colors is None: - return None - - if 
colors.max() > 1.0: - colors = colors / 255.0 - - return colors - - def _convert_grasp_format(self, grasps: list[dict]) -> list[dict]: # type: ignore[type-arg] - """Convert Grasp format to our visualization format.""" - converted = [] - - for i, grasp in enumerate(grasps): - rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) - euler_angles = self._rotation_matrix_to_euler(rotation_matrix) - - converted_grasp = { - "id": f"grasp_{i}", - "score": grasp.get("score", 0.0), - "width": grasp.get("width", 0.0), - "height": grasp.get("height", 0.0), - "depth": grasp.get("depth", 0.0), - "translation": grasp.get("translation", [0, 0, 0]), - "rotation_matrix": rotation_matrix.tolist(), - "euler_angles": euler_angles, - } - converted.append(converted_grasp) - - converted.sort(key=lambda x: x["score"], reverse=True) - - return converted - - def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> dict[str, float]: # type: ignore[type-arg] - """Convert rotation matrix to Euler angles (in radians).""" - sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) - - singular = sy < 1e-6 - - if not singular: - x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) - else: - x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = 0 - - return {"roll": x, "pitch": y, "yaw": z} - - def cleanup(self) -> None: - """Clean up resources.""" - if hasattr(self.detector, "cleanup"): - self.detector.cleanup() - - if self.grasp_loop and self.grasp_loop_thread: - self.grasp_loop.call_soon_threadsafe(self.grasp_loop.stop) - self.grasp_loop_thread.join(timeout=1.0) - - if hasattr(self.pointcloud_filter, "cleanup"): - self.pointcloud_filter.cleanup() - logger.info("ManipulationPipeline cleaned up") diff --git a/dimos/manipulation/manip_aio_processer.py 
b/dimos/manipulation/manip_aio_processer.py deleted file mode 100644 index 71ed42bff3..0000000000 --- a/dimos/manipulation/manip_aio_processer.py +++ /dev/null @@ -1,422 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Sequential manipulation processor for single-frame processing without reactive streams. -""" - -import time -from typing import Any - -import cv2 -import numpy as np - -from dimos.perception.common.utils import ( - colorize_depth, - combine_object_data, - detection_results_to_object_data, -) -from dimos.perception.detection2d.detic_2d_det import ( # type: ignore[import-not-found, import-untyped] - Detic2DDetector, -) -from dimos.perception.grasp_generation.grasp_generation import HostedGraspGenerator -from dimos.perception.grasp_generation.utils import create_grasp_overlay -from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering -from dimos.perception.pointcloud.utils import ( - create_point_cloud_overlay_visualization, - extract_and_cluster_misc_points, - overlay_point_clouds_on_image, -) -from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class ManipulationProcessor: - """ - Sequential manipulation processor for single-frame processing. 
- - Processes RGB-D frames through object detection, point cloud filtering, - and grasp generation in a single thread without reactive streams. - """ - - def __init__( - self, - camera_intrinsics: list[float], # [fx, fy, cx, cy] - min_confidence: float = 0.6, - max_objects: int = 20, - vocabulary: str | None = None, - enable_grasp_generation: bool = False, - grasp_server_url: str | None = None, # Required when enable_grasp_generation=True - enable_segmentation: bool = True, - ) -> None: - """ - Initialize the manipulation processor. - - Args: - camera_intrinsics: [fx, fy, cx, cy] camera parameters - min_confidence: Minimum detection confidence threshold - max_objects: Maximum number of objects to process - vocabulary: Optional vocabulary for Detic detector - enable_grasp_generation: Whether to enable grasp generation - grasp_server_url: WebSocket URL for Dimensional Grasp server (required when enable_grasp_generation=True) - enable_segmentation: Whether to enable semantic segmentation - segmentation_model: Segmentation model to use (SAM 2 or FastSAM) - """ - self.camera_intrinsics = camera_intrinsics - self.min_confidence = min_confidence - self.max_objects = max_objects - self.enable_grasp_generation = enable_grasp_generation - self.grasp_server_url = grasp_server_url - self.enable_segmentation = enable_segmentation - - # Validate grasp generation requirements - if enable_grasp_generation and not grasp_server_url: - raise ValueError("grasp_server_url is required when enable_grasp_generation=True") - - # Initialize object detector - self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) - - # Initialize point cloud processor - self.pointcloud_filter = PointcloudFiltering( - color_intrinsics=camera_intrinsics, - depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics - max_num_objects=max_objects, - ) - - # Initialize semantic segmentation - self.segmenter = None - if self.enable_segmentation: - self.segmenter = Sam2DSegmenter( - 
use_tracker=False, # Disable tracker for simple segmentation - use_analyzer=False, # Disable analyzer for simple segmentation - ) - - # Initialize grasp generator if enabled - self.grasp_generator = None - if self.enable_grasp_generation: - try: - self.grasp_generator = HostedGraspGenerator(server_url=grasp_server_url) # type: ignore[arg-type] - logger.info("Hosted grasp generator initialized successfully") - except Exception as e: - logger.error(f"Failed to initialize hosted grasp generator: {e}") - self.grasp_generator = None - self.enable_grasp_generation = False - - logger.info( - f"Initialized ManipulationProcessor with confidence={min_confidence}, " - f"grasp_generation={enable_grasp_generation}" - ) - - def process_frame( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - generate_grasps: bool | None = None, - ) -> dict[str, Any]: - """ - Process a single RGB-D frame through the complete pipeline. - - Args: - rgb_image: RGB image (H, W, 3) - depth_image: Depth image (H, W) in meters - generate_grasps: Override grasp generation setting for this frame - - Returns: - Dictionary containing: - - detection_viz: Visualization of object detection - - pointcloud_viz: Visualization of point cloud overlay - - segmentation_viz: Visualization of semantic segmentation (if enabled) - - detection2d_objects: Raw detection results as ObjectData - - segmentation2d_objects: Raw segmentation results as ObjectData (if enabled) - - detected_objects: Detection (Object Detection) objects with point clouds filtered - - all_objects: Combined objects with intelligent duplicate removal - - full_pointcloud: Complete scene point cloud (if point cloud processing enabled) - - misc_clusters: List of clustered background/miscellaneous point clouds (DBSCAN) - - misc_voxel_grid: Open3D voxel grid approximating all misc/background points - - misc_pointcloud_viz: Visualization of misc/background cluster overlay - - grasps: Grasp results 
(list of dictionaries, if enabled) - - grasp_overlay: Grasp visualization overlay (if enabled) - - processing_time: Total processing time - """ - start_time = time.time() - results = {} - - try: - # Step 1: Object Detection - step_start = time.time() - detection_results = self.run_object_detection(rgb_image) - results["detection2d_objects"] = detection_results.get("objects", []) - results["detection_viz"] = detection_results.get("viz_frame") - detection_time = time.time() - step_start - - # Step 2: Semantic Segmentation (if enabled) - segmentation_time = 0 - if self.enable_segmentation: - step_start = time.time() - segmentation_results = self.run_segmentation(rgb_image) - results["segmentation2d_objects"] = segmentation_results.get("objects", []) - results["segmentation_viz"] = segmentation_results.get("viz_frame") - segmentation_time = time.time() - step_start # type: ignore[assignment] - - # Step 3: Point Cloud Processing - pointcloud_time = 0 - detection2d_objects = results.get("detection2d_objects", []) - segmentation2d_objects = results.get("segmentation2d_objects", []) - - # Process detection objects if available - detected_objects = [] - if detection2d_objects: - step_start = time.time() - detected_objects = self.run_pointcloud_filtering( - rgb_image, depth_image, detection2d_objects - ) - pointcloud_time += time.time() - step_start # type: ignore[assignment] - - # Process segmentation objects if available - segmentation_filtered_objects = [] - if segmentation2d_objects: - step_start = time.time() - segmentation_filtered_objects = self.run_pointcloud_filtering( - rgb_image, depth_image, segmentation2d_objects - ) - pointcloud_time += time.time() - step_start # type: ignore[assignment] - - # Combine all objects using intelligent duplicate removal - all_objects = combine_object_data( - detected_objects, # type: ignore[arg-type] - segmentation_filtered_objects, # type: ignore[arg-type] - overlap_threshold=0.8, - ) - - # Get full point cloud - full_pcd = 
self.pointcloud_filter.get_full_point_cloud() - - # Extract misc/background points and create voxel grid - misc_start = time.time() - misc_clusters, misc_voxel_grid = extract_and_cluster_misc_points( - full_pcd, - all_objects, # type: ignore[arg-type] - eps=0.03, - min_points=100, - enable_filtering=True, - voxel_size=0.02, - ) - misc_time = time.time() - misc_start - - # Store results - results.update( - { - "detected_objects": detected_objects, - "all_objects": all_objects, - "full_pointcloud": full_pcd, - "misc_clusters": misc_clusters, - "misc_voxel_grid": misc_voxel_grid, - } - ) - - # Create point cloud visualizations - base_image = colorize_depth(depth_image, max_depth=10.0) - - # Create visualizations - results["pointcloud_viz"] = ( - create_point_cloud_overlay_visualization( - base_image=base_image, # type: ignore[arg-type] - objects=all_objects, # type: ignore[arg-type] - intrinsics=self.camera_intrinsics, # type: ignore[arg-type] - ) - if all_objects - else base_image - ) - - results["detected_pointcloud_viz"] = ( - create_point_cloud_overlay_visualization( - base_image=base_image, # type: ignore[arg-type] - objects=detected_objects, - intrinsics=self.camera_intrinsics, # type: ignore[arg-type] - ) - if detected_objects - else base_image - ) - - if misc_clusters: - # Generate consistent colors for clusters - cluster_colors = [ - tuple((np.random.RandomState(i + 100).rand(3) * 255).astype(int)) - for i in range(len(misc_clusters)) - ] - results["misc_pointcloud_viz"] = overlay_point_clouds_on_image( - base_image=base_image, # type: ignore[arg-type] - point_clouds=misc_clusters, - camera_intrinsics=self.camera_intrinsics, - colors=cluster_colors, - point_size=2, - alpha=0.6, - ) - else: - results["misc_pointcloud_viz"] = base_image - - # Step 4: Grasp Generation (if enabled) - should_generate_grasps = ( - generate_grasps if generate_grasps is not None else self.enable_grasp_generation - ) - - if should_generate_grasps and all_objects and full_pcd: - grasps 
= self.run_grasp_generation(all_objects, full_pcd) # type: ignore[arg-type] - results["grasps"] = grasps - if grasps: - results["grasp_overlay"] = create_grasp_overlay( - rgb_image, grasps, self.camera_intrinsics - ) - - except Exception as e: - logger.error(f"Error processing frame: {e}") - results["error"] = str(e) - - # Add timing information - total_time = time.time() - start_time - results.update( - { - "processing_time": total_time, - "timing_breakdown": { - "detection": detection_time if "detection_time" in locals() else 0, - "segmentation": segmentation_time if "segmentation_time" in locals() else 0, - "pointcloud": pointcloud_time if "pointcloud_time" in locals() else 0, - "misc_extraction": misc_time if "misc_time" in locals() else 0, - "total": total_time, - }, - } - ) - - return results - - def run_object_detection(self, rgb_image: np.ndarray) -> dict[str, Any]: # type: ignore[type-arg] - """Run object detection on RGB image.""" - try: - # Convert RGB to BGR for Detic detector - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Use process_image method from Detic detector - bboxes, track_ids, class_ids, confidences, names, masks = self.detector.process_image( - bgr_image - ) - - # Convert to ObjectData format using utility function - objects = detection_results_to_object_data( - bboxes=bboxes, - track_ids=track_ids, - class_ids=class_ids, - confidences=confidences, - names=names, - masks=masks, - source="detection", - ) - - # Create visualization using detector's built-in method - viz_frame = self.detector.visualize_results( - rgb_image, bboxes, track_ids, class_ids, confidences, names - ) - - return {"objects": objects, "viz_frame": viz_frame} - - except Exception as e: - logger.error(f"Object detection failed: {e}") - return {"objects": [], "viz_frame": rgb_image.copy()} - - def run_pointcloud_filtering( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - objects: list[dict], # 
type: ignore[type-arg] - ) -> list[dict]: # type: ignore[type-arg] - """Run point cloud filtering on detected objects.""" - try: - filtered_objects = self.pointcloud_filter.process_images( - rgb_image, - depth_image, - objects, # type: ignore[arg-type] - ) - return filtered_objects if filtered_objects else [] # type: ignore[return-value] - except Exception as e: - logger.error(f"Point cloud filtering failed: {e}") - return [] - - def run_segmentation(self, rgb_image: np.ndarray) -> dict[str, Any]: # type: ignore[type-arg] - """Run semantic segmentation on RGB image.""" - if not self.segmenter: - return {"objects": [], "viz_frame": rgb_image.copy()} - - try: - # Convert RGB to BGR for segmenter - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Get segmentation results - masks, bboxes, track_ids, probs, names = self.segmenter.process_image(bgr_image) # type: ignore[no-untyped-call] - - # Convert to ObjectData format using utility function - objects = detection_results_to_object_data( - bboxes=bboxes, - track_ids=track_ids, - class_ids=list(range(len(bboxes))), # Use indices as class IDs for segmentation - confidences=probs, - names=names, - masks=masks, - source="segmentation", - ) - - # Create visualization - if masks: - viz_bgr = self.segmenter.visualize_results( - bgr_image, masks, bboxes, track_ids, probs, names - ) - # Convert back to RGB - viz_frame = cv2.cvtColor(viz_bgr, cv2.COLOR_BGR2RGB) - else: - viz_frame = rgb_image.copy() - - return {"objects": objects, "viz_frame": viz_frame} - - except Exception as e: - logger.error(f"Segmentation failed: {e}") - return {"objects": [], "viz_frame": rgb_image.copy()} - - def run_grasp_generation(self, filtered_objects: list[dict], full_pcd) -> list[dict] | None: # type: ignore[no-untyped-def, type-arg] - """Run grasp generation using the configured generator.""" - if not self.grasp_generator: - logger.warning("Grasp generation requested but no generator available") - return None - - try: - # Generate 
grasps using the configured generator - grasps = self.grasp_generator.generate_grasps_from_objects(filtered_objects, full_pcd) # type: ignore[arg-type] - - # Return parsed results directly (list of grasp dictionaries) - return grasps - - except Exception as e: - logger.error(f"Grasp generation failed: {e}") - return None - - def cleanup(self) -> None: - """Clean up resources.""" - if hasattr(self.detector, "cleanup"): - self.detector.cleanup() - if hasattr(self.pointcloud_filter, "cleanup"): - self.pointcloud_filter.cleanup() - if self.segmenter and hasattr(self.segmenter, "cleanup"): - self.segmenter.cleanup() - if self.grasp_generator and hasattr(self.grasp_generator, "cleanup"): - self.grasp_generator.cleanup() - logger.info("ManipulationProcessor cleaned up") diff --git a/dimos/manipulation/manipulation_interface.py b/dimos/manipulation/manipulation_interface.py index edeb99c0f0..10e71fbc66 100644 --- a/dimos/manipulation/manipulation_interface.py +++ b/dimos/manipulation/manipulation_interface.py @@ -26,7 +26,6 @@ from dimos.manipulation.manipulation_history import ( ManipulationHistory, ) -from dimos.perception.object_detection_stream import ObjectDetectionStream from dimos.types.manipulation import ( AbstractConstraint, ManipulationTask, @@ -53,7 +52,7 @@ def __init__( self, output_dir: str, new_memory: bool = False, - perception_stream: ObjectDetectionStream = None, # type: ignore[assignment] + perception_stream: Any = None, ) -> None: """ Initialize a new ManipulationInterface instance. diff --git a/dimos/manipulation/visual_servoing/detection3d.py b/dimos/manipulation/visual_servoing/detection3d.py deleted file mode 100644 index fca085df8c..0000000000 --- a/dimos/manipulation/visual_servoing/detection3d.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Real-time 3D object detection processor that extracts object poses from RGB-D data. -""" - -import cv2 -from dimos_lcm.vision_msgs import ( - BoundingBox2D, - BoundingBox3D, - Detection2D, - Detection3D, - ObjectHypothesis, - ObjectHypothesisWithPose, - Point2D, - Pose2D, -) -import numpy as np - -from dimos.manipulation.visual_servoing.utils import ( - estimate_object_depth, - transform_pose, - visualize_detections_3d, -) -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.msgs.std_msgs import Header -from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray -from dimos.perception.common.utils import bbox2d_to_corners -from dimos.perception.detection2d.utils import calculate_object_size_from_bbox -from dimos.perception.pointcloud.utils import extract_centroids_from_masks -from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class Detection3DProcessor: - """ - Real-time 3D detection processor optimized for speed. - - Uses Sam (FastSAM) for segmentation and mask generation, then extracts - 3D centroids from depth data. - """ - - def __init__( - self, - camera_intrinsics: list[float], # [fx, fy, cx, cy] - min_confidence: float = 0.6, - min_points: int = 30, - max_depth: float = 1.0, - max_object_size: float = 0.15, - ) -> None: - """ - Initialize the real-time 3D detection processor. 
- - Args: - camera_intrinsics: [fx, fy, cx, cy] camera parameters - min_confidence: Minimum detection confidence threshold - min_points: Minimum 3D points required for valid detection - max_depth: Maximum valid depth in meters - """ - self.camera_intrinsics = camera_intrinsics - self.min_points = min_points - self.max_depth = max_depth - self.max_object_size = max_object_size - - # Initialize Sam segmenter with tracking enabled but analysis disabled - self.detector = Sam2DSegmenter( - use_tracker=False, - use_analyzer=False, - use_filtering=True, - ) - - self.min_confidence = min_confidence - - logger.info( - f"Initialized Detection3DProcessor with Sam segmenter, confidence={min_confidence}, " - f"min_points={min_points}, max_depth={max_depth}m, max_object_size={max_object_size}m" - ) - - def process_frame( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - transform: np.ndarray | None = None, # type: ignore[type-arg] - ) -> tuple[Detection3DArray, Detection2DArray]: - """ - Process a single RGB-D frame to extract 3D object detections. 
- - Args: - rgb_image: RGB image (H, W, 3) - depth_image: Depth image (H, W) in meters - transform: Optional 4x4 transformation matrix to transform objects from camera frame to desired frame - - Returns: - Tuple of (Detection3DArray, Detection2DArray) with 3D and 2D information - """ - - # Convert RGB to BGR for Sam (OpenCV format) - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Run Sam segmentation with tracking - masks, bboxes, track_ids, probs, names = self.detector.process_image(bgr_image) # type: ignore[no-untyped-call] - - if not masks or len(masks) == 0: - return Detection3DArray( - detections_length=0, header=Header(), detections=[] - ), Detection2DArray(detections_length=0, header=Header(), detections=[]) - - # Convert CUDA tensors to numpy arrays if needed - numpy_masks = [] - for mask in masks: - if hasattr(mask, "cpu"): # PyTorch tensor - numpy_masks.append(mask.cpu().numpy()) - else: # Already numpy array - numpy_masks.append(mask) - - # Extract 3D centroids from masks - poses = extract_centroids_from_masks( - rgb_image=rgb_image, - depth_image=depth_image, - masks=numpy_masks, - camera_intrinsics=self.camera_intrinsics, - ) - - detections_3d = [] - detections_2d = [] - pose_dict = {p["mask_idx"]: p for p in poses if p["centroid"][2] < self.max_depth} - - for i, (bbox, name, prob, track_id) in enumerate( - zip(bboxes, names, probs, track_ids, strict=False) - ): - if i not in pose_dict: - continue - - pose = pose_dict[i] - obj_cam_pos = pose["centroid"] - - if obj_cam_pos[2] > self.max_depth: - continue - - # Calculate object size from bbox and depth - width_m, height_m = calculate_object_size_from_bbox( - bbox, obj_cam_pos[2], self.camera_intrinsics - ) - - # Calculate depth dimension using segmentation mask - depth_m = estimate_object_depth( - depth_image, numpy_masks[i] if i < len(numpy_masks) else None, bbox - ) - - size_x = max(width_m, 0.01) # Minimum 1cm width - size_y = max(height_m, 0.01) # Minimum 1cm height - size_z = 
max(depth_m, 0.01) # Minimum 1cm depth - - if min(size_x, size_y, size_z) > self.max_object_size: - continue - - # Transform to desired frame if transform matrix is provided - if transform is not None: - # Get orientation as euler angles, default to no rotation if not available - obj_cam_orientation = pose.get( - "rotation", np.array([0.0, 0.0, 0.0]) - ) # Default to no rotation - transformed_pose = transform_pose( - obj_cam_pos, obj_cam_orientation, transform, to_robot=True - ) - center_pose = transformed_pose - else: - # If no transform, use camera coordinates - center_pose = Pose( - position=Vector3(obj_cam_pos[0], obj_cam_pos[1], obj_cam_pos[2]), - orientation=Quaternion(0.0, 0.0, 0.0, 1.0), # Default orientation - ) - - # Create Detection3D object - detection = Detection3D( - results_length=1, - header=Header(), # Empty header - results=[ - ObjectHypothesisWithPose( - hypothesis=ObjectHypothesis(class_id=name, score=float(prob)) - ) - ], - bbox=BoundingBox3D(center=center_pose, size=Vector3(size_x, size_y, size_z)), - id=str(track_id), - ) - - detections_3d.append(detection) - - # Create corresponding Detection2D - x1, y1, x2, y2 = bbox - center_x = (x1 + x2) / 2.0 - center_y = (y1 + y2) / 2.0 - width = x2 - x1 - height = y2 - y1 - - detection_2d = Detection2D( - results_length=1, - header=Header(), - results=[ - ObjectHypothesisWithPose( - hypothesis=ObjectHypothesis(class_id=name, score=float(prob)) - ) - ], - bbox=BoundingBox2D( - center=Pose2D(position=Point2D(center_x, center_y), theta=0.0), - size_x=float(width), - size_y=float(height), - ), - id=str(track_id), - ) - detections_2d.append(detection_2d) - - # Create and return both arrays - return ( - Detection3DArray( - detections_length=len(detections_3d), header=Header(), detections=detections_3d - ), - Detection2DArray( - detections_length=len(detections_2d), header=Header(), detections=detections_2d - ), - ) - - def visualize_detections( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - 
detections_3d: list[Detection3D], - detections_2d: list[Detection2D], - show_coordinates: bool = True, - ) -> np.ndarray: # type: ignore[type-arg] - """ - Visualize detections with 3D position overlay next to bounding boxes. - - Args: - rgb_image: Original RGB image - detections_3d: List of Detection3D objects - detections_2d: List of Detection2D objects (must be 1:1 correspondence) - show_coordinates: Whether to show 3D coordinates - - Returns: - Visualization image - """ - # Extract 2D bboxes from Detection2D objects - - bboxes_2d = [] - for det_2d in detections_2d: - if det_2d.bbox: - x1, y1, x2, y2 = bbox2d_to_corners(det_2d.bbox) - bboxes_2d.append([x1, y1, x2, y2]) - - return visualize_detections_3d(rgb_image, detections_3d, show_coordinates, bboxes_2d) - - def get_closest_detection( - self, detections: list[Detection3D], class_filter: str | None = None - ) -> Detection3D | None: - """ - Get the closest detection with valid 3D data. - - Args: - detections: List of Detection3D objects - class_filter: Optional class name to filter by - - Returns: - Closest Detection3D or None - """ - valid_detections = [] - for d in detections: - # Check if has valid bbox center position - if d.bbox and d.bbox.center and d.bbox.center.position: - # Check class filter if specified - if class_filter is None or ( - d.results_length > 0 and d.results[0].hypothesis.class_id == class_filter - ): - valid_detections.append(d) - - if not valid_detections: - return None - - # Sort by depth (Z coordinate) - def get_z_coord(d): # type: ignore[no-untyped-def] - return abs(d.bbox.center.position.z) - - return min(valid_detections, key=get_z_coord) - - def cleanup(self) -> None: - """Clean up resources.""" - if hasattr(self.detector, "cleanup"): - self.detector.cleanup() - logger.info("Detection3DProcessor cleaned up") diff --git a/dimos/manipulation/visual_servoing/manipulation_module.py b/dimos/manipulation/visual_servoing/manipulation_module.py deleted file mode 100644 index 
088db9eb26..0000000000 --- a/dimos/manipulation/visual_servoing/manipulation_module.py +++ /dev/null @@ -1,951 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Manipulation module for robotic grasping with visual servoing. -Handles grasping logic, state machine, and hardware coordination as a Dimos module. -""" - -from collections import deque -from enum import Enum -import threading -import time -from typing import Any - -import cv2 -from dimos_lcm.sensor_msgs import CameraInfo -import numpy as np -from reactivex.disposable import Disposable - -from dimos.core import In, Module, Out, rpc -from dimos.hardware.manipulators.piper.piper_arm import ( # type: ignore[import-not-found, import-untyped] - PiperArm, -) -from dimos.manipulation.visual_servoing.detection3d import Detection3DProcessor -from dimos.manipulation.visual_servoing.pbvs import PBVS -from dimos.manipulation.visual_servoing.utils import ( - create_manipulation_visualization, - is_target_reached, - select_points_from_depth, - transform_points_3d, - update_target_grasp_pose, -) -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.msgs.sensor_msgs import Image -from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray -from dimos.perception.common.utils import find_clicked_detection -from dimos.utils.logging_config import setup_logger -from dimos.utils.transform_utils import ( - compose_transforms, - 
create_transform_from_6dof, - matrix_to_pose, - pose_to_matrix, -) - -logger = setup_logger() - - -class GraspStage(Enum): - """Enum for different grasp stages.""" - - IDLE = "idle" - PRE_GRASP = "pre_grasp" - GRASP = "grasp" - CLOSE_AND_RETRACT = "close_and_retract" - PLACE = "place" - RETRACT = "retract" - - -class Feedback: - """Feedback data containing state information about the manipulation process.""" - - def __init__( - self, - grasp_stage: GraspStage, - target_tracked: bool, - current_executed_pose: Pose | None = None, - current_ee_pose: Pose | None = None, - current_camera_pose: Pose | None = None, - target_pose: Pose | None = None, - waiting_for_reach: bool = False, - success: bool | None = None, - ) -> None: - self.grasp_stage = grasp_stage - self.target_tracked = target_tracked - self.current_executed_pose = current_executed_pose - self.current_ee_pose = current_ee_pose - self.current_camera_pose = current_camera_pose - self.target_pose = target_pose - self.waiting_for_reach = waiting_for_reach - self.success = success - - -class ManipulationModule(Module): - """ - Manipulation module for visual servoing and grasping. - - Subscribes to: - - ZED RGB images - - ZED depth images - - ZED camera info - - Publishes: - - Visualization images - - RPC methods: - - handle_keyboard_command: Process keyboard input - - pick_and_place: Execute pick and place task - """ - - # LCM inputs - rgb_image: In[Image] - depth_image: In[Image] - camera_info: In[CameraInfo] - - # LCM outputs - viz_image: Out[Image] - - def __init__( # type: ignore[no-untyped-def] - self, - ee_to_camera_6dof: list | None = None, # type: ignore[type-arg] - **kwargs, - ) -> None: - """ - Initialize manipulation module. 
- - Args: - ee_to_camera_6dof: EE to camera transform [x, y, z, rx, ry, rz] in meters and radians - workspace_min_radius: Minimum workspace radius in meters - workspace_max_radius: Maximum workspace radius in meters - min_grasp_pitch_degrees: Minimum grasp pitch angle (at max radius) - max_grasp_pitch_degrees: Maximum grasp pitch angle (at min radius) - """ - super().__init__(**kwargs) - - self.arm = PiperArm() - - if ee_to_camera_6dof is None: - ee_to_camera_6dof = [-0.065, 0.03, -0.095, 0.0, -1.57, 0.0] - pos = Vector3(ee_to_camera_6dof[0], ee_to_camera_6dof[1], ee_to_camera_6dof[2]) - rot = Vector3(ee_to_camera_6dof[3], ee_to_camera_6dof[4], ee_to_camera_6dof[5]) - self.T_ee_to_camera = create_transform_from_6dof(pos, rot) - - self.camera_intrinsics = None - self.detector = None - self.pbvs = None - - # Control state - self.last_valid_target = None - self.waiting_for_reach = False - self.current_executed_pose = None # Track the actual pose sent to arm - self.target_updated = False - self.waiting_start_time = None - self.reach_pose_timeout = 20.0 - - # Grasp parameters - self.grasp_width_offset = 0.03 - self.pregrasp_distance = 0.25 - self.grasp_distance_range = 0.03 - self.grasp_close_delay = 2.0 - self.grasp_reached_time = None - self.gripper_max_opening = 0.07 - - # Workspace limits and dynamic pitch parameters - self.workspace_min_radius = 0.2 - self.workspace_max_radius = 0.75 - self.min_grasp_pitch_degrees = 5.0 - self.max_grasp_pitch_degrees = 60.0 - - # Grasp stage tracking - self.grasp_stage = GraspStage.IDLE - - # Pose stabilization tracking - self.pose_history_size = 4 - self.pose_stabilization_threshold = 0.01 - self.stabilization_timeout = 25.0 - self.stabilization_start_time = None - self.reached_poses = deque(maxlen=self.pose_history_size) # type: ignore[var-annotated] - self.adjustment_count = 0 - - # Pose reachability tracking - self.ee_pose_history = deque(maxlen=20) # type: ignore[var-annotated] # Keep history of EE poses - 
self.stuck_pose_threshold = 0.001 # 1mm movement threshold - self.stuck_pose_adjustment_degrees = 5.0 - self.stuck_count = 0 - self.max_stuck_reattempts = 7 - - # State for visualization - self.current_visualization = None - self.last_detection_3d_array = None - self.last_detection_2d_array = None - - # Grasp result and task tracking - self.pick_success = None - self.final_pregrasp_pose = None - self.task_failed = False - self.overall_success = None - - # Task control - self.task_running = False - self.task_thread = None - self.stop_event = threading.Event() - - # Latest sensor data - self.latest_rgb = None - self.latest_depth = None - self.latest_camera_info = None - - # Target selection - self.target_click = None - - # Place target position and object info - self.home_pose = Pose( - position=Vector3(0.0, 0.0, 0.0), orientation=Quaternion(0.0, 0.0, 0.0, 1.0) - ) - self.place_target_position = None - self.target_object_height = None - self.retract_distance = 0.12 - self.place_pose = None - self.retract_pose = None - self.arm.gotoObserve() - - @rpc - def start(self) -> None: - """Start the manipulation module.""" - - unsub = self.rgb_image.subscribe(self._on_rgb_image) - self._disposables.add(Disposable(unsub)) - - unsub = self.depth_image.subscribe(self._on_depth_image) - self._disposables.add(Disposable(unsub)) - - unsub = self.camera_info.subscribe(self._on_camera_info) - self._disposables.add(Disposable(unsub)) - - logger.info("Manipulation module started") - - @rpc - def stop(self) -> None: - """Stop the manipulation module.""" - # Stop any running task - self.stop_event.set() - if self.task_thread and self.task_thread.is_alive(): - self.task_thread.join(timeout=5.0) - - self.reset_to_idle() - - if self.detector and hasattr(self.detector, "cleanup"): - self.detector.cleanup() - self.arm.disable() - - logger.info("Manipulation module stopped") - - def _on_rgb_image(self, msg: Image) -> None: - """Handle RGB image messages.""" - try: - self.latest_rgb = msg.data 
- except Exception as e: - logger.error(f"Error processing RGB image: {e}") - - def _on_depth_image(self, msg: Image) -> None: - """Handle depth image messages.""" - try: - self.latest_depth = msg.data - except Exception as e: - logger.error(f"Error processing depth image: {e}") - - def _on_camera_info(self, msg: CameraInfo) -> None: - """Handle camera info messages.""" - try: - self.camera_intrinsics = [msg.K[0], msg.K[4], msg.K[2], msg.K[5]] # type: ignore[assignment] - - if self.detector is None: - self.detector = Detection3DProcessor(self.camera_intrinsics) # type: ignore[arg-type, assignment] - self.pbvs = PBVS() # type: ignore[assignment] - logger.info("Initialized detection and PBVS processors") - - self.latest_camera_info = msg - except Exception as e: - logger.error(f"Error processing camera info: {e}") - - @rpc - def get_single_rgb_frame(self) -> np.ndarray | None: # type: ignore[type-arg] - """ - get the latest rgb frame from the camera - """ - return self.latest_rgb - - @rpc - def handle_keyboard_command(self, key: str) -> str: - """ - Handle keyboard commands for robot control. 
- - Args: - key: Keyboard key as string - - Returns: - Action taken as string, or empty string if no action - """ - key_code = ord(key) if len(key) == 1 else int(key) - - if key_code == ord("r"): - self.stop_event.set() - self.task_running = False - self.reset_to_idle() - return "reset" - elif key_code == ord("s"): - logger.info("SOFT STOP - Emergency stopping robot!") - self.arm.softStop() - self.stop_event.set() - self.task_running = False - return "stop" - elif key_code == ord(" ") and self.pbvs and self.pbvs.target_grasp_pose: - if self.grasp_stage == GraspStage.PRE_GRASP: - self.set_grasp_stage(GraspStage.GRASP) - logger.info("Executing target pose") - return "execute" - elif key_code == ord("g"): - logger.info("Opening gripper") - self.arm.release_gripper() - return "release" - - return "" - - @rpc - def pick_and_place( - self, - target_x: int | None = None, - target_y: int | None = None, - place_x: int | None = None, - place_y: int | None = None, - ) -> dict[str, Any]: - """ - Start a pick and place task. 
- - Args: - target_x: Optional X coordinate of target object - target_y: Optional Y coordinate of target object - place_x: Optional X coordinate of place location - place_y: Optional Y coordinate of place location - - Returns: - Dict with status and message - """ - if self.task_running: - return {"status": "error", "message": "Task already running"} - - if self.camera_intrinsics is None: - return {"status": "error", "message": "Camera not initialized"} - - if target_x is not None and target_y is not None: - self.target_click = (target_x, target_y) - if place_x is not None and self.latest_depth is not None: - points_3d_camera = select_points_from_depth( - self.latest_depth, - (place_x, place_y), - self.camera_intrinsics, - radius=10, - ) - - if points_3d_camera.size > 0: - ee_pose = self.arm.get_ee_pose() - ee_transform = pose_to_matrix(ee_pose) - camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera) - - points_3d_world = transform_points_3d( - points_3d_camera, - camera_transform, - to_robot=True, - ) - - place_position = np.mean(points_3d_world, axis=0) - self.place_target_position = place_position - logger.info( - f"Place target set at position: ({place_position[0]:.3f}, {place_position[1]:.3f}, {place_position[2]:.3f})" - ) - else: - logger.warning("No valid depth points found at place location") - self.place_target_position = None - else: - self.place_target_position = None - - self.task_failed = False - self.stop_event.clear() - - if self.task_thread and self.task_thread.is_alive(): - self.stop_event.set() - self.task_thread.join(timeout=1.0) - self.task_thread = threading.Thread(target=self._run_pick_and_place, daemon=True) - self.task_thread.start() - - return {"status": "started", "message": "Pick and place task started"} - - def _run_pick_and_place(self) -> None: - """Run the pick and place task loop.""" - self.task_running = True - logger.info("Starting pick and place task") - - try: - while not self.stop_event.is_set(): - if 
self.task_failed: - logger.error("Task failed, terminating pick and place") - self.stop_event.set() - break - - feedback = self.update() - if feedback is None: - time.sleep(0.01) - continue - - if feedback.success is not None: # type: ignore[attr-defined] - if feedback.success: # type: ignore[attr-defined] - logger.info("Pick and place completed successfully!") - else: - logger.warning("Pick and place failed") - self.reset_to_idle() - self.stop_event.set() - break - - time.sleep(0.01) - - except Exception as e: - logger.error(f"Error in pick and place task: {e}") - self.task_failed = True - finally: - self.task_running = False - logger.info("Pick and place task ended") - - def set_grasp_stage(self, stage: GraspStage) -> None: - """Set the grasp stage.""" - self.grasp_stage = stage - logger.info(f"Grasp stage: {stage.value}") - - def calculate_dynamic_grasp_pitch(self, target_pose: Pose) -> float: - """ - Calculate grasp pitch dynamically based on distance from robot base. - Maps workspace radius to grasp pitch angle. 
- - Args: - target_pose: Target pose - - Returns: - Grasp pitch angle in degrees - """ - # Calculate 3D distance from robot base (assumes robot at origin) - position = target_pose.position - distance = np.sqrt(position.x**2 + position.y**2 + position.z**2) - - # Clamp distance to workspace limits - distance = np.clip(distance, self.workspace_min_radius, self.workspace_max_radius) - - # Linear interpolation: min_radius -> max_pitch, max_radius -> min_pitch - # Normalized distance (0 to 1) - normalized_dist = (distance - self.workspace_min_radius) / ( - self.workspace_max_radius - self.workspace_min_radius - ) - - # Inverse mapping: closer objects need higher pitch - pitch_degrees = self.max_grasp_pitch_degrees - ( - normalized_dist * (self.max_grasp_pitch_degrees - self.min_grasp_pitch_degrees) - ) - - return pitch_degrees # type: ignore[no-any-return] - - def check_within_workspace(self, target_pose: Pose) -> bool: - """ - Check if pose is within workspace limits and log error if not. - - Args: - target_pose: Target pose to validate - - Returns: - True if within workspace, False otherwise - """ - # Calculate 3D distance from robot base - position = target_pose.position - distance = np.sqrt(position.x**2 + position.y**2 + position.z**2) - - if not (self.workspace_min_radius <= distance <= self.workspace_max_radius): - logger.error( - f"Target outside workspace limits: distance {distance:.3f}m not in [{self.workspace_min_radius:.2f}, {self.workspace_max_radius:.2f}]" - ) - return False - - return True - - def _check_reach_timeout(self) -> tuple[bool, float]: - """Check if robot has exceeded timeout while reaching pose. 
- - Returns: - Tuple of (timed_out, time_elapsed) - """ - if self.waiting_start_time: - time_elapsed = time.time() - self.waiting_start_time - if time_elapsed > self.reach_pose_timeout: - logger.warning( - f"Robot failed to reach pose within {self.reach_pose_timeout}s timeout" - ) - self.task_failed = True - self.reset_to_idle() - return True, time_elapsed - return False, time_elapsed - return False, 0.0 - - def _check_if_stuck(self) -> bool: - """ - Check if robot is stuck by analyzing pose history. - - Returns: - Tuple of (is_stuck, max_std_dev_mm) - """ - if len(self.ee_pose_history) < self.ee_pose_history.maxlen: # type: ignore[operator] - return False - - # Extract positions from pose history - positions = np.array( - [[p.position.x, p.position.y, p.position.z] for p in self.ee_pose_history] - ) - - # Calculate standard deviation of positions - std_devs = np.std(positions, axis=0) - # Check if all standard deviations are below stuck threshold - is_stuck = np.all(std_devs < self.stuck_pose_threshold) - - return is_stuck # type: ignore[return-value] - - def check_reach_and_adjust(self) -> bool: - """ - Check if robot has reached the current executed pose while waiting. - Handles timeout internally by failing the task. - Also detects if the robot is stuck (not moving towards target). 
- - Returns: - True if reached, False if still waiting or not in waiting state - """ - if not self.waiting_for_reach or not self.current_executed_pose: - return False - - # Get current end-effector pose - ee_pose = self.arm.get_ee_pose() - target_pose = self.current_executed_pose - - # Check for timeout - this will fail task and reset if timeout occurred - timed_out, _time_elapsed = self._check_reach_timeout() - if timed_out: - return False - - self.ee_pose_history.append(ee_pose) - - # Check if robot is stuck - is_stuck = self._check_if_stuck() - if is_stuck: - if self.grasp_stage == GraspStage.RETRACT or self.grasp_stage == GraspStage.PLACE: - self.waiting_for_reach = False - self.waiting_start_time = None - self.stuck_count = 0 - self.ee_pose_history.clear() - return True - self.stuck_count += 1 - pitch_degrees = self.calculate_dynamic_grasp_pitch(target_pose) - if self.stuck_count % 2 == 0: - pitch_degrees += self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2) - else: - pitch_degrees -= self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2) - - pitch_degrees = max( - self.min_grasp_pitch_degrees, min(self.max_grasp_pitch_degrees, pitch_degrees) - ) - updated_target_pose = update_target_grasp_pose(target_pose, ee_pose, 0.0, pitch_degrees) - self.arm.cmd_ee_pose(updated_target_pose) - self.current_executed_pose = updated_target_pose - self.ee_pose_history.clear() - self.waiting_for_reach = True - self.waiting_start_time = time.time() - return False - - if self.stuck_count >= self.max_stuck_reattempts: - self.task_failed = True - self.reset_to_idle() - return False - - if is_target_reached(target_pose, ee_pose, self.pbvs.target_tolerance): - self.waiting_for_reach = False - self.waiting_start_time = None - self.stuck_count = 0 - self.ee_pose_history.clear() - return True - return False - - def _update_tracking(self, detection_3d_array: Detection3DArray | None) -> bool: - """Update tracking with new detections.""" - if not detection_3d_array 
or not self.pbvs: - return False - - target_tracked = self.pbvs.update_tracking(detection_3d_array) - if target_tracked: - self.target_updated = True - self.last_valid_target = self.pbvs.get_current_target() - return target_tracked - - def reset_to_idle(self) -> None: - """Reset the manipulation system to IDLE state.""" - if self.pbvs: - self.pbvs.clear_target() - self.grasp_stage = GraspStage.IDLE - self.reached_poses.clear() - self.ee_pose_history.clear() - self.adjustment_count = 0 - self.waiting_for_reach = False - self.current_executed_pose = None - self.target_updated = False - self.stabilization_start_time = None - self.grasp_reached_time = None - self.waiting_start_time = None - self.pick_success = None - self.final_pregrasp_pose = None - self.overall_success = None - self.place_pose = None - self.retract_pose = None - self.stuck_count = 0 - - self.arm.gotoObserve() - - def execute_idle(self) -> None: - """Execute idle stage.""" - pass - - def execute_pre_grasp(self) -> None: - """Execute pre-grasp stage: visual servoing to pre-grasp position.""" - if self.waiting_for_reach: - if self.check_reach_and_adjust(): - self.reached_poses.append(self.current_executed_pose) - self.target_updated = False - time.sleep(0.2) - return - if ( - self.stabilization_start_time - and (time.time() - self.stabilization_start_time) > self.stabilization_timeout - ): - logger.warning( - f"Failed to get stable grasp after {self.stabilization_timeout} seconds, resetting" - ) - self.task_failed = True - self.reset_to_idle() - return - - ee_pose = self.arm.get_ee_pose() - dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.pbvs.current_target.bbox.center) # type: ignore[attr-defined] - - _, _, _, has_target, target_pose = self.pbvs.compute_control( # type: ignore[attr-defined] - ee_pose, self.pregrasp_distance, dynamic_pitch - ) - if target_pose and has_target: - # Validate target pose is within workspace - if not self.check_within_workspace(target_pose): - self.task_failed = True 
- self.reset_to_idle() - return - - if self.check_target_stabilized(): - logger.info("Target stabilized, transitioning to GRASP") - self.final_pregrasp_pose = self.current_executed_pose - self.grasp_stage = GraspStage.GRASP - self.adjustment_count = 0 - self.waiting_for_reach = False - elif not self.waiting_for_reach and self.target_updated: - self.arm.cmd_ee_pose(target_pose) - self.current_executed_pose = target_pose - self.waiting_for_reach = True - self.waiting_start_time = time.time() # type: ignore[assignment] - self.target_updated = False - self.adjustment_count += 1 - time.sleep(0.2) - - def execute_grasp(self) -> None: - """Execute grasp stage: move to final grasp position.""" - if self.waiting_for_reach: - if self.check_reach_and_adjust() and not self.grasp_reached_time: - self.grasp_reached_time = time.time() # type: ignore[assignment] - return - - if self.grasp_reached_time: - if (time.time() - self.grasp_reached_time) >= self.grasp_close_delay: - logger.info("Grasp delay completed, closing gripper") - self.grasp_stage = GraspStage.CLOSE_AND_RETRACT - return - - if self.last_valid_target: - # Calculate dynamic pitch for current target - dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.last_valid_target.bbox.center) - normalized_pitch = dynamic_pitch / 90.0 - grasp_distance = -self.grasp_distance_range + ( - 2 * self.grasp_distance_range * normalized_pitch - ) - - ee_pose = self.arm.get_ee_pose() - _, _, _, has_target, target_pose = self.pbvs.compute_control( - ee_pose, grasp_distance, dynamic_pitch - ) - - if target_pose and has_target: - # Validate grasp pose is within workspace - if not self.check_within_workspace(target_pose): - self.task_failed = True - self.reset_to_idle() - return - - object_width = self.last_valid_target.bbox.size.x - gripper_opening = max( - 0.005, min(object_width + self.grasp_width_offset, self.gripper_max_opening) - ) - - logger.info(f"Executing grasp: gripper={gripper_opening * 1000:.1f}mm") - 
self.arm.cmd_gripper_ctrl(gripper_opening) - self.arm.cmd_ee_pose(target_pose, line_mode=True) - self.current_executed_pose = target_pose - self.waiting_for_reach = True - self.waiting_start_time = time.time() - - def execute_close_and_retract(self) -> None: - """Execute the retraction sequence after gripper has been closed.""" - if self.waiting_for_reach and self.final_pregrasp_pose: - if self.check_reach_and_adjust(): - logger.info("Reached pre-grasp retraction position") - self.pick_success = self.arm.gripper_object_detected() - if self.pick_success: - logger.info("Object successfully grasped!") - if self.place_target_position is not None: - logger.info("Transitioning to PLACE stage") - self.grasp_stage = GraspStage.PLACE - else: - self.overall_success = True - else: - logger.warning("No object detected in gripper") - self.task_failed = True - self.overall_success = False - return - if not self.waiting_for_reach: - logger.info("Retracting to pre-grasp position") - self.arm.cmd_ee_pose(self.final_pregrasp_pose, line_mode=True) - self.current_executed_pose = self.final_pregrasp_pose - self.arm.close_gripper() - self.waiting_for_reach = True - self.waiting_start_time = time.time() # type: ignore[assignment] - - def execute_place(self) -> None: - """Execute place stage: move to place position and release object.""" - if self.waiting_for_reach: - # Use the already executed pose instead of recalculating - if self.check_reach_and_adjust(): - logger.info("Reached place position, releasing gripper") - self.arm.release_gripper() - time.sleep(1.0) - self.place_pose = self.current_executed_pose - logger.info("Transitioning to RETRACT stage") - self.grasp_stage = GraspStage.RETRACT - return - - if not self.waiting_for_reach: - place_pose = self.get_place_target_pose() - if place_pose: - logger.info("Moving to place position") - self.arm.cmd_ee_pose(place_pose, line_mode=True) - self.current_executed_pose = place_pose # type: ignore[assignment] - self.waiting_for_reach = True 
- self.waiting_start_time = time.time() # type: ignore[assignment] - else: - logger.error("Failed to get place target pose") - self.task_failed = True - self.overall_success = False # type: ignore[assignment] - - def execute_retract(self) -> None: - """Execute retract stage: retract from place position.""" - if self.waiting_for_reach and self.retract_pose: - if self.check_reach_and_adjust(): - logger.info("Reached retract position") - logger.info("Returning to observe position") - self.arm.gotoObserve() - self.arm.close_gripper() - self.overall_success = True - logger.info("Pick and place completed successfully!") - return - - if not self.waiting_for_reach: - if self.place_pose: - pose_pitch = self.calculate_dynamic_grasp_pitch(self.place_pose) - self.retract_pose = update_target_grasp_pose( - self.place_pose, self.home_pose, self.retract_distance, pose_pitch - ) - logger.info("Retracting from place position") - self.arm.cmd_ee_pose(self.retract_pose, line_mode=True) - self.current_executed_pose = self.retract_pose - self.waiting_for_reach = True - self.waiting_start_time = time.time() - else: - logger.error("No place pose stored for retraction") - self.task_failed = True - self.overall_success = False # type: ignore[assignment] - - def capture_and_process( - self, - ) -> tuple[np.ndarray | None, Detection3DArray | None, Detection2DArray | None, Pose | None]: # type: ignore[type-arg] - """Capture frame from camera data and process detections.""" - if self.latest_rgb is None or self.latest_depth is None or self.detector is None: - return None, None, None, None - - ee_pose = self.arm.get_ee_pose() - ee_transform = pose_to_matrix(ee_pose) - camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera) - camera_pose = matrix_to_pose(camera_transform) - detection_3d_array, detection_2d_array = self.detector.process_frame( - self.latest_rgb, self.latest_depth, camera_transform - ) - - return self.latest_rgb, detection_3d_array, detection_2d_array, camera_pose 
- - def pick_target(self, x: int, y: int) -> bool: - """Select a target object at the given pixel coordinates.""" - if not self.last_detection_2d_array or not self.last_detection_3d_array: - logger.warning("No detections available for target selection") - return False - - clicked_3d = find_clicked_detection( - (x, y), self.last_detection_2d_array.detections, self.last_detection_3d_array.detections - ) - if clicked_3d and self.pbvs: - # Validate workspace - if not self.check_within_workspace(clicked_3d.bbox.center): - self.task_failed = True - return False - - self.pbvs.set_target(clicked_3d) - - if clicked_3d.bbox and clicked_3d.bbox.size: - self.target_object_height = clicked_3d.bbox.size.z - logger.info(f"Target object height: {self.target_object_height:.3f}m") - - position = clicked_3d.bbox.center.position - logger.info( - f"Target selected: ID={clicked_3d.id}, pos=({position.x:.3f}, {position.y:.3f}, {position.z:.3f})" - ) - self.grasp_stage = GraspStage.PRE_GRASP - self.reached_poses.clear() - self.adjustment_count = 0 - self.waiting_for_reach = False - self.current_executed_pose = None - self.stabilization_start_time = time.time() - return True - return False - - def update(self) -> dict[str, Any] | None: - """Main update function that handles capture, processing, control, and visualization.""" - rgb, detection_3d_array, detection_2d_array, camera_pose = self.capture_and_process() - if rgb is None: - return None - - self.last_detection_3d_array = detection_3d_array # type: ignore[assignment] - self.last_detection_2d_array = detection_2d_array # type: ignore[assignment] - if self.target_click: - x, y = self.target_click - if self.pick_target(x, y): - self.target_click = None - - if ( - detection_3d_array - and self.grasp_stage in [GraspStage.PRE_GRASP, GraspStage.GRASP] - and not self.waiting_for_reach - ): - self._update_tracking(detection_3d_array) - stage_handlers = { - GraspStage.IDLE: self.execute_idle, - GraspStage.PRE_GRASP: self.execute_pre_grasp, - 
GraspStage.GRASP: self.execute_grasp, - GraspStage.CLOSE_AND_RETRACT: self.execute_close_and_retract, - GraspStage.PLACE: self.execute_place, - GraspStage.RETRACT: self.execute_retract, - } - if self.grasp_stage in stage_handlers: - stage_handlers[self.grasp_stage]() - - target_tracked = self.pbvs.get_current_target() is not None if self.pbvs else False - ee_pose = self.arm.get_ee_pose() - feedback = Feedback( - grasp_stage=self.grasp_stage, - target_tracked=target_tracked, - current_executed_pose=self.current_executed_pose, - current_ee_pose=ee_pose, - current_camera_pose=camera_pose, - target_pose=self.pbvs.target_grasp_pose if self.pbvs else None, - waiting_for_reach=self.waiting_for_reach, - success=self.overall_success, - ) - - if self.task_running: - self.current_visualization = create_manipulation_visualization( # type: ignore[assignment] - rgb, feedback, detection_3d_array, detection_2d_array - ) - - if self.current_visualization is not None: - self._publish_visualization(self.current_visualization) - - return feedback # type: ignore[return-value] - - def _publish_visualization(self, viz_image: np.ndarray) -> None: # type: ignore[type-arg] - """Publish visualization image to LCM.""" - try: - viz_rgb = cv2.cvtColor(viz_image, cv2.COLOR_BGR2RGB) - msg = Image.from_numpy(viz_rgb) - self.viz_image.publish(msg) - except Exception as e: - logger.error(f"Error publishing visualization: {e}") - - def check_target_stabilized(self) -> bool: - """Check if the commanded poses have stabilized.""" - if len(self.reached_poses) < self.reached_poses.maxlen: # type: ignore[operator] - return False - - positions = np.array( - [[p.position.x, p.position.y, p.position.z] for p in self.reached_poses] - ) - std_devs = np.std(positions, axis=0) - return np.all(std_devs < self.pose_stabilization_threshold) # type: ignore[return-value] - - def get_place_target_pose(self) -> Pose | None: - """Get the place target pose with z-offset applied based on object height.""" - if 
self.place_target_position is None: - return None - - place_pos = self.place_target_position.copy() - if self.target_object_height is not None: - z_offset = self.target_object_height / 2.0 - place_pos[2] += z_offset + 0.1 - - place_center_pose = Pose( - position=Vector3(place_pos[0], place_pos[1], place_pos[2]), - orientation=Quaternion(0.0, 0.0, 0.0, 1.0), - ) - - ee_pose = self.arm.get_ee_pose() - - # Calculate dynamic pitch for place position - dynamic_pitch = self.calculate_dynamic_grasp_pitch(place_center_pose) - - place_pose = update_target_grasp_pose( - place_center_pose, - ee_pose, - grasp_distance=0.0, - grasp_pitch_degrees=dynamic_pitch, - ) - - return place_pose diff --git a/dimos/manipulation/visual_servoing/pbvs.py b/dimos/manipulation/visual_servoing/pbvs.py deleted file mode 100644 index f94c233834..0000000000 --- a/dimos/manipulation/visual_servoing/pbvs.py +++ /dev/null @@ -1,488 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Position-Based Visual Servoing (PBVS) system for robotic manipulation. -Supports both eye-in-hand and eye-to-hand configurations. 
-""" - -from collections import deque - -from dimos_lcm.vision_msgs import Detection3D -import numpy as np -from scipy.spatial.transform import Rotation as R # type: ignore[import-untyped] - -from dimos.manipulation.visual_servoing.utils import ( - create_pbvs_visualization, - find_best_object_match, - is_target_reached, - update_target_grasp_pose, -) -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.msgs.vision_msgs import Detection3DArray -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class PBVS: - """ - High-level Position-Based Visual Servoing orchestrator. - - Handles: - - Object tracking and target management - - Pregrasp distance computation - - Grasp pose generation - - Coordination with low-level controller - - Note: This class is agnostic to camera mounting (eye-in-hand vs eye-to-hand). - The caller is responsible for providing appropriate camera and EE poses. - """ - - def __init__( - self, - position_gain: float = 0.5, - rotation_gain: float = 0.3, - max_velocity: float = 0.1, # m/s - max_angular_velocity: float = 0.5, # rad/s - target_tolerance: float = 0.01, # 1cm - max_tracking_distance_threshold: float = 0.12, # Max distance for target tracking (m) - min_size_similarity: float = 0.6, # Min size similarity threshold (0.0-1.0) - direct_ee_control: bool = True, # If True, output target poses instead of velocities - ) -> None: - """ - Initialize PBVS system. 
- - Args: - position_gain: Proportional gain for position control - rotation_gain: Proportional gain for rotation control - max_velocity: Maximum linear velocity command magnitude (m/s) - max_angular_velocity: Maximum angular velocity command magnitude (rad/s) - target_tolerance: Distance threshold for considering target reached (m) - max_tracking_distance: Maximum distance for valid target tracking (m) - min_size_similarity: Minimum size similarity for valid target tracking (0.0-1.0) - direct_ee_control: If True, output target poses instead of velocity commands - """ - # Initialize low-level controller only if not in direct control mode - if not direct_ee_control: - self.controller = PBVSController( - position_gain=position_gain, - rotation_gain=rotation_gain, - max_velocity=max_velocity, - max_angular_velocity=max_angular_velocity, - target_tolerance=target_tolerance, - ) - else: - self.controller = None # type: ignore[assignment] - - # Store parameters for direct mode error computation - self.target_tolerance = target_tolerance - - # Target tracking parameters - self.max_tracking_distance_threshold = max_tracking_distance_threshold - self.min_size_similarity = min_size_similarity - self.direct_ee_control = direct_ee_control - - # Target state - self.current_target = None - self.target_grasp_pose = None - - # Detection history for robust tracking - self.detection_history_size = 3 - self.detection_history = deque(maxlen=self.detection_history_size) # type: ignore[var-annotated] - - # For direct control mode visualization - self.last_position_error = None - self.last_target_reached = False - - logger.info( - f"Initialized PBVS system with controller gains: pos={position_gain}, rot={rotation_gain}, " - f"tracking_thresholds: distance={max_tracking_distance_threshold}m, size={min_size_similarity:.2f}" - ) - - def set_target(self, target_object: Detection3D) -> bool: - """ - Set a new target object for servoing. 
- - Args: - target_object: Detection3D object - - Returns: - True if target was set successfully - """ - if target_object and target_object.bbox and target_object.bbox.center: - self.current_target = target_object - self.target_grasp_pose = None # Will be computed when needed - logger.info(f"New target set: ID {target_object.id}") - return True - return False - - def clear_target(self) -> None: - """Clear the current target.""" - self.current_target = None - self.target_grasp_pose = None - self.last_position_error = None - self.last_target_reached = False - self.detection_history.clear() - if self.controller: - self.controller.clear_state() - logger.info("Target cleared") - - def get_current_target(self) -> Detection3D | None: - """ - Get the current target object. - - Returns: - Current target Detection3D or None if no target selected - """ - return self.current_target - - def update_tracking(self, new_detections: Detection3DArray | None = None) -> bool: - """ - Update target tracking with new detections using a rolling window. - If tracking is lost, keeps the old target pose. 
- - Args: - new_detections: Optional new detections for target tracking - - Returns: - True if target was successfully tracked, False if lost (but target is kept) - """ - # Check if we have a current target - if not self.current_target: - return False - - # Add new detections to history if provided - if new_detections is not None and new_detections.detections_length > 0: - self.detection_history.append(new_detections) - - # If no detection history, can't track - if not self.detection_history: - logger.debug("No detection history for target tracking - using last known pose") - return False - - # Collect all candidates from detection history - all_candidates = [] - for detection_array in self.detection_history: - all_candidates.extend(detection_array.detections) - - if not all_candidates: - logger.debug("No candidates in detection history") - return False - - # Use stage-dependent distance threshold - max_distance = self.max_tracking_distance_threshold - - # Find best match across all recent detections - match_result = find_best_object_match( - target_obj=self.current_target, - candidates=all_candidates, - max_distance=max_distance, - min_size_similarity=self.min_size_similarity, - ) - - if match_result.is_valid_match: - self.current_target = match_result.matched_object - self.target_grasp_pose = None # Recompute grasp pose - logger.debug( - f"Target tracking successful: distance={match_result.distance:.3f}m, " - f"size_similarity={match_result.size_similarity:.2f}, " - f"confidence={match_result.confidence:.2f}" - ) - return True - - logger.debug( - f"Target tracking lost across {len(self.detection_history)} frames: " - f"distance={match_result.distance:.3f}m, " - f"size_similarity={match_result.size_similarity:.2f}, " - f"thresholds: distance={max_distance:.3f}m, size={self.min_size_similarity:.2f}" - ) - return False - - def compute_control( - self, - ee_pose: Pose, - grasp_distance: float = 0.15, - grasp_pitch_degrees: float = 45.0, - ) -> tuple[Vector3 | None, 
Vector3 | None, bool, bool, Pose | None]: - """ - Compute PBVS control with position and orientation servoing. - - Args: - ee_pose: Current end-effector pose - grasp_distance: Distance to maintain from target (meters) - - Returns: - Tuple of (velocity_command, angular_velocity_command, target_reached, has_target, target_pose) - - velocity_command: Linear velocity vector or None if no target (None in direct_ee_control mode) - - angular_velocity_command: Angular velocity vector or None if no target (None in direct_ee_control mode) - - target_reached: True if within target tolerance - - has_target: True if currently tracking a target - - target_pose: Target EE pose (only in direct_ee_control mode, otherwise None) - """ - # Check if we have a target - if not self.current_target: - return None, None, False, False, None - - # Update target grasp pose with provided distance and pitch - self.target_grasp_pose = update_target_grasp_pose( - self.current_target.bbox.center, ee_pose, grasp_distance, grasp_pitch_degrees - ) - - if self.target_grasp_pose is None: - logger.warning("Failed to compute grasp pose") - return None, None, False, False, None - - # Compute errors for visualization before checking if reached (in case pose gets cleared) - if self.direct_ee_control and self.target_grasp_pose: - self.last_position_error = Vector3( - self.target_grasp_pose.position.x - ee_pose.position.x, - self.target_grasp_pose.position.y - ee_pose.position.y, - self.target_grasp_pose.position.z - ee_pose.position.z, - ) - - # Check if target reached using our separate function - target_reached = is_target_reached(self.target_grasp_pose, ee_pose, self.target_tolerance) - - # Return appropriate values based on control mode - if self.direct_ee_control: - # Direct control mode - if self.target_grasp_pose: - self.last_target_reached = target_reached - # Return has_target=True since we have a target - return None, None, target_reached, True, self.target_grasp_pose - else: - return None, None, 
False, True, None - else: - # Velocity control mode - use controller - velocity_cmd, angular_velocity_cmd, _controller_reached = ( - self.controller.compute_control(ee_pose, self.target_grasp_pose) - ) - # Return has_target=True since we have a target, regardless of tracking status - return velocity_cmd, angular_velocity_cmd, target_reached, True, None - - def create_status_overlay( # type: ignore[no-untyped-def] - self, - image: np.ndarray, # type: ignore[type-arg] - grasp_stage=None, - ) -> np.ndarray: # type: ignore[type-arg] - """ - Create PBVS status overlay on image. - - Args: - image: Input image - grasp_stage: Current grasp stage (optional) - - Returns: - Image with PBVS status overlay - """ - stage_value = grasp_stage.value if grasp_stage else "idle" - return create_pbvs_visualization( - image, - self.current_target, - self.last_position_error, - self.last_target_reached, - stage_value, - ) - - -class PBVSController: - """ - Low-level Position-Based Visual Servoing controller. - Pure control logic that computes velocity commands from poses. - - Handles: - - Position and orientation error computation - - Velocity command generation with gain control - - Target reached detection - """ - - def __init__( - self, - position_gain: float = 0.5, - rotation_gain: float = 0.3, - max_velocity: float = 0.1, # m/s - max_angular_velocity: float = 0.5, # rad/s - target_tolerance: float = 0.01, # 1cm - ) -> None: - """ - Initialize PBVS controller. 
- - Args: - position_gain: Proportional gain for position control - rotation_gain: Proportional gain for rotation control - max_velocity: Maximum linear velocity command magnitude (m/s) - max_angular_velocity: Maximum angular velocity command magnitude (rad/s) - target_tolerance: Distance threshold for considering target reached (m) - """ - self.position_gain = position_gain - self.rotation_gain = rotation_gain - self.max_velocity = max_velocity - self.max_angular_velocity = max_angular_velocity - self.target_tolerance = target_tolerance - - self.last_position_error = None - self.last_rotation_error = None - self.last_velocity_cmd = None - self.last_angular_velocity_cmd = None - self.last_target_reached = False - - logger.info( - f"Initialized PBVS controller: pos_gain={position_gain}, rot_gain={rotation_gain}, " - f"max_vel={max_velocity}m/s, max_ang_vel={max_angular_velocity}rad/s, " - f"target_tolerance={target_tolerance}m" - ) - - def clear_state(self) -> None: - """Clear controller state.""" - self.last_position_error = None - self.last_rotation_error = None - self.last_velocity_cmd = None - self.last_angular_velocity_cmd = None - self.last_target_reached = False - - def compute_control( - self, ee_pose: Pose, grasp_pose: Pose - ) -> tuple[Vector3 | None, Vector3 | None, bool]: - """ - Compute PBVS control with position and orientation servoing. 
- - Args: - ee_pose: Current end-effector pose - grasp_pose: Target grasp pose - - Returns: - Tuple of (velocity_command, angular_velocity_command, target_reached) - - velocity_command: Linear velocity vector - - angular_velocity_command: Angular velocity vector - - target_reached: True if within target tolerance - """ - # Calculate position error (target - EE position) - error = Vector3( - grasp_pose.position.x - ee_pose.position.x, - grasp_pose.position.y - ee_pose.position.y, - grasp_pose.position.z - ee_pose.position.z, - ) - self.last_position_error = error # type: ignore[assignment] - - # Compute velocity command with proportional control - velocity_cmd = Vector3( - error.x * self.position_gain, - error.y * self.position_gain, - error.z * self.position_gain, - ) - - # Limit velocity magnitude - vel_magnitude = np.linalg.norm([velocity_cmd.x, velocity_cmd.y, velocity_cmd.z]) - if vel_magnitude > self.max_velocity: - scale = self.max_velocity / vel_magnitude - velocity_cmd = Vector3( - float(velocity_cmd.x * scale), - float(velocity_cmd.y * scale), - float(velocity_cmd.z * scale), - ) - - self.last_velocity_cmd = velocity_cmd # type: ignore[assignment] - - # Compute angular velocity for orientation control - angular_velocity_cmd = self._compute_angular_velocity(grasp_pose.orientation, ee_pose) - - # Check if target reached - error_magnitude = np.linalg.norm([error.x, error.y, error.z]) - target_reached = bool(error_magnitude < self.target_tolerance) - self.last_target_reached = target_reached - - return velocity_cmd, angular_velocity_cmd, target_reached - - def _compute_angular_velocity(self, target_rot: Quaternion, current_pose: Pose) -> Vector3: - """ - Compute angular velocity commands for orientation control. - Uses quaternion error computation for better numerical stability. 
- - Args: - target_rot: Target orientation (quaternion) - current_pose: Current EE pose - - Returns: - Angular velocity command as Vector3 - """ - # Use quaternion error for better numerical stability - - # Convert to scipy Rotation objects - target_rot_scipy = R.from_quat([target_rot.x, target_rot.y, target_rot.z, target_rot.w]) - current_rot_scipy = R.from_quat( - [ - current_pose.orientation.x, - current_pose.orientation.y, - current_pose.orientation.z, - current_pose.orientation.w, - ] - ) - - # Compute rotation error: error = target * current^(-1) - error_rot = target_rot_scipy * current_rot_scipy.inv() - - # Convert to axis-angle representation for control - error_axis_angle = error_rot.as_rotvec() - - # Use axis-angle directly as angular velocity error (small angle approximation) - roll_error = error_axis_angle[0] - pitch_error = error_axis_angle[1] - yaw_error = error_axis_angle[2] - - self.last_rotation_error = Vector3(roll_error, pitch_error, yaw_error) # type: ignore[assignment] - - # Apply proportional control - angular_velocity = Vector3( - roll_error * self.rotation_gain, - pitch_error * self.rotation_gain, - yaw_error * self.rotation_gain, - ) - - # Limit angular velocity magnitude - ang_vel_magnitude = np.sqrt( - angular_velocity.x**2 + angular_velocity.y**2 + angular_velocity.z**2 - ) - if ang_vel_magnitude > self.max_angular_velocity: - scale = self.max_angular_velocity / ang_vel_magnitude - angular_velocity = Vector3( - angular_velocity.x * scale, angular_velocity.y * scale, angular_velocity.z * scale - ) - - self.last_angular_velocity_cmd = angular_velocity # type: ignore[assignment] - - return angular_velocity - - def create_status_overlay( - self, - image: np.ndarray, # type: ignore[type-arg] - current_target: Detection3D | None = None, - ) -> np.ndarray: # type: ignore[type-arg] - """ - Create PBVS status overlay on image. 
- - Args: - image: Input image - current_target: Current target object Detection3D (for display) - - Returns: - Image with PBVS status overlay - """ - return create_pbvs_visualization( - image, - current_target, - self.last_position_error, - self.last_target_reached, - "velocity_control", - ) diff --git a/dimos/manipulation/visual_servoing/utils.py b/dimos/manipulation/visual_servoing/utils.py deleted file mode 100644 index 5922739429..0000000000 --- a/dimos/manipulation/visual_servoing/utils.py +++ /dev/null @@ -1,801 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Any - -import cv2 -from dimos_lcm.vision_msgs import Detection2D, Detection3D -import numpy as np - -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.perception.common.utils import project_2d_points_to_3d -from dimos.perception.detection2d.utils import plot_results -from dimos.utils.transform_utils import ( - compose_transforms, - euler_to_quaternion, - get_distance, - matrix_to_pose, - offset_distance, - optical_to_robot_frame, - pose_to_matrix, - robot_to_optical_frame, - yaw_towards_point, -) - - -def match_detection_by_id( - detection_3d: Detection3D, detections_3d: list[Detection3D], detections_2d: list[Detection2D] -) -> Detection2D | None: - """ - Find the corresponding Detection2D for a given Detection3D. 
- - Args: - detection_3d: The Detection3D to match - detections_3d: List of all Detection3D objects - detections_2d: List of all Detection2D objects (must be 1:1 correspondence) - - Returns: - Corresponding Detection2D if found, None otherwise - """ - for i, det_3d in enumerate(detections_3d): - if det_3d.id == detection_3d.id and i < len(detections_2d): - return detections_2d[i] - return None - - -def transform_pose( - obj_pos: np.ndarray, # type: ignore[type-arg] - obj_orientation: np.ndarray, # type: ignore[type-arg] - transform_matrix: np.ndarray, # type: ignore[type-arg] - to_optical: bool = False, - to_robot: bool = False, -) -> Pose: - """ - Transform object pose with optional frame convention conversion. - - Args: - obj_pos: Object position [x, y, z] - obj_orientation: Object orientation [roll, pitch, yaw] in radians - transform_matrix: 4x4 transformation matrix from camera frame to desired frame - to_optical: If True, input is in robot frame → convert result to optical frame - to_robot: If True, input is in optical frame → convert to robot frame first - - Returns: - Object pose in desired frame as Pose - """ - # Convert euler angles to quaternion using utility function - euler_vector = Vector3(obj_orientation[0], obj_orientation[1], obj_orientation[2]) - obj_orientation_quat = euler_to_quaternion(euler_vector) - - input_pose = Pose( - position=Vector3(obj_pos[0], obj_pos[1], obj_pos[2]), orientation=obj_orientation_quat - ) - - # Apply input frame conversion based on flags - if to_robot: - # Input is in optical frame → convert to robot frame first - pose_for_transform = optical_to_robot_frame(input_pose) - else: - # Default or to_optical: use input pose as-is - pose_for_transform = input_pose - - # Create transformation matrix from pose (relative to camera) - T_camera_object = pose_to_matrix(pose_for_transform) - - # Use compose_transforms to combine transformations - T_desired_object = compose_transforms(transform_matrix, T_camera_object) - - # Convert 
back to pose - result_pose = matrix_to_pose(T_desired_object) - - # Apply output frame conversion based on flags - if to_optical: - # Input was robot frame → convert result to optical frame - desired_pose = robot_to_optical_frame(result_pose) - else: - # Default or to_robot: use result as-is - desired_pose = result_pose - - return desired_pose - - -def transform_points_3d( - points_3d: np.ndarray, # type: ignore[type-arg] - transform_matrix: np.ndarray, # type: ignore[type-arg] - to_optical: bool = False, - to_robot: bool = False, -) -> np.ndarray: # type: ignore[type-arg] - """ - Transform 3D points with optional frame convention conversion. - Applies the same transformation pipeline as transform_pose but for multiple points. - - Args: - points_3d: Nx3 array of 3D points [x, y, z] - transform_matrix: 4x4 transformation matrix from camera frame to desired frame - to_optical: If True, input is in robot frame → convert result to optical frame - to_robot: If True, input is in optical frame → convert to robot frame first - - Returns: - Nx3 array of transformed 3D points in desired frame - """ - if points_3d.size == 0: - return np.zeros((0, 3), dtype=np.float32) - - points_3d = np.asarray(points_3d) - if points_3d.ndim == 1: - points_3d = points_3d.reshape(1, -1) - - transformed_points = [] - - for point in points_3d: - input_point_pose = Pose( - position=Vector3(point[0], point[1], point[2]), - orientation=Quaternion(0.0, 0.0, 0.0, 1.0), # Identity quaternion - ) - - # Apply input frame conversion based on flags - if to_robot: - # Input is in optical frame → convert to robot frame first - pose_for_transform = optical_to_robot_frame(input_point_pose) - else: - # Default or to_optical: use input pose as-is - pose_for_transform = input_point_pose - - # Create transformation matrix from point pose (relative to camera) - T_camera_point = pose_to_matrix(pose_for_transform) - - # Use compose_transforms to combine transformations - T_desired_point = 
compose_transforms(transform_matrix, T_camera_point) - - # Convert back to pose - result_pose = matrix_to_pose(T_desired_point) - - # Apply output frame conversion based on flags - if to_optical: - # Input was robot frame → convert result to optical frame - desired_pose = robot_to_optical_frame(result_pose) - else: - # Default or to_robot: use result as-is - desired_pose = result_pose - - transformed_point = [ - desired_pose.position.x, - desired_pose.position.y, - desired_pose.position.z, - ] - transformed_points.append(transformed_point) - - return np.array(transformed_points, dtype=np.float32) - - -def select_points_from_depth( - depth_image: np.ndarray, # type: ignore[type-arg] - target_point: tuple[int, int], - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] - radius: int = 5, -) -> np.ndarray: # type: ignore[type-arg] - """ - Select points around a target point within a bounding box and project them to 3D. - - Args: - depth_image: Depth image in meters (H, W) - target_point: (x, y) target point coordinates - radius: Half-width of the bounding box (so bbox size is radius*2 x radius*2) - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - - Returns: - Nx3 array of 3D points (X, Y, Z) in camera frame - """ - x_target, y_target = target_point - height, width = depth_image.shape - - x_min = max(0, x_target - radius) - x_max = min(width, x_target + radius) - y_min = max(0, y_target - radius) - y_max = min(height, y_target + radius) - - # Create coordinate grids for the bounding box (vectorized) - y_coords, x_coords = np.meshgrid(range(y_min, y_max), range(x_min, x_max), indexing="ij") - - # Flatten to get all coordinate pairs - x_flat = x_coords.flatten() - y_flat = y_coords.flatten() - - # Extract corresponding depth values using advanced indexing - depth_flat = depth_image[y_flat, x_flat] - - valid_mask = (depth_flat > 0) & np.isfinite(depth_flat) - - if not np.any(valid_mask): - return np.zeros((0, 3), 
dtype=np.float32) - - points_2d = np.column_stack([x_flat[valid_mask], y_flat[valid_mask]]).astype(np.float32) - depth_values = depth_flat[valid_mask].astype(np.float32) - - points_3d = project_2d_points_to_3d(points_2d, depth_values, camera_intrinsics) - - return points_3d - - -def update_target_grasp_pose( - target_pose: Pose, ee_pose: Pose, grasp_distance: float = 0.0, grasp_pitch_degrees: float = 45.0 -) -> Pose | None: - """ - Update target grasp pose based on current target pose and EE pose. - - Args: - target_pose: Target pose to grasp - ee_pose: Current end-effector pose - grasp_distance: Distance to maintain from target (pregrasp or grasp distance) - grasp_pitch_degrees: Grasp pitch angle in degrees (default 90° for top-down) - - Returns: - Target grasp pose or None if target is invalid - """ - - target_pos = target_pose.position - - # Calculate orientation pointing from target towards EE - yaw_to_ee = yaw_towards_point(target_pos, ee_pose.position) - - # Create target pose with proper orientation - # Convert grasp pitch from degrees to radians with mapping: - # 0° (level) -> π/2 (1.57 rad), 90° (top-down) -> π (3.14 rad) - pitch_radians = 1.57 + np.radians(grasp_pitch_degrees) - - # Convert euler angles to quaternion using utility function - euler = Vector3(0.0, pitch_radians, yaw_to_ee) # roll=0, pitch=mapped, yaw=calculated - target_orientation = euler_to_quaternion(euler) - - updated_pose = Pose(target_pos, target_orientation) - - if grasp_distance > 0.0: - return offset_distance(updated_pose, grasp_distance) - else: - return updated_pose - - -def is_target_reached(target_pose: Pose, current_pose: Pose, tolerance: float = 0.01) -> bool: - """ - Check if the target pose has been reached within tolerance. 
- - Args: - target_pose: Target pose to reach - current_pose: Current pose (e.g., end-effector pose) - tolerance: Distance threshold for considering target reached (meters, default 0.01 = 1cm) - - Returns: - True if target is reached within tolerance, False otherwise - """ - # Calculate position error using distance utility - error_magnitude = get_distance(target_pose, current_pose) - return error_magnitude < tolerance - - -@dataclass -class ObjectMatchResult: - """Result of object matching with confidence metrics.""" - - matched_object: Detection3D | None - confidence: float - distance: float - size_similarity: float - is_valid_match: bool - - -def calculate_object_similarity( - target_obj: Detection3D, - candidate_obj: Detection3D, - distance_weight: float = 0.6, - size_weight: float = 0.4, -) -> tuple[float, float, float]: - """ - Calculate comprehensive similarity between two objects. - - Args: - target_obj: Target Detection3D object - candidate_obj: Candidate Detection3D object - distance_weight: Weight for distance component (0-1) - size_weight: Weight for size component (0-1) - - Returns: - Tuple of (total_similarity, distance_m, size_similarity) - """ - # Extract positions - target_pos = target_obj.bbox.center.position - candidate_pos = candidate_obj.bbox.center.position - - target_xyz = np.array([target_pos.x, target_pos.y, target_pos.z]) - candidate_xyz = np.array([candidate_pos.x, candidate_pos.y, candidate_pos.z]) - - # Calculate Euclidean distance - distance = np.linalg.norm(target_xyz - candidate_xyz) - distance_similarity = 1.0 / (1.0 + distance) # Exponential decay - - # Calculate size similarity by comparing each dimension individually - size_similarity = 1.0 # Default if no size info - target_size = target_obj.bbox.size - candidate_size = candidate_obj.bbox.size - - if target_size and candidate_size: - # Extract dimensions - target_dims = [target_size.x, target_size.y, target_size.z] - candidate_dims = [candidate_size.x, candidate_size.y, 
candidate_size.z] - - # Calculate similarity for each dimension pair - dim_similarities = [] - for target_dim, candidate_dim in zip(target_dims, candidate_dims, strict=False): - if target_dim == 0.0 and candidate_dim == 0.0: - dim_similarities.append(1.0) # Both dimensions are zero - elif target_dim == 0.0 or candidate_dim == 0.0: - dim_similarities.append(0.0) # One dimension is zero, other is not - else: - # Calculate similarity as min/max ratio - max_dim = max(target_dim, candidate_dim) - min_dim = min(target_dim, candidate_dim) - dim_similarity = min_dim / max_dim if max_dim > 0 else 0.0 - dim_similarities.append(dim_similarity) - - # Return average similarity across all dimensions - size_similarity = np.mean(dim_similarities) if dim_similarities else 0.0 # type: ignore[assignment] - - # Weighted combination - total_similarity = distance_weight * distance_similarity + size_weight * size_similarity - - return total_similarity, distance, size_similarity # type: ignore[return-value] - - -def find_best_object_match( - target_obj: Detection3D, - candidates: list[Detection3D], - max_distance: float = 0.1, - min_size_similarity: float = 0.4, - distance_weight: float = 0.7, - size_weight: float = 0.3, -) -> ObjectMatchResult: - """ - Find the best matching object from candidates using distance and size criteria. 
- - Args: - target_obj: Target Detection3D to match against - candidates: List of candidate Detection3D objects - max_distance: Maximum allowed distance for valid match (meters) - min_size_similarity: Minimum size similarity for valid match (0-1) - distance_weight: Weight for distance in similarity calculation - size_weight: Weight for size in similarity calculation - - Returns: - ObjectMatchResult with best match and confidence metrics - """ - if not candidates or not target_obj.bbox or not target_obj.bbox.center: - return ObjectMatchResult(None, 0.0, float("inf"), 0.0, False) - - best_match = None - best_confidence = 0.0 - best_distance = float("inf") - best_size_sim = 0.0 - - for candidate in candidates: - if not candidate.bbox or not candidate.bbox.center: - continue - - similarity, distance, size_sim = calculate_object_similarity( - target_obj, candidate, distance_weight, size_weight - ) - - # Check validity constraints - is_valid = distance <= max_distance and size_sim >= min_size_similarity - - if is_valid and similarity > best_confidence: - best_match = candidate - best_confidence = similarity - best_distance = distance - best_size_sim = size_sim - - return ObjectMatchResult( - matched_object=best_match, - confidence=best_confidence, - distance=best_distance, - size_similarity=best_size_sim, - is_valid_match=best_match is not None, - ) - - -def parse_zed_pose(zed_pose_data: dict[str, Any]) -> Pose | None: - """ - Parse ZED pose data dictionary into a Pose object. 
- - Args: - zed_pose_data: Dictionary from ZEDCamera.get_pose() containing: - - position: [x, y, z] in meters - - rotation: [x, y, z, w] quaternion - - euler_angles: [roll, pitch, yaw] in radians - - valid: Whether pose is valid - - Returns: - Pose object with position and orientation, or None if invalid - """ - if not zed_pose_data or not zed_pose_data.get("valid", False): - return None - - # Extract position - position = zed_pose_data.get("position", [0, 0, 0]) - pos_vector = Vector3(position[0], position[1], position[2]) - - quat = zed_pose_data["rotation"] - orientation = Quaternion(quat[0], quat[1], quat[2], quat[3]) - return Pose(position=pos_vector, orientation=orientation) - - -def estimate_object_depth( - depth_image: np.ndarray, # type: ignore[type-arg] - segmentation_mask: np.ndarray | None, # type: ignore[type-arg] - bbox: list[float], -) -> float: - """ - Estimate object depth dimension using segmentation mask and depth data. - Optimized for real-time performance. - - Args: - depth_image: Depth image in meters - segmentation_mask: Binary segmentation mask for the object - bbox: Bounding box [x1, y1, x2, y2] - - Returns: - Estimated object depth in meters - """ - x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) - - # Extract depth ROI once - roi_depth = depth_image[y1:y2, x1:x2] - - if segmentation_mask is not None and segmentation_mask.size > 0: - # Extract mask ROI efficiently - mask_roi = ( - segmentation_mask[y1:y2, x1:x2] - if segmentation_mask.shape != roi_depth.shape - else segmentation_mask - ) - - # Fast mask application using boolean indexing - valid_mask = mask_roi > 0 - if np.sum(valid_mask) > 10: # Early exit if not enough points - masked_depths = roi_depth[valid_mask] - - # Fast percentile calculation using numpy's optimized functions - depth_90 = np.percentile(masked_depths, 90) - depth_10 = np.percentile(masked_depths, 10) - depth_range = depth_90 - depth_10 - - # Clamp to reasonable bounds with single operation - 
return np.clip(depth_range, 0.02, 0.5) # type: ignore[no-any-return] - - # Fast fallback using area calculation - bbox_area = (x2 - x1) * (y2 - y1) - - # Vectorized area-based estimation - if bbox_area > 10000: - return 0.15 - elif bbox_area > 5000: - return 0.10 - else: - return 0.05 - - -# ============= Visualization Functions ============= - - -def create_manipulation_visualization( # type: ignore[no-untyped-def] - rgb_image: np.ndarray, # type: ignore[type-arg] - feedback, - detection_3d_array=None, - detection_2d_array=None, -) -> np.ndarray: # type: ignore[type-arg] - """ - Create simple visualization for manipulation class using feedback. - - Args: - rgb_image: RGB image array - feedback: Feedback object containing all state information - detection_3d_array: Optional 3D detections for object visualization - detection_2d_array: Optional 2D detections for object visualization - - Returns: - BGR image with visualization overlays - """ - # Convert to BGR for OpenCV - viz = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Draw detections if available - if detection_3d_array and detection_2d_array: - # Extract 2D bboxes - bboxes_2d = [] - for det_2d in detection_2d_array.detections: - if det_2d.bbox: - x1 = det_2d.bbox.center.position.x - det_2d.bbox.size_x / 2 - y1 = det_2d.bbox.center.position.y - det_2d.bbox.size_y / 2 - x2 = det_2d.bbox.center.position.x + det_2d.bbox.size_x / 2 - y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2 - bboxes_2d.append([x1, y1, x2, y2]) - - # Draw basic detections - rgb_with_detections = visualize_detections_3d( - rgb_image, detection_3d_array.detections, show_coordinates=True, bboxes_2d=bboxes_2d - ) - viz = cv2.cvtColor(rgb_with_detections, cv2.COLOR_RGB2BGR) - - # Add manipulation status overlay - status_y = 30 - cv2.putText( - viz, - "Eye-in-Hand Visual Servoing", - (10, status_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 255, 255), - 2, - ) - - # Stage information - stage_text = f"Stage: 
{feedback.grasp_stage.value.upper()}" - stage_color = { - "idle": (100, 100, 100), - "pre_grasp": (0, 255, 255), - "grasp": (0, 255, 0), - "close_and_retract": (255, 0, 255), - "place": (0, 150, 255), - "retract": (255, 150, 0), - }.get(feedback.grasp_stage.value, (255, 255, 255)) - - cv2.putText( - viz, - stage_text, - (10, status_y + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - stage_color, - 1, - ) - - # Target tracking status - if feedback.target_tracked: - cv2.putText( - viz, - "Target: TRACKED", - (10, status_y + 45), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (0, 255, 0), - 1, - ) - elif feedback.grasp_stage.value != "idle": - cv2.putText( - viz, - "Target: LOST", - (10, status_y + 45), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (0, 0, 255), - 1, - ) - - # Waiting status - if feedback.waiting_for_reach: - cv2.putText( - viz, - "Status: WAITING FOR ROBOT", - (10, status_y + 65), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 255, 0), - 1, - ) - - # Overall result - if feedback.success is not None: - result_text = "Pick & Place: SUCCESS" if feedback.success else "Pick & Place: FAILED" - result_color = (0, 255, 0) if feedback.success else (0, 0, 255) - cv2.putText( - viz, - result_text, - (10, status_y + 85), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - result_color, - 2, - ) - - # Control hints (bottom of image) - hint_text = "Click object to grasp | s=STOP | r=RESET | g=RELEASE" - cv2.putText( - viz, - hint_text, - (10, viz.shape[0] - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (200, 200, 200), - 1, - ) - - return viz - - -def create_pbvs_visualization( # type: ignore[no-untyped-def] - image: np.ndarray, # type: ignore[type-arg] - current_target=None, - position_error=None, - target_reached: bool = False, - grasp_stage: str = "idle", -) -> np.ndarray: # type: ignore[type-arg] - """ - Create simple PBVS visualization overlay. 
- - Args: - image: Input image (RGB or BGR) - current_target: Current target Detection3D - position_error: Position error Vector3 - target_reached: Whether target is reached - grasp_stage: Current grasp stage string - - Returns: - Image with PBVS overlay - """ - viz = image.copy() - - # Only show PBVS info if we have a target - if current_target is None: - return viz - - # Create status panel at bottom - height, width = viz.shape[:2] - panel_height = 100 - panel_y = height - panel_height - - # Semi-transparent overlay - overlay = viz.copy() - cv2.rectangle(overlay, (0, panel_y), (width, height), (0, 0, 0), -1) - viz = cv2.addWeighted(viz, 0.7, overlay, 0.3, 0) - - # PBVS Status - y_offset = panel_y + 20 - cv2.putText( - viz, - "PBVS Control", - (10, y_offset), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 255, 255), - 2, - ) - - # Position error - if position_error: - error_mag = np.linalg.norm([position_error.x, position_error.y, position_error.z]) - error_text = f"Error: {error_mag * 100:.1f}cm" - error_color = (0, 255, 0) if target_reached else (0, 255, 255) - cv2.putText( - viz, - error_text, - (10, y_offset + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - error_color, - 1, - ) - - # Stage - cv2.putText( - viz, - f"Stage: {grasp_stage}", - (10, y_offset + 45), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 150, 255), - 1, - ) - - # Target reached indicator - if target_reached: - cv2.putText( - viz, - "TARGET REACHED", - (width - 150, y_offset + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 255, 0), - 2, - ) - - return viz - - -def visualize_detections_3d( - rgb_image: np.ndarray, # type: ignore[type-arg] - detections: list[Detection3D], - show_coordinates: bool = True, - bboxes_2d: list[list[float]] | None = None, -) -> np.ndarray: # type: ignore[type-arg] - """ - Visualize detections with 3D position overlay next to bounding boxes. 
- - Args: - rgb_image: Original RGB image - detections: List of Detection3D objects - show_coordinates: Whether to show 3D coordinates next to bounding boxes - bboxes_2d: Optional list of 2D bounding boxes corresponding to detections - - Returns: - Visualization image - """ - if not detections: - return rgb_image.copy() - - # If no 2D bboxes provided, skip visualization - if bboxes_2d is None: - return rgb_image.copy() - - # Extract data for plot_results function - bboxes = bboxes_2d - track_ids = [int(det.id) if det.id.isdigit() else i for i, det in enumerate(detections)] - class_ids = [i for i in range(len(detections))] - confidences = [ - det.results[0].hypothesis.score if det.results_length > 0 else 0.0 for det in detections - ] - names = [ - det.results[0].hypothesis.class_id if det.results_length > 0 else "unknown" - for det in detections - ] - - # Use plot_results for basic visualization - viz = plot_results(rgb_image, bboxes, track_ids, class_ids, confidences, names) - - # Add 3D position coordinates if requested - if show_coordinates and bboxes_2d is not None: - for i, det in enumerate(detections): - if det.bbox and det.bbox.center and i < len(bboxes_2d): - position = det.bbox.center.position - bbox = bboxes_2d[i] - - pos_xyz = np.array([position.x, position.y, position.z]) - - # Get bounding box coordinates - _x1, y1, x2, _y2 = map(int, bbox) - - # Add position text next to bounding box (top-right corner) - pos_text = f"({pos_xyz[0]:.2f}, {pos_xyz[1]:.2f}, {pos_xyz[2]:.2f})" - text_x = x2 + 5 # Right edge of bbox + small offset - text_y = y1 + 15 # Top edge of bbox + small offset - - # Add background rectangle for better readability - text_size = cv2.getTextSize(pos_text, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)[0] - cv2.rectangle( - viz, - (text_x - 2, text_y - text_size[1] - 2), - (text_x + text_size[0] + 2, text_y + 2), - (0, 0, 0), - -1, - ) - - cv2.putText( - viz, - pos_text, - (text_x, text_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (255, 255, 255), - 1, - ) 
- - return viz # type: ignore[no-any-return] diff --git a/dimos/msgs/geometry_msgs/Quaternion.py b/dimos/msgs/geometry_msgs/Quaternion.py index d19436d441..02c9592ea6 100644 --- a/dimos/msgs/geometry_msgs/Quaternion.py +++ b/dimos/msgs/geometry_msgs/Quaternion.py @@ -138,6 +138,20 @@ def from_euler(cls, vector: Vector3) -> Quaternion: return cls(x, y, z, w) + @classmethod + def from_rotation_matrix(cls, matrix: np.ndarray) -> Quaternion: # type: ignore[type-arg] + """Convert a 3x3 rotation matrix to quaternion. + + Args: + matrix: 3x3 rotation matrix (numpy array) + + Returns: + Quaternion representation + """ + rotation = R.from_matrix(matrix) + quat = rotation.as_quat() # Returns [x, y, z, w] + return cls(quat[0], quat[1], quat[2], quat[3]) + def to_euler(self) -> Vector3: """Convert quaternion to Euler angles (roll, pitch, yaw) in radians. diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py index 26ed141867..de3e7abeca 100644 --- a/dimos/msgs/sensor_msgs/Image.py +++ b/dimos/msgs/sensor_msgs/Image.py @@ -310,6 +310,10 @@ def to_cupy(self) -> Image: def to_opencv(self) -> np.ndarray: # type: ignore[type-arg] return self._impl.to_opencv() + def as_numpy(self) -> np.ndarray: # type: ignore[type-arg] + """Get image data as numpy array in RGB format.""" + return np.asarray(self.data) + def to_rgb(self) -> Image: return Image(self._impl.to_rgb()) @@ -352,6 +356,20 @@ def sharpness(self) -> float: """Return sharpness score.""" return self._impl.sharpness() + def to_depth_meters(self) -> Image: + """Return a depth image normalized to meters as float32.""" + depth_cv = self.to_opencv() + fmt = self.format + + if fmt == ImageFormat.DEPTH16: + depth_cv = depth_cv.astype(np.float32) / 1000.0 + fmt = ImageFormat.DEPTH + elif depth_cv.dtype != np.float32: + depth_cv = depth_cv.astype(np.float32) + fmt = ImageFormat.DEPTH if fmt == ImageFormat.DEPTH else fmt + + return Image.from_numpy(depth_cv, format=fmt, frame_id=self.frame_id, ts=self.ts) + def 
save(self, filepath: str) -> bool: return self._impl.save(filepath) diff --git a/dimos/msgs/sensor_msgs/PointCloud2.py b/dimos/msgs/sensor_msgs/PointCloud2.py index 772ddad7ab..d68c62f51d 100644 --- a/dimos/msgs/sensor_msgs/PointCloud2.py +++ b/dimos/msgs/sensor_msgs/PointCloud2.py @@ -354,6 +354,8 @@ def transform(self, tf: Transform) -> PointCloud2: def voxel_downsample(self, voxel_size: float = 0.025) -> PointCloud2: """Downsample the pointcloud with a voxel grid.""" + if voxel_size <= 0: + return self if len(self.pointcloud.points) < 20: return self downsampled = self._pcd_tensor.voxel_down_sample(voxel_size) diff --git a/dimos/msgs/vision_msgs/Detection2D.py b/dimos/msgs/vision_msgs/Detection2D.py new file mode 100644 index 0000000000..aa957f8061 --- /dev/null +++ b/dimos/msgs/vision_msgs/Detection2D.py @@ -0,0 +1,27 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from dimos_lcm.vision_msgs.Detection2D import Detection2D as LCMDetection2D + +from dimos.types.timestamped import to_timestamp + + +class Detection2D(LCMDetection2D): # type: ignore[misc] + msg_name = "vision_msgs.Detection2D" + + # for _get_field_type() to work when decoding in _decode_one() + __annotations__ = LCMDetection2D.__annotations__ + + @property + def ts(self) -> float: + return to_timestamp(self.header.stamp) diff --git a/dimos/msgs/vision_msgs/Detection3D.py b/dimos/msgs/vision_msgs/Detection3D.py new file mode 100644 index 0000000000..e074ecb0b1 --- /dev/null +++ b/dimos/msgs/vision_msgs/Detection3D.py @@ -0,0 +1,27 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dimos_lcm.vision_msgs.Detection3D import Detection3D as LCMDetection3D + +from dimos.types.timestamped import to_timestamp + + +class Detection3D(LCMDetection3D): # type: ignore[misc] + msg_name = "vision_msgs.Detection3D" + + # for _get_field_type() to work when decoding in _decode_one() + __annotations__ = LCMDetection3D.__annotations__ + + @property + def ts(self) -> float: + return to_timestamp(self.header.stamp) diff --git a/dimos/msgs/vision_msgs/Detection3DArray.py b/dimos/msgs/vision_msgs/Detection3DArray.py index 59905cad4c..2eba82204d 100644 --- a/dimos/msgs/vision_msgs/Detection3DArray.py +++ b/dimos/msgs/vision_msgs/Detection3DArray.py @@ -11,11 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from dimos_lcm.vision_msgs.Detection3DArray import Detection3DArray as LCMDetection3DArray -from dimos_lcm.vision_msgs.Detection3DArray import ( - Detection3DArray as LCMDetection3DArray, -) +from dimos.types.timestamped import to_timestamp class Detection3DArray(LCMDetection3DArray): # type: ignore[misc] msg_name = "vision_msgs.Detection3DArray" + + # for _get_field_type() to work when decoding in _decode_one() + __annotations__ = LCMDetection3DArray.__annotations__ + + @property + def ts(self) -> float: + return to_timestamp(self.header.stamp) diff --git a/dimos/msgs/vision_msgs/__init__.py b/dimos/msgs/vision_msgs/__init__.py index af170cbfab..0f1c9c8dc1 100644 --- a/dimos/msgs/vision_msgs/__init__.py +++ b/dimos/msgs/vision_msgs/__init__.py @@ -1,6 +1,15 @@ from .BoundingBox2DArray import BoundingBox2DArray from .BoundingBox3DArray import BoundingBox3DArray +from .Detection2D import Detection2D from .Detection2DArray import Detection2DArray +from .Detection3D import Detection3D from .Detection3DArray import Detection3DArray -__all__ = ["BoundingBox2DArray", "BoundingBox3DArray", "Detection2DArray", "Detection3DArray"] +__all__ = [ + "BoundingBox2DArray", + "BoundingBox3DArray", + "Detection2D", + "Detection2DArray", + "Detection3D", + "Detection3DArray", +] diff --git a/dimos/perception/common/__init__.py b/dimos/perception/common/__init__.py index 67481bc449..16281fe0b6 100644 --- a/dimos/perception/common/__init__.py +++ b/dimos/perception/common/__init__.py @@ -1,3 +1 @@ -from .detection2d_tracker import get_tracked_results, target2dTracker -from .ibvs import * from .utils import * diff --git a/dimos/perception/common/detection2d_tracker.py b/dimos/perception/common/detection2d_tracker.py deleted file mode 100644 index 9ff36be8a1..0000000000 --- a/dimos/perception/common/detection2d_tracker.py +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright 2025-2026 
Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import deque -from collections.abc import Sequence - -import numpy as np - - -def compute_iou(bbox1, bbox2): # type: ignore[no-untyped-def] - """ - Compute Intersection over Union (IoU) of two bounding boxes. - Each bbox is [x1, y1, x2, y2]. - """ - x1 = max(bbox1[0], bbox2[0]) - y1 = max(bbox1[1], bbox2[1]) - x2 = min(bbox1[2], bbox2[2]) - y2 = min(bbox1[3], bbox2[3]) - - inter_area = max(0, x2 - x1) * max(0, y2 - y1) - area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) - area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) - - union_area = area1 + area2 - inter_area - if union_area == 0: - return 0 - return inter_area / union_area - - -def get_tracked_results(tracked_targets): # type: ignore[no-untyped-def] - """ - Extract tracked results from a list of target2d objects. - - Args: - tracked_targets (list[target2d]): List of target2d objects (published targets) - returned by the tracker's update() function. - - Returns: - tuple: (tracked_masks, tracked_bboxes, tracked_track_ids, tracked_probs, tracked_names) - where each is a list of the corresponding attribute from each target. - """ - tracked_masks = [] - tracked_bboxes = [] - tracked_track_ids = [] - tracked_probs = [] - tracked_names = [] - - for target in tracked_targets: - # Extract the latest values stored in each target. 
- tracked_masks.append(target.latest_mask) - tracked_bboxes.append(target.latest_bbox) - # Here we use the most recent detection's track ID. - tracked_track_ids.append(target.target_id) - # Use the latest probability from the history. - tracked_probs.append(target.score) - # Use the stored name (if any). If not available, you can use a default value. - tracked_names.append(target.name) - - return tracked_masks, tracked_bboxes, tracked_track_ids, tracked_probs, tracked_names - - -class target2d: - """ - Represents a tracked 2D target. - Stores the latest bounding box and mask along with a short history of track IDs, - detection probabilities, and computed texture values. - """ - - def __init__( # type: ignore[no-untyped-def] - self, - initial_mask, - initial_bbox, - track_id, - prob: float, - name: str, - texture_value, - target_id, - history_size: int = 10, - ) -> None: - """ - Args: - initial_mask (torch.Tensor): Latest segmentation mask. - initial_bbox (list): Bounding box in [x1, y1, x2, y2] format. - track_id (int): Detection’s track ID (may be -1 if not provided). - prob (float): Detection probability. - name (str): Object class name. - texture_value (float): Computed average texture value for this detection. - target_id (int): Unique identifier assigned by the tracker. - history_size (int): Maximum number of frames to keep in the history. - """ - self.target_id = target_id - self.latest_mask = initial_mask - self.latest_bbox = initial_bbox - self.name = name - self.score = 1.0 - - self.track_id = track_id - self.probs_history = deque(maxlen=history_size) # type: ignore[var-annotated] - self.texture_history = deque(maxlen=history_size) # type: ignore[var-annotated] - - self.frame_count = deque(maxlen=history_size) # type: ignore[var-annotated] # Total frames this target has been seen. - self.missed_frames = 0 # Consecutive frames when no detection was assigned. 
- self.history_size = history_size - - def update(self, mask, bbox, track_id, prob: float, name: str, texture_value) -> None: # type: ignore[no-untyped-def] - """ - Update the target with a new detection. - """ - self.latest_mask = mask - self.latest_bbox = bbox - self.name = name - - self.track_id = track_id - self.probs_history.append(prob) - self.texture_history.append(texture_value) - - self.frame_count.append(1) - self.missed_frames = 0 - - def mark_missed(self) -> None: - """ - Increment the count of consecutive frames where this target was not updated. - """ - self.missed_frames += 1 - self.frame_count.append(0) - - def compute_score( # type: ignore[no-untyped-def] - self, - frame_shape, - min_area_ratio, - max_area_ratio, - texture_range=(0.0, 1.0), - border_safe_distance: int = 50, - weights=None, - ): - """ - Compute a combined score for the target based on several factors. - - Factors: - - **Detection probability:** Average over recent frames. - - **Temporal stability:** How consistently the target has appeared. - - **Texture quality:** Normalized using the provided min and max values. - - **Border proximity:** Computed from the minimum distance from the bbox to the frame edges. - - **Size:** How the object's area (relative to the frame) compares to acceptable bounds. - - Args: - frame_shape (tuple): (height, width) of the frame. - min_area_ratio (float): Minimum acceptable ratio (bbox area / frame area). - max_area_ratio (float): Maximum acceptable ratio. - texture_range (tuple): (min_texture, max_texture) expected values. - border_safe_distance (float): Distance (in pixels) considered safe from the border. - weights (dict): Weights for each component. Expected keys: - 'prob', 'temporal', 'texture', 'border', and 'size'. - - Returns: - float: The combined (normalized) score in the range [0, 1]. - """ - # Default weights if none provided. 
- if weights is None: - weights = {"prob": 1.0, "temporal": 1.0, "texture": 1.0, "border": 1.0, "size": 1.0} - - h, w = frame_shape - x1, y1, x2, y2 = self.latest_bbox - bbox_area = (x2 - x1) * (y2 - y1) - frame_area = w * h - area_ratio = bbox_area / frame_area - - # Detection probability factor. - avg_prob = np.mean(self.probs_history) - # Temporal stability factor: normalized by history size. - temporal_stability = np.mean(self.frame_count) - # Texture factor: normalize average texture using the provided range. - avg_texture = np.mean(self.texture_history) if self.texture_history else 0.0 - min_texture, max_texture = texture_range - if max_texture == min_texture: - normalized_texture = avg_texture - else: - normalized_texture = (avg_texture - min_texture) / (max_texture - min_texture) - normalized_texture = max(0.0, min(normalized_texture, 1.0)) - - # Border factor: compute the minimum distance from the bbox to any frame edge. - left_dist = x1 - top_dist = y1 - right_dist = w - x2 - min_border_dist = min(left_dist, top_dist, right_dist) - # Normalize the border distance: full score (1.0) if at least border_safe_distance away. - border_factor = min(1.0, min_border_dist / border_safe_distance) - - # Size factor: penalize objects that are too small or too big. - if area_ratio < min_area_ratio: - size_factor = area_ratio / min_area_ratio - elif area_ratio > max_area_ratio: - # Here we compute a linear penalty if the area exceeds max_area_ratio. - if 1 - max_area_ratio > 0: - size_factor = max(0, (1 - area_ratio) / (1 - max_area_ratio)) - else: - size_factor = 0.0 - else: - size_factor = 1.0 - - # Combine factors using a weighted sum (each factor is assumed in [0, 1]). 
- w_prob = weights.get("prob", 1.0) - w_temporal = weights.get("temporal", 1.0) - w_texture = weights.get("texture", 1.0) - w_border = weights.get("border", 1.0) - w_size = weights.get("size", 1.0) - total_weight = w_prob + w_temporal + w_texture + w_border + w_size - - # print(f"track_id: {self.target_id}, avg_prob: {avg_prob:.2f}, temporal_stability: {temporal_stability:.2f}, normalized_texture: {normalized_texture:.2f}, border_factor: {border_factor:.2f}, size_factor: {size_factor:.2f}") - - final_score = ( - w_prob * avg_prob - + w_temporal * temporal_stability - + w_texture * normalized_texture - + w_border * border_factor - + w_size * size_factor - ) / total_weight - - self.score = final_score - - return final_score - - -class target2dTracker: - """ - Tracker that maintains a history of targets across frames. - New segmentation detections (frame, masks, bboxes, track_ids, probabilities, - and computed texture values) are matched to existing targets or used to create new ones. - - The tracker uses a scoring system that incorporates: - - **Detection probability** - - **Temporal stability** - - **Texture quality** (normalized within a specified range) - - **Proximity to image borders** (a continuous penalty based on the distance) - - **Object size** relative to the frame - - Targets are published if their score exceeds the start threshold and are removed if their score - falls below the stop threshold or if they are missed for too many consecutive frames. - """ - - def __init__( # type: ignore[no-untyped-def] - self, - history_size: int = 10, - score_threshold_start: float = 0.5, - score_threshold_stop: float = 0.3, - min_frame_count: int = 10, - max_missed_frames: int = 3, - min_area_ratio: float = 0.001, - max_area_ratio: float = 0.1, - texture_range=(0.0, 1.0), - border_safe_distance: int = 50, - weights=None, - ) -> None: - """ - Args: - history_size (int): Maximum history length (number of frames) per target. 
- score_threshold_start (float): Minimum score for a target to be published. - score_threshold_stop (float): If a target’s score falls below this, it is removed. - min_frame_count (int): Minimum number of frames a target must be seen to be published. - max_missed_frames (int): Maximum consecutive frames a target can be missing before deletion. - min_area_ratio (float): Minimum acceptable bbox area relative to the frame. - max_area_ratio (float): Maximum acceptable bbox area relative to the frame. - texture_range (tuple): (min_texture, max_texture) expected values. - border_safe_distance (float): Distance (in pixels) considered safe from the border. - weights (dict): Weights for the scoring components (keys: 'prob', 'temporal', - 'texture', 'border', 'size'). - """ - self.history_size = history_size - self.score_threshold_start = score_threshold_start - self.score_threshold_stop = score_threshold_stop - self.min_frame_count = min_frame_count - self.max_missed_frames = max_missed_frames - self.min_area_ratio = min_area_ratio - self.max_area_ratio = max_area_ratio - self.texture_range = texture_range - self.border_safe_distance = border_safe_distance - # Default weights if none are provided. - if weights is None: - weights = {"prob": 1.0, "temporal": 1.0, "texture": 1.0, "border": 1.0, "size": 1.0} - self.weights = weights - - self.targets = {} # type: ignore[var-annotated] # Dictionary mapping target_id -> target2d instance. - self.next_target_id = 0 - - def update( # type: ignore[no-untyped-def] - self, - frame, - masks, - bboxes, - track_ids, - probs: Sequence[float], - names: Sequence[str], - texture_values, - ): - """ - Update the tracker with new detections from the current frame. - - Args: - frame (np.ndarray): Current BGR frame. - masks (list[torch.Tensor]): List of segmentation masks. - bboxes (list): List of bounding boxes [x1, y1, x2, y2]. - track_ids (list): List of detection track IDs. - probs (list): List of detection probabilities. 
- names (list): List of class names. - texture_values (list): List of computed texture values. - - Returns: - published_targets (list[target2d]): Targets that are active and have scores above - the start threshold. - """ - updated_target_ids = set() - frame_shape = frame.shape[:2] # (height, width) - - # For each detection, try to match with an existing target. - for mask, bbox, det_tid, prob, name, texture in zip( - masks, bboxes, track_ids, probs, names, texture_values, strict=False - ): - matched_target = None - - # First, try matching by detection track ID if valid. - if det_tid != -1: - for target in self.targets.values(): - if target.track_id == det_tid: - matched_target = target - break - - # Otherwise, try matching using IoU. - if matched_target is None: - best_iou = 0 - for target in self.targets.values(): - iou = compute_iou(bbox, target.latest_bbox) # type: ignore[no-untyped-call] - if iou > 0.5 and iou > best_iou: - best_iou = iou - matched_target = target - - # Update existing target or create a new one. - if matched_target is not None: - matched_target.update(mask, bbox, det_tid, prob, name, texture) - updated_target_ids.add(matched_target.target_id) - else: - new_target = target2d( - mask, bbox, det_tid, prob, name, texture, self.next_target_id, self.history_size - ) - self.targets[self.next_target_id] = new_target - updated_target_ids.add(self.next_target_id) - self.next_target_id += 1 - - # Mark targets that were not updated. - for target_id, target in list(self.targets.items()): - if target_id not in updated_target_ids: - target.mark_missed() - if target.missed_frames > self.max_missed_frames: - del self.targets[target_id] - continue # Skip further checks for this target. - # Remove targets whose score falls below the stop threshold. 
- score = target.compute_score( - frame_shape, - self.min_area_ratio, - self.max_area_ratio, - texture_range=self.texture_range, - border_safe_distance=self.border_safe_distance, - weights=self.weights, - ) - if score < self.score_threshold_stop: - del self.targets[target_id] - - # Publish targets with scores above the start threshold. - published_targets = [] - for target in self.targets.values(): - score = target.compute_score( - frame_shape, - self.min_area_ratio, - self.max_area_ratio, - texture_range=self.texture_range, - border_safe_distance=self.border_safe_distance, - weights=self.weights, - ) - if ( - score >= self.score_threshold_start - and sum(target.frame_count) >= self.min_frame_count - and target.missed_frames <= 5 - ): - published_targets.append(target) - - return published_targets diff --git a/dimos/perception/common/export_tensorrt.py b/dimos/perception/common/export_tensorrt.py deleted file mode 100644 index ca671e36f2..0000000000 --- a/dimos/perception/common/export_tensorrt.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -from ultralytics import YOLO, FastSAM # type: ignore[attr-defined, import-not-found] - - -def parse_args(): # type: ignore[no-untyped-def] - parser = argparse.ArgumentParser(description="Export YOLO/FastSAM models to different formats") - parser.add_argument("--model_path", type=str, required=True, help="Path to the model weights") - parser.add_argument( - "--model_type", - type=str, - choices=["yolo", "fastsam"], - required=True, - help="Type of model to export", - ) - parser.add_argument( - "--precision", - type=str, - choices=["fp32", "fp16", "int8"], - default="fp32", - help="Precision for export", - ) - parser.add_argument( - "--format", type=str, choices=["onnx", "engine"], default="onnx", help="Export format" - ) - return parser.parse_args() - - -def main() -> None: - args = parse_args() # type: ignore[no-untyped-call] - half = args.precision == "fp16" - int8 = args.precision == "int8" - # Load the appropriate model - if args.model_type == "yolo": - model: YOLO | FastSAM = YOLO(args.model_path) - else: - model = FastSAM(args.model_path) - - # Export the model - model.export(format=args.format, half=half, int8=int8) - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/common/ibvs.py b/dimos/perception/common/ibvs.py deleted file mode 100644 index e24819f432..0000000000 --- a/dimos/perception/common/ibvs.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np - - -class PersonDistanceEstimator: - def __init__(self, K, camera_pitch, camera_height) -> None: # type: ignore[no-untyped-def] - """ - Initialize the distance estimator using ground plane constraint. - - Args: - K: 3x3 Camera intrinsic matrix in OpenCV format - (Assumed to be already for an undistorted image) - camera_pitch: Upward pitch of the camera (in radians), in the robot frame - Positive means looking up, negative means looking down - camera_height: Height of the camera above the ground (in meters) - """ - self.K = K - self.camera_height = camera_height - - # Precompute the inverse intrinsic matrix - self.K_inv = np.linalg.inv(K) - - # Transform from camera to robot frame (z-forward to x-forward) - self.T = np.array([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - - # Pitch rotation matrix (positive is upward) - theta = -camera_pitch # Negative since positive pitch is negative rotation about robot Y - self.R_pitch = np.array( - [[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]] - ) - - # Combined transform from camera to robot frame - self.A = self.R_pitch @ self.T - - # Store focal length and principal point for angle calculation - self.fx = K[0, 0] - self.cx = K[0, 2] - - def estimate_distance_angle(self, bbox: tuple, robot_pitch: float | None = None): # type: ignore[no-untyped-def, type-arg] - """ - Estimate distance and angle to person using ground plane constraint. 
- - Args: - bbox: tuple (x_min, y_min, x_max, y_max) - where y_max represents the feet position - robot_pitch: Current pitch of the robot body (in radians) - If provided, this will be combined with the camera's fixed pitch - - Returns: - depth: distance to person along camera's z-axis (meters) - angle: horizontal angle in camera frame (radians, positive right) - """ - x_min, _, x_max, y_max = bbox - - # Get center point of feet - u_c = (x_min + x_max) / 2.0 - v_feet = y_max - - # Create homogeneous feet point and get ray direction - p_feet = np.array([u_c, v_feet, 1.0]) - d_feet_cam = self.K_inv @ p_feet - - # If robot_pitch is provided, recalculate the transformation matrix - if robot_pitch is not None: - # Combined pitch (fixed camera pitch + current robot pitch) - total_pitch = -camera_pitch - robot_pitch # Both negated for correct rotation direction - R_total_pitch = np.array( - [ - [np.cos(total_pitch), 0, np.sin(total_pitch)], - [0, 1, 0], - [-np.sin(total_pitch), 0, np.cos(total_pitch)], - ] - ) - # Use the updated transformation matrix - A = R_total_pitch @ self.T - else: - # Use the precomputed transformation matrix - A = self.A - - # Convert ray to robot frame using appropriate transformation - d_feet_robot = A @ d_feet_cam - - # Ground plane intersection (z=0) - # camera_height + t * d_feet_robot[2] = 0 - if abs(d_feet_robot[2]) < 1e-6: - raise ValueError("Feet ray is parallel to ground plane") - - # Solve for scaling factor t - t = -self.camera_height / d_feet_robot[2] - - # Get 3D feet position in robot frame - p_feet_robot = t * d_feet_robot - - # Convert back to camera frame - p_feet_cam = self.A.T @ p_feet_robot - - # Extract depth (z-coordinate in camera frame) - depth = p_feet_cam[2] - - # Calculate horizontal angle from image center - angle = np.arctan((u_c - self.cx) / self.fx) - - return depth, angle - - -class ObjectDistanceEstimator: - """ - Estimate distance to an object using the ground plane constraint. 
- This class assumes the camera is mounted on a robot and uses the - camera's intrinsic parameters to estimate the distance to a detected object. - """ - - def __init__(self, K, camera_pitch, camera_height) -> None: # type: ignore[no-untyped-def] - """ - Initialize the distance estimator using ground plane constraint. - - Args: - K: 3x3 Camera intrinsic matrix in OpenCV format - (Assumed to be already for an undistorted image) - camera_pitch: Upward pitch of the camera (in radians) - Positive means looking up, negative means looking down - camera_height: Height of the camera above the ground (in meters) - """ - self.K = K - self.camera_height = camera_height - - # Precompute the inverse intrinsic matrix - self.K_inv = np.linalg.inv(K) - - # Transform from camera to robot frame (z-forward to x-forward) - self.T = np.array([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - - # Pitch rotation matrix (positive is upward) - theta = -camera_pitch # Negative since positive pitch is negative rotation about robot Y - self.R_pitch = np.array( - [[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]] - ) - - # Combined transform from camera to robot frame - self.A = self.R_pitch @ self.T - - # Store focal length and principal point for angle calculation - self.fx = K[0, 0] - self.fy = K[1, 1] - self.cx = K[0, 2] - self.estimated_object_size = None - - def estimate_object_size(self, bbox: tuple, distance: float): # type: ignore[no-untyped-def, type-arg] - """ - Estimate the physical size of an object based on its bbox and known distance. 
- - Args: - bbox: tuple (x_min, y_min, x_max, y_max) bounding box in the image - distance: Known distance to the object (in meters) - robot_pitch: Current pitch of the robot body (in radians), if any - - Returns: - estimated_size: Estimated physical height of the object (in meters) - """ - _x_min, y_min, _x_max, y_max = bbox - - # Calculate object height in pixels - object_height_px = y_max - y_min - - # Calculate the physical height using the known distance and focal length - estimated_size = object_height_px * distance / self.fy - self.estimated_object_size = estimated_size - - return estimated_size - - def set_estimated_object_size(self, size: float) -> None: - """ - Set the estimated object size for future distance calculations. - - Args: - size: Estimated physical size of the object (in meters) - """ - self.estimated_object_size = size # type: ignore[assignment] - - def estimate_distance_angle(self, bbox: tuple): # type: ignore[no-untyped-def, type-arg] - """ - Estimate distance and angle to object using size-based estimation. 
- - Args: - bbox: tuple (x_min, y_min, x_max, y_max) - where y_max represents the bottom of the object - robot_pitch: Current pitch of the robot body (in radians) - If provided, this will be combined with the camera's fixed pitch - initial_distance: Initial distance estimate for the object (in meters) - Used to calibrate object size if not previously known - - Returns: - depth: distance to object along camera's z-axis (meters) - angle: horizontal angle in camera frame (radians, positive right) - or None, None if estimation not possible - """ - # If we don't have estimated object size and no initial distance is provided, - # we can't estimate the distance - if self.estimated_object_size is None: - return None, None - - x_min, y_min, x_max, y_max = bbox - - # Calculate center of the object for angle calculation - u_c = (x_min + x_max) / 2.0 - - # If we have an initial distance estimate and no object size yet, - # calculate and store the object size using the initial distance - object_height_px = y_max - y_min - depth = self.estimated_object_size * self.fy / object_height_px - - # Calculate horizontal angle from image center - angle = np.arctan((u_c - self.cx) / self.fx) - - return depth, angle - - -# Example usage: -if __name__ == "__main__": - # Example camera calibration - K = np.array([[600, 0, 320], [0, 600, 240], [0, 0, 1]], dtype=np.float32) - - # Camera mounted 1.2m high, pitched down 10 degrees - camera_pitch = np.deg2rad(0) # negative for downward pitch - camera_height = 1.0 # meters - - estimator = PersonDistanceEstimator(K, camera_pitch, camera_height) - object_estimator = ObjectDistanceEstimator(K, camera_pitch, camera_height) - - # Example detection - bbox = (300, 100, 380, 400) # x1, y1, x2, y2 - - depth, angle = estimator.estimate_distance_angle(bbox) - # Estimate object size based on the known distance - object_size = object_estimator.estimate_object_size(bbox, depth) - depth_obj, angle_obj = object_estimator.estimate_distance_angle(bbox) - - 
print(f"Estimated person depth: {depth:.2f} m") - print(f"Estimated person angle: {np.rad2deg(angle):.1f}°") - print(f"Estimated object depth: {depth_obj:.2f} m") - print(f"Estimated object angle: {np.rad2deg(angle_obj):.1f}°") - - # Shrink the bbox by 30 pixels while keeping the same center - x_min, y_min, x_max, y_max = bbox - width = x_max - x_min - height = y_max - y_min - center_x = (x_min + x_max) // 2 - center_y = (y_min + y_max) // 2 - - new_width = max(width - 20, 2) # Ensure width is at least 2 pixels - new_height = max(height - 20, 2) # Ensure height is at least 2 pixels - - x_min = center_x - new_width // 2 - x_max = center_x + new_width // 2 - y_min = center_y - new_height // 2 - y_max = center_y + new_height // 2 - - bbox = (x_min, y_min, x_max, y_max) - - # Re-estimate distance and angle with the new bbox - depth, angle = estimator.estimate_distance_angle(bbox) - depth_obj, angle_obj = object_estimator.estimate_distance_angle(bbox) - - print(f"New estimated person depth: {depth:.2f} m") - print(f"New estimated person angle: {np.rad2deg(angle):.1f}°") - print(f"New estimated object depth: {depth_obj:.2f} m") - print(f"New estimated object angle: {np.rad2deg(angle_obj):.1f}°") diff --git a/dimos/perception/demo_object_scene_registration.py b/dimos/perception/demo_object_scene_registration.py new file mode 100644 index 0000000000..fe9c3100bb --- /dev/null +++ b/dimos/perception/demo_object_scene_registration.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from dimos.agents.agent import llm_agent +from dimos.agents.cli.human import human_input +from dimos.agents.spec import Provider +from dimos.core.blueprints import autoconnect +from dimos.hardware.sensors.camera.realsense import realsense_camera +from dimos.hardware.sensors.camera.zed import zed_camera +from dimos.perception.detection.detectors.yoloe import YoloePromptMode +from dimos.perception.object_scene_registration import object_scene_registration_module +from dimos.robot.foxglove_bridge import foxglove_bridge + +camera_choice = "zed" + +if camera_choice == "realsense": + camera_module = realsense_camera(enable_pointcloud=False) +elif camera_choice == "zed": + camera_module = zed_camera(enable_pointcloud=False) +else: + raise ValueError(f"Invalid camera choice: {camera_choice}") + +demo_object_scene_registration = autoconnect( + camera_module, + object_scene_registration_module(target_frame="world", prompt_mode=YoloePromptMode.LRPC), + foxglove_bridge(), + human_input(), + llm_agent(), +).global_config(viewer_backend="foxglove") diff --git a/dimos/perception/detection/detectors/conftest.py b/dimos/perception/detection/detectors/conftest.py index 9cb600aeff..6a2c041a8b 100644 --- a/dimos/perception/detection/detectors/conftest.py +++ b/dimos/perception/detection/detectors/conftest.py @@ -17,6 +17,7 @@ from dimos.msgs.sensor_msgs import Image from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector from dimos.perception.detection.detectors.yolo import Yolo2DDetector +from dimos.perception.detection.detectors.yoloe import Yoloe2DDetector, YoloePromptMode from dimos.utils.data import get_data @@ -36,3 +37,9 @@ def person_detector(): def bbox_detector(): """Create a Yolo2DDetector instance for general object detection.""" return Yolo2DDetector() + + +@pytest.fixture(scope="session") +def yoloe_detector(): + """Create a Yoloe2DDetector 
instance for general object detection.""" + return Yoloe2DDetector(prompt_mode=YoloePromptMode.LRPC) diff --git a/dimos/perception/detection/detectors/detic.py b/dimos/perception/detection/detectors/detic.py deleted file mode 100644 index 288a3e056d..0000000000 --- a/dimos/perception/detection/detectors/detic.py +++ /dev/null @@ -1,426 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Sequence -import os -import sys - -import numpy as np - -# Add Detic to Python path -from dimos.constants import DIMOS_PROJECT_ROOT -from dimos.msgs.sensor_msgs import Image -from dimos.perception.detection.detectors.types import Detector -from dimos.perception.detection2d.utils import plot_results - -detic_path = DIMOS_PROJECT_ROOT / "dimos/models/Detic" -if str(detic_path) not in sys.path: - sys.path.append(str(detic_path)) - sys.path.append(str(detic_path / "third_party/CenterNet2")) - -# PIL patch for compatibility -import PIL.Image - -if not hasattr(PIL.Image, "LINEAR") and hasattr(PIL.Image, "BILINEAR"): - PIL.Image.LINEAR = PIL.Image.BILINEAR # type: ignore[attr-defined] - -# Detectron2 imports -from detectron2.config import get_cfg # type: ignore[import-not-found] -from detectron2.data import MetadataCatalog # type: ignore[import-not-found] - - -# Simple tracking implementation -class SimpleTracker: - """Simple IOU-based tracker implementation without external dependencies""" - - def __init__(self, 
iou_threshold: float = 0.3, max_age: int = 5) -> None: - self.iou_threshold = iou_threshold - self.max_age = max_age - self.next_id = 1 - self.tracks = {} # type: ignore[var-annotated] # id -> {bbox, class_id, age, mask, etc} - - def _calculate_iou(self, bbox1, bbox2): # type: ignore[no-untyped-def] - """Calculate IoU between two bboxes in format [x1,y1,x2,y2]""" - x1 = max(bbox1[0], bbox2[0]) - y1 = max(bbox1[1], bbox2[1]) - x2 = min(bbox1[2], bbox2[2]) - y2 = min(bbox1[3], bbox2[3]) - - if x2 < x1 or y2 < y1: - return 0.0 - - intersection = (x2 - x1) * (y2 - y1) - area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) - area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) - union = area1 + area2 - intersection - - return intersection / union if union > 0 else 0 - - def update(self, detections, masks): # type: ignore[no-untyped-def] - """Update tracker with new detections - - Args: - detections: List of [x1,y1,x2,y2,score,class_id] - masks: List of segmentation masks corresponding to detections - - Returns: - List of [track_id, bbox, score, class_id, mask] - """ - if len(detections) == 0: - # Age existing tracks - for track_id in list(self.tracks.keys()): - self.tracks[track_id]["age"] += 1 - # Remove old tracks - if self.tracks[track_id]["age"] > self.max_age: - del self.tracks[track_id] - return [] - - # Convert to numpy for easier handling - if not isinstance(detections, np.ndarray): - detections = np.array(detections) - - result = [] - matched_indices = set() - - # Update existing tracks - for track_id, track in list(self.tracks.items()): - track["age"] += 1 - - if track["age"] > self.max_age: - del self.tracks[track_id] - continue - - # Find best matching detection for this track - best_iou = self.iou_threshold - best_idx = -1 - - for i, det in enumerate(detections): - if i in matched_indices: - continue - - # Check class match - if det[5] != track["class_id"]: - continue - - iou = self._calculate_iou(track["bbox"], det[:4]) # type: ignore[no-untyped-call] - 
if iou > best_iou: - best_iou = iou - best_idx = i - - # If we found a match, update the track - if best_idx >= 0: - self.tracks[track_id]["bbox"] = detections[best_idx][:4] - self.tracks[track_id]["score"] = detections[best_idx][4] - self.tracks[track_id]["age"] = 0 - self.tracks[track_id]["mask"] = masks[best_idx] - matched_indices.add(best_idx) - - # Add to results with mask - result.append( - [ - track_id, - detections[best_idx][:4], - detections[best_idx][4], - int(detections[best_idx][5]), - self.tracks[track_id]["mask"], - ] - ) - - # Create new tracks for unmatched detections - for i, det in enumerate(detections): - if i in matched_indices: - continue - - # Create new track - new_id = self.next_id - self.next_id += 1 - - self.tracks[new_id] = { - "bbox": det[:4], - "score": det[4], - "class_id": int(det[5]), - "age": 0, - "mask": masks[i], - } - - # Add to results with mask directly from the track - result.append([new_id, det[:4], det[4], int(det[5]), masks[i]]) - - return result - - -class Detic2DDetector(Detector): - def __init__( # type: ignore[no-untyped-def] - self, model_path=None, device: str = "cuda", vocabulary=None, threshold: float = 0.5 - ) -> None: - """ - Initialize the Detic detector with open vocabulary support. 
- - Args: - model_path (str): Path to a custom Detic model weights (optional) - device (str): Device to run inference on ('cuda' or 'cpu') - vocabulary (list): Custom vocabulary (list of class names) or 'lvis', 'objects365', 'openimages', 'coco' - threshold (float): Detection confidence threshold - """ - self.device = device - self.threshold = threshold - - # Set up Detic paths - already added to sys.path at module level - - # Import Detic modules - from centernet.config import add_centernet_config # type: ignore[import-not-found] - from detic.config import add_detic_config # type: ignore[import-not-found] - from detic.modeling.text.text_encoder import ( # type: ignore[import-not-found] - build_text_encoder, - ) - from detic.modeling.utils import reset_cls_test # type: ignore[import-not-found] - - # Keep reference to these functions for later use - self.reset_cls_test = reset_cls_test - self.build_text_encoder = build_text_encoder - - # Setup model configuration - self.cfg = get_cfg() - add_centernet_config(self.cfg) - add_detic_config(self.cfg) - - # Use default Detic config - self.cfg.merge_from_file( - os.path.join( - detic_path, "configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml" - ) - ) - - # Set default weights if not provided - if model_path is None: - self.cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" - else: - self.cfg.MODEL.WEIGHTS = model_path - - # Set device - if device == "cpu": - self.cfg.MODEL.DEVICE = "cpu" - - # Set detection threshold - self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold - self.cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = "rand" - self.cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True - - # Built-in datasets for Detic - use absolute paths with detic_path - self.builtin_datasets = { - "lvis": { - "metadata": "lvis_v1_val", - "classifier": os.path.join( - detic_path, "datasets/metadata/lvis_v1_clip_a+cname.npy" - ), - }, - "objects365": { - 
"metadata": "objects365_v2_val", - "classifier": os.path.join( - detic_path, "datasets/metadata/o365_clip_a+cnamefix.npy" - ), - }, - "openimages": { - "metadata": "oid_val_expanded", - "classifier": os.path.join(detic_path, "datasets/metadata/oid_clip_a+cname.npy"), - }, - "coco": { - "metadata": "coco_2017_val", - "classifier": os.path.join(detic_path, "datasets/metadata/coco_clip_a+cname.npy"), - }, - } - - # Override config paths to use absolute paths - self.cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = os.path.join( - detic_path, "datasets/metadata/lvis_v1_train_cat_info.json" - ) - - # Initialize model - self.predictor = None - - # Setup with initial vocabulary - vocabulary = vocabulary or "lvis" - self.setup_vocabulary(vocabulary) # type: ignore[no-untyped-call] - - # Initialize our simple tracker - self.tracker = SimpleTracker(iou_threshold=0.5, max_age=5) - - def setup_vocabulary(self, vocabulary): # type: ignore[no-untyped-def] - """ - Setup the model's vocabulary. - - Args: - vocabulary: Either a string ('lvis', 'objects365', 'openimages', 'coco') - or a list of class names for custom vocabulary. 
- """ - if self.predictor is None: - # Initialize the model - from detectron2.engine import DefaultPredictor # type: ignore[import-not-found] - - self.predictor = DefaultPredictor(self.cfg) - - if isinstance(vocabulary, str) and vocabulary in self.builtin_datasets: - # Use built-in dataset - dataset = vocabulary - metadata = MetadataCatalog.get(self.builtin_datasets[dataset]["metadata"]) - classifier = self.builtin_datasets[dataset]["classifier"] - num_classes = len(metadata.thing_classes) - self.class_names = metadata.thing_classes - else: - # Use custom vocabulary - if isinstance(vocabulary, str): - # If it's a string but not a built-in dataset, treat as a file - try: - with open(vocabulary) as f: - class_names = [line.strip() for line in f if line.strip()] - except: - # Default to LVIS if there's an issue - print(f"Error loading vocabulary from {vocabulary}, using LVIS") - return self.setup_vocabulary("lvis") # type: ignore[no-untyped-call] - else: - # Assume it's a list of class names - class_names = vocabulary - - # Create classifier from text embeddings - metadata = MetadataCatalog.get("__unused") - metadata.thing_classes = class_names - self.class_names = class_names - - # Generate CLIP embeddings for custom vocabulary - classifier = self._get_clip_embeddings(class_names) - num_classes = len(class_names) - - # Reset model with new vocabulary - self.reset_cls_test(self.predictor.model, classifier, num_classes) # type: ignore[attr-defined] - return self.class_names - - def _get_clip_embeddings(self, vocabulary, prompt: str = "a "): # type: ignore[no-untyped-def] - """ - Generate CLIP embeddings for a vocabulary list. 
- - Args: - vocabulary (list): List of class names - prompt (str): Prompt prefix to use for CLIP - - Returns: - torch.Tensor: Tensor of embeddings - """ - text_encoder = self.build_text_encoder(pretrain=True) - text_encoder.eval() - texts = [prompt + x for x in vocabulary] - emb = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() - return emb - - def process_image(self, image: Image): # type: ignore[no-untyped-def] - """ - Process an image and return detection results. - - Args: - image: Input image in BGR format (OpenCV) - - Returns: - tuple: (bboxes, track_ids, class_ids, confidences, names, masks) - - bboxes: list of [x1, y1, x2, y2] coordinates - - track_ids: list of tracking IDs (or -1 if no tracking) - - class_ids: list of class indices - - confidences: list of detection confidences - - names: list of class names - - masks: list of segmentation masks (numpy arrays) - """ - # Run inference with Detic - outputs = self.predictor(image.to_opencv()) # type: ignore[misc] - instances = outputs["instances"].to("cpu") - - # Extract bounding boxes, classes, scores, and masks - if len(instances) == 0: - return [], [], [], [], [] # , [] - - boxes = instances.pred_boxes.tensor.numpy() - class_ids = instances.pred_classes.numpy() - scores = instances.scores.numpy() - masks = instances.pred_masks.numpy() - - # Convert boxes to [x1, y1, x2, y2] format - bboxes = [] - for box in boxes: - x1, y1, x2, y2 = box.tolist() - bboxes.append([x1, y1, x2, y2]) - - # Get class names - [self.class_names[class_id] for class_id in class_ids] - - # Apply tracking - detections = [] - filtered_masks = [] - for i, bbox in enumerate(bboxes): - if scores[i] >= self.threshold: - # Format for tracker: [x1, y1, x2, y2, score, class_id] - detections.append([*bbox, scores[i], class_ids[i]]) - filtered_masks.append(masks[i]) - - if not detections: - return [], [], [], [], [] # , [] - - # Update tracker with detections and correctly aligned masks - track_results = 
self.tracker.update(detections, filtered_masks) # type: ignore[no-untyped-call] - - # Process tracking results - track_ids = [] - tracked_bboxes = [] - tracked_class_ids = [] - tracked_scores = [] - tracked_names = [] - tracked_masks = [] - - for track_id, bbox, score, class_id, mask in track_results: - track_ids.append(int(track_id)) - tracked_bboxes.append(bbox.tolist() if isinstance(bbox, np.ndarray) else bbox) - tracked_class_ids.append(int(class_id)) - tracked_scores.append(score) - tracked_names.append(self.class_names[int(class_id)]) - tracked_masks.append(mask) - - return ( - tracked_bboxes, - track_ids, - tracked_class_ids, - tracked_scores, - tracked_names, - # tracked_masks, - ) - - def visualize_results( # type: ignore[no-untyped-def] - self, image, bboxes, track_ids, class_ids, confidences, names: Sequence[str] - ): - """ - Generate visualization of detection results. - - Args: - image: Original input image - bboxes: List of bounding boxes - track_ids: List of tracking IDs - class_ids: List of class indices - confidences: List of detection confidences - names: List of class names - - Returns: - Image with visualized detections - """ - - return plot_results(image, bboxes, track_ids, class_ids, confidences, names) - - def cleanup(self) -> None: - """Clean up resources.""" - # Nothing specific to clean up for Detic - pass diff --git a/dimos/perception/detection/detectors/test_bbox_detectors.py b/dimos/perception/detection/detectors/test_bbox_detectors.py index bd9c1358b5..4047e35e7b 100644 --- a/dimos/perception/detection/detectors/test_bbox_detectors.py +++ b/dimos/perception/detection/detectors/test_bbox_detectors.py @@ -17,7 +17,7 @@ from dimos.perception.detection.type import Detection2D, ImageDetections2D -@pytest.fixture(params=["bbox_detector", "person_detector"], scope="session") +@pytest.fixture(params=["bbox_detector", "person_detector", "yoloe_detector"], scope="session") def detector(request): """Parametrized fixture that provides both bbox 
class YoloePromptMode(Enum):
    """YOLO-E prompt modes."""

    # Prompt-free detection using the checkpoint's built-in vocabulary head.
    LRPC = "lrpc"
    # Text / visual prompting: caller supplies classes via set_prompts().
    PROMPT = "prompt"


class Yoloe2DDetector(Detector):
    """YOLO-E based 2D detector with optional text/visual prompting and tracking."""

    def __init__(
        self,
        model_path: str = "models_yoloe",
        model_name: str | None = None,
        device: str | None = None,
        prompt_mode: YoloePromptMode = YoloePromptMode.LRPC,
        exclude_class_ids: list[int] | None = None,
        max_area_ratio: float | None = 0.3,
    ) -> None:
        """
        Initialize YOLO-E 2D detector.

        Args:
            model_path: Path to model directory (fetched via get_data from LFS).
            model_name: Model filename. Defaults based on prompt_mode.
            device: Device to run inference on ('cuda', 'cpu', or None for auto).
            prompt_mode: LRPC for prompt-free detection, PROMPT for text/visual prompting.
            exclude_class_ids: Class IDs to filter out from results (pass [] to disable).
            max_area_ratio: Maximum bbox area ratio (0-1) relative to image.

        Raises:
            ValueError: If max_area_ratio is outside (0, 1].
        """
        # Fail fast on bad arguments BEFORE the expensive model load and
        # prompt-embedding call (previously this was validated afterwards,
        # so a bad config wasted a full model load).
        if max_area_ratio is not None and not (0.0 < max_area_ratio <= 1.0):
            raise ValueError("max_area_ratio must be in the range (0, 1].")

        if model_name is None:
            # LRPC checkpoints ship a prompt-free ("pf") head.
            if prompt_mode == YoloePromptMode.LRPC:
                model_name = "yoloe-11s-seg-pf.pt"
            else:
                model_name = "yoloe-11s-seg.pt"

        if device:
            self.device = device
        elif is_cuda_available():  # type: ignore[no-untyped-call]
            self.device = "cuda"
        else:
            self.device = "cpu"

        self.prompt_mode = prompt_mode
        self.max_area_ratio = max_area_ratio
        self.exclude_class_ids = set(exclude_class_ids) if exclude_class_ids else set()
        self._visual_prompts: dict[str, NDArray[Any]] | None = None
        self._lock = threading.Lock()

        self.model = YOLOE(get_data(model_path) / model_name)
        if prompt_mode == YoloePromptMode.PROMPT:
            # Seed the classifier head with a placeholder class so the model is
            # runnable before the caller provides real prompts.
            self.set_prompts(text=["nothing"])

    def set_prompts(
        self,
        text: list[str] | None = None,
        bboxes: NDArray[np.float64] | None = None,
    ) -> None:
        """
        Set prompts for detection. Provide either text or bboxes, not both.

        Args:
            text: List of class names to detect.
            bboxes: Bounding boxes in xyxy format, shape (N, 4).

        Raises:
            ValueError: If both or neither of text/bboxes are given.
        """
        if text is not None and bboxes is not None:
            raise ValueError("Provide either text or bboxes, not both.")
        if text is None and bboxes is None:
            raise ValueError("Must provide either text or bboxes.")

        with self._lock:
            # Drop the cached predictor so the next track() call rebuilds it
            # with the new classes/prompts.
            self.model.predictor = None
            if text is not None:
                self.model.set_classes(text, self.model.get_text_pe(text))  # type: ignore[no-untyped-call]
                self._visual_prompts = None
            else:
                # One pseudo-class per prompt box.
                cls = np.arange(len(bboxes), dtype=np.int16)  # type: ignore[arg-type]
                self._visual_prompts = {"bboxes": bboxes, "cls": cls}  # type: ignore[dict-item]

    def process_image(self, image: Image) -> "ImageDetections2D[Any]":
        """
        Process an image and return detection results.

        Args:
            image: Input image

        Returns:
            ImageDetections2D containing all detected objects
        """
        track_kwargs = {
            "source": image.to_opencv(),
            "device": self.device,
            "conf": 0.6,
            "iou": 0.6,
            "persist": True,  # keep tracker state across frames
            "verbose": False,
        }

        with self._lock:
            if self._visual_prompts is not None:
                track_kwargs["visual_prompts"] = self._visual_prompts

            results = self.model.track(**track_kwargs)  # type: ignore[arg-type]

        detections = ImageDetections2D.from_ultralytics_result(image, results)
        return self._apply_filters(image, detections)

    def _apply_filters(
        self,
        image: Image,
        detections: "ImageDetections2D[Any]",
    ) -> "ImageDetections2D[Any]":
        """Drop detections with excluded class IDs or implausibly large boxes."""
        if not self.exclude_class_ids and self.max_area_ratio is None:
            return detections

        predicates = []

        if self.exclude_class_ids:
            predicates.append(lambda det: det.class_id not in self.exclude_class_ids)

        if self.max_area_ratio is not None:
            image_area = image.width * image.height

            def area_filter(det):  # type: ignore[no-untyped-def]
                # Degenerate image size: keep everything rather than divide by zero.
                if image_area <= 0:
                    return True
                return (det.bbox_2d_volume() / image_area) <= self.max_area_ratio

            predicates.append(area_filter)

        filtered = detections.detections
        for predicate in predicates:
            filtered = [det for det in filtered if predicate(det)]  # type: ignore[no-untyped-call]

        return ImageDetections2D(image, filtered)

    def stop(self) -> None:
        """Clean up resources used by the detector."""
        # Shut down the GMC thread pool inside ultralytics' trackers; leaving
        # the executor alive can keep the process from exiting cleanly.
        if hasattr(self.model, "predictor") and self.model.predictor is not None:
            predictor = self.model.predictor
            if hasattr(predictor, "trackers") and predictor.trackers:
                for tracker in predictor.trackers:
                    if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"):
                        gmc = tracker.tracker.gmc
                        if hasattr(gmc, "executor") and gmc.executor is not None:
                            gmc.executor.shutdown(wait=True)
            self.model.predictor = None
class ObjectDB:
    """Spatial memory database for 3D object detections.

    Maintains two tiers of objects internally:
    - _pending_objects: Recently detected objects (detection_count < threshold)
    - _objects: Confirmed permanent objects (detection_count >= threshold)

    Deduplication uses two heuristics:
    1. track_id match from YOLOE tracker (recent match)
    2. Center distance within threshold (spatial proximity match)
    """

    def __init__(
        self,
        distance_threshold: float = 0.1,
        min_detections_for_permanent: int = 6,
        pending_ttl_s: float = 5.0,
        track_id_ttl_s: float = 2.0,
    ) -> None:
        """
        Args:
            distance_threshold: Max center distance for two detections to merge.
            min_detections_for_permanent: Detections required before promotion.
            pending_ttl_s: Seconds before an unseen pending object is pruned (<=0 disables).
            track_id_ttl_s: Seconds a tracker ID remains valid for matching.
        """
        self._distance_threshold = distance_threshold
        self._min_detections = min_detections_for_permanent
        self._pending_ttl_s = pending_ttl_s
        self._track_id_ttl_s = track_id_ttl_s

        # Internal storage - keyed by object_id
        self._pending_objects: dict[str, Object] = {}
        self._objects: dict[str, Object] = {}  # Permanent objects

        # track_id -> object_id mapping for fast lookup
        self._track_id_map: dict[int, str] = {}
        self._last_add_stats: dict[str, int] = {}

        # RLock: public methods may call other locked helpers re-entrantly.
        self._lock = threading.RLock()

    # ─────────────────────────────────────────────────────────────────
    # Public Methods
    # ─────────────────────────────────────────────────────────────────

    def add_objects(self, objects: list[Object]) -> list[Object]:
        """Add multiple objects to the database with deduplication.

        Args:
            objects: List of Object instances from object_scene_registration

        Returns:
            List of updated/created Object instances
        """
        stats = {
            "input": len(objects),
            "created": 0,
            "updated": 0,
            "promoted": 0,
            "matched_track": 0,
            "matched_distance": 0,
        }

        results: list[Object] = []
        now = time.time()
        with self._lock:
            self._prune_stale_pending(now)
            for obj in objects:
                matched, reason = self._match(obj, now)
                if matched is None:
                    results.append(self._insert_pending(obj, now))
                    stats["created"] += 1
                    continue

                self._update_existing(matched, obj, now)
                results.append(matched)
                stats["updated"] += 1
                if reason == "track":
                    stats["matched_track"] += 1
                elif reason == "distance":
                    stats["matched_distance"] += 1
                if self._check_promotion(matched):
                    stats["promoted"] += 1

            stats["pending"] = len(self._pending_objects)
            stats["permanent"] = len(self._objects)
            self._last_add_stats = stats
        return results

    def get_last_add_stats(self) -> dict[str, int]:
        """Return a copy of the stats recorded by the most recent add_objects call."""
        with self._lock:
            return dict(self._last_add_stats)

    def get_objects(self) -> list[Object]:
        """Get all permanent objects (detection_count >= threshold)."""
        with self._lock:
            return list(self._objects.values())

    def get_all_objects(self) -> list[Object]:
        """Get all objects (both pending and permanent)."""
        with self._lock:
            return list(self._pending_objects.values()) + list(self._objects.values())

    def promote(self, object_id: str) -> bool:
        """Promote an object from pending to permanent.

        Returns True if the object is (now) permanent, False if unknown.
        """
        with self._lock:
            if object_id in self._pending_objects:
                self._objects[object_id] = self._pending_objects.pop(object_id)
                return True
            return object_id in self._objects

    def find_by_name(self, name: str) -> list[Object]:
        """Find all permanent objects with matching name."""
        with self._lock:
            return [obj for obj in self._objects.values() if obj.name == name]

    def find_nearest(
        self,
        position: Vector3,
        name: str | None = None,
    ) -> Object | None:
        """Find nearest permanent object to a position, optionally filtered by name.

        Args:
            position: Position to search from
            name: Optional name filter

        Returns:
            Nearest Object or None if no objects found
        """
        with self._lock:
            candidates = [
                obj
                for obj in self._objects.values()
                if obj.center is not None and (name is None or obj.name == name)
            ]

        if not candidates:
            return None

        return min(candidates, key=lambda obj: position.distance(obj.center))  # type: ignore[arg-type]

    def clear(self) -> None:
        """Clear all objects from the database."""
        with self._lock:
            # Drop Open3D pointcloud references before clearing to reduce shutdown warnings.
            for obj in list(self._pending_objects.values()) + list(self._objects.values()):
                obj.pointcloud = PointCloud2(
                    pointcloud=o3d.geometry.PointCloud(),
                    frame_id=obj.pointcloud.frame_id,
                    ts=obj.pointcloud.ts,
                )
            self._pending_objects.clear()
            self._objects.clear()
            self._track_id_map.clear()
            logger.info("ObjectDB cleared")

    def get_stats(self) -> dict[str, int]:
        """Get statistics about the database."""
        with self._lock:
            return {
                "pending_count": len(self._pending_objects),
                "permanent_count": len(self._objects),
                "total_count": len(self._pending_objects) + len(self._objects),
            }

    # ─────────────────────────────────────────────────────────────────
    # Internal Methods (callers must hold self._lock)
    # ─────────────────────────────────────────────────────────────────

    def _match(self, obj: Object, now: float) -> tuple[Object | None, str | None]:
        """Try track-ID match first (cheap, exact), then spatial proximity."""
        if obj.track_id >= 0:
            matched = self._match_by_track_id(obj.track_id, now)
            if matched is not None:
                return matched, "track"

        matched = self._match_by_distance(obj)
        if matched is not None:
            return matched, "distance"
        return None, None

    def _insert_pending(self, obj: Object, now: float) -> Object:
        """Register a brand-new detection in the pending tier."""
        if not obj.ts:
            obj.ts = now
        self._pending_objects[obj.object_id] = obj
        if obj.track_id >= 0:
            self._track_id_map[obj.track_id] = obj.object_id
        logger.info(f"Created new pending object {obj.object_id} ({obj.name})")
        return obj

    def _update_existing(self, existing: Object, obj: Object, now: float) -> None:
        """Fold a new detection into an already-known object."""
        existing.update_object(obj)
        existing.ts = obj.ts or now
        if obj.track_id >= 0:
            self._track_id_map[obj.track_id] = existing.object_id

    def _match_by_track_id(self, track_id: int, now: float) -> Object | None:
        """Find object with matching track_id from YOLOE."""
        if track_id < 0:
            return None

        object_id = self._track_id_map.get(track_id)
        if object_id is None:
            return None

        # Check in permanent objects first
        if object_id in self._objects:
            obj = self._objects[object_id]
        elif object_id in self._pending_objects:
            obj = self._pending_objects[object_id]
        else:
            # Mapping points at an object that no longer exists — drop it.
            del self._track_id_map[track_id]
            return None

        # Expire the mapping if the object hasn't been seen recently; tracker
        # IDs get recycled, so stale mappings cause false merges.
        last_seen = obj.ts if obj.ts else now
        if now - last_seen > self._track_id_ttl_s:
            del self._track_id_map[track_id]
            return None

        return obj

    def _match_by_distance(self, obj: Object) -> Object | None:
        """Find the closest object within the distance threshold."""
        if obj.center is None:
            return None

        # Compute each candidate distance once (previously computed twice:
        # once to filter, once inside min()).
        best: Object | None = None
        best_dist = self._distance_threshold
        for cand in list(self._objects.values()) + list(self._pending_objects.values()):
            if cand.center is None:
                continue
            dist = obj.center.distance(cand.center)
            if dist < best_dist:
                best = cand
                best_dist = dist
        return best

    def _prune_stale_pending(self, now: float) -> None:
        """Drop pending objects unseen for longer than the pending TTL."""
        if self._pending_ttl_s <= 0:
            return
        cutoff = now - self._pending_ttl_s
        stale_ids = {
            obj_id for obj_id, obj in self._pending_objects.items() if (obj.ts or now) < cutoff
        }
        if not stale_ids:
            return
        for obj_id in stale_ids:
            del self._pending_objects[obj_id]
        # Single pass over the track map (was one full scan per stale object,
        # i.e. O(stale x tracks)).
        for track_id, mapped_id in list(self._track_id_map.items()):
            if mapped_id in stale_ids:
                del self._track_id_map[track_id]

    def _check_promotion(self, obj: Object) -> bool:
        """Move object from pending to permanent if threshold met."""
        if obj.detections_count >= self._min_detections:
            # Check if it's in pending
            if obj.object_id in self._pending_objects:
                # Promote to permanent
                del self._pending_objects[obj.object_id]
                self._objects[obj.object_id] = obj
                logger.info(
                    f"Promoted object {obj.object_id} ({obj.name}) to permanent "
                    f"with {obj.detections_count} detections"
                )
                return True
        return False

    # ─────────────────────────────────────────────────────────────────
    # Agent encoding
    # ─────────────────────────────────────────────────────────────────

    def agent_encode(self) -> list[dict[str, Any]]:
        """Encode permanent objects for agent consumption."""
        with self._lock:
            return [obj.agent_encode() for obj in self._objects.values()]

    def __len__(self) -> int:
        """Return number of permanent objects."""
        with self._lock:
            return len(self._objects)

    def __repr__(self) -> str:
        with self._lock:
            return f"ObjectDB(permanent={len(self._objects)}, pending={len(self._pending_objects)})"


__all__ = ["ObjectDB"]
buffer_size=2.0, - ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair))) # type: ignore[misc] + ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair))) # type: ignore[misc, arg-type] ) @rpc diff --git a/dimos/perception/detection/type/detection2d/__init__.py b/dimos/perception/detection/type/detection2d/__init__.py index ad3b7fa62e..8994d840b6 100644 --- a/dimos/perception/detection/type/detection2d/__init__.py +++ b/dimos/perception/detection/type/detection2d/__init__.py @@ -17,11 +17,13 @@ from dimos.perception.detection.type.detection2d.imageDetections2D import ImageDetections2D from dimos.perception.detection.type.detection2d.person import Detection2DPerson from dimos.perception.detection.type.detection2d.point import Detection2DPoint +from dimos.perception.detection.type.detection2d.seg import Detection2DSeg __all__ = [ "Detection2D", "Detection2DBBox", "Detection2DPerson", "Detection2DPoint", + "Detection2DSeg", "ImageDetections2D", ] diff --git a/dimos/perception/detection/type/detection2d/imageDetections2D.py b/dimos/perception/detection/type/detection2d/imageDetections2D.py index 680f9dd117..34033a9c50 100644 --- a/dimos/perception/detection/type/detection2d/imageDetections2D.py +++ b/dimos/perception/detection/type/detection2d/imageDetections2D.py @@ -14,28 +14,32 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic +from typing import TYPE_CHECKING, Any, Generic from typing_extensions import TypeVar from dimos.perception.detection.type.detection2d.base import Detection2D from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox +from dimos.perception.detection.type.detection2d.person import Detection2DPerson +from dimos.perception.detection.type.detection2d.seg import Detection2DSeg from dimos.perception.detection.type.imageDetections import ImageDetections if TYPE_CHECKING: - from dimos_lcm.vision_msgs import Detection2DArray - from 
ultralytics.engine.results import Results # type: ignore[import-not-found] + from ultralytics.engine.results import Results from dimos.msgs.sensor_msgs import Image + from dimos.msgs.vision_msgs import Detection2DArray -# TypeVar with default - Detection2DBBox is the default when no type param given T2D = TypeVar("T2D", bound=Detection2D, default=Detection2DBBox) class ImageDetections2D(ImageDetections[T2D], Generic[T2D]): @classmethod - def from_ros_detection2d_array( # type: ignore[no-untyped-def] - cls, image: Image, ros_detections: Detection2DArray, **kwargs + def from_ros_detection2d_array( + cls, + image: Image, + ros_detections: Detection2DArray, + **kwargs: Any, ) -> ImageDetections2D[Detection2DBBox]: """Convert from ROS Detection2DArray message to ImageDetections2D object.""" detections: list[Detection2DBBox] = [] @@ -47,24 +51,25 @@ def from_ros_detection2d_array( # type: ignore[no-untyped-def] return ImageDetections2D(image=image, detections=detections) @classmethod - def from_ultralytics_result( # type: ignore[no-untyped-def] - cls, image: Image, results: list[Results], **kwargs + def from_ultralytics_result( + cls, + image: Image, + results: list[Results], ) -> ImageDetections2D[Detection2DBBox]: """Create ImageDetections2D from ultralytics Results. 
Dispatches to appropriate Detection2D subclass based on result type: + - If masks present: creates Detection2DSeg - If keypoints present: creates Detection2DPerson - Otherwise: creates Detection2DBBox Args: image: Source image results: List of ultralytics Results objects - **kwargs: Additional arguments passed to detection constructors Returns: ImageDetections2D containing appropriate detection types """ - from dimos.perception.detection.type.detection2d.person import Detection2DPerson detections: list[Detection2DBBox] = [] for result in results: @@ -74,7 +79,10 @@ def from_ultralytics_result( # type: ignore[no-untyped-def] num_detections = len(result.boxes.xyxy) for i in range(num_detections): detection: Detection2DBBox - if result.keypoints is not None: + if result.masks is not None: + # Segmentation detection with mask + detection = Detection2DSeg.from_ultralytics_result(result, i, image) + elif result.keypoints is not None: # Pose detection with keypoints detection = Detection2DPerson.from_ultralytics_result(result, i, image) else: diff --git a/dimos/perception/detection/type/detection2d/seg.py b/dimos/perception/detection/type/detection2d/seg.py new file mode 100644 index 0000000000..21f8e8e689 --- /dev/null +++ b/dimos/perception/detection/type/detection2d/seg.py @@ -0,0 +1,204 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class Detection2DSeg(Detection2DBBox):
    """Represents a detection with a segmentation mask."""

    mask: np.ndarray[Any, np.dtype[np.uint8]]  # Binary mask [H, W], uint8 0 or 255

    @classmethod
    def from_sam2_result(
        cls,
        mask: np.ndarray[Any, Any] | torch.Tensor,
        obj_id: int,
        image: Image,
        class_id: int = 0,
        name: str = "object",
        confidence: float = 1.0,
    ) -> Detection2DSeg:
        """Create Detection2DSeg from SAM output (single object).

        Args:
            mask: Segmentation mask (logits or binary). Shape [H, W] or [1, H, W].
            obj_id: Tracking ID of the object.
            image: Source image.
            class_id: Class ID (default 0).
            name: Class name (default "object").
            confidence: Confidence score (default 1.0).

        Returns:
            Detection2DSeg instance.
        """
        # Convert mask to numpy if tensor
        if isinstance(mask, torch.Tensor):
            mask = mask.detach().cpu().numpy()

        # Handle dimensions (EdgeTAM might return [1, H, W] or [H, W])
        if mask.ndim == 3:
            mask = mask.squeeze()

        # Normalize every dtype to the declared 0/255 uint8 contract:
        # bool masks, float logits (> 0 is foreground), AND integer masks
        # that may arrive as 0/1 (previously left untouched, breaking the
        # "uint8 0 or 255" invariant downstream consumers rely on).
        if mask.dtype == bool:
            mask = mask.astype(np.uint8) * 255
        elif np.issubdtype(mask.dtype, np.floating):
            mask = (mask > 0.0).astype(np.uint8) * 255
        else:
            mask = (mask > 0).astype(np.uint8) * 255

        # Calculate bbox from the foreground pixels (zeros if mask is empty)
        y_indices, x_indices = np.where(mask > 0)
        if len(x_indices) > 0:
            x1_val, y1_val = float(np.min(x_indices)), float(np.min(y_indices))
            x2_val, y2_val = float(np.max(x_indices)), float(np.max(y_indices))
        else:
            x1_val = y1_val = x2_val = y2_val = 0.0

        bbox = (x1_val, y1_val, x2_val, y2_val)

        return cls(
            bbox=bbox,
            track_id=obj_id,
            class_id=class_id,
            confidence=confidence,
            name=name,
            ts=image.ts,
            image=image,
            mask=mask.astype(np.uint8),  # type: ignore[arg-type]
        )

    @classmethod
    def from_ultralytics_result(cls, result: Results, idx: int, image: Image) -> Detection2DSeg:
        """Create Detection2DSeg from ultralytics Results object with segmentation mask.

        Args:
            result: Ultralytics Results object containing detection and mask data
            idx: Index of the detection in the results
            image: Source image

        Returns:
            Detection2DSeg instance

        Raises:
            ValueError: If the result carries no boxes.
        """
        if result.boxes is None:
            raise ValueError("Result has no boxes")

        # Extract bounding box coordinates
        bbox_array = result.boxes.xyxy[idx].cpu().numpy()
        bbox: Bbox = (
            float(bbox_array[0]),
            float(bbox_array[1]),
            float(bbox_array[2]),
            float(bbox_array[3]),
        )

        # Extract confidence
        confidence = float(result.boxes.conf[idx].cpu())

        # Extract class ID and name (names may be a dict, a list, or absent)
        class_id = int(result.boxes.cls[idx].cpu())
        if hasattr(result, "names") and result.names is not None:
            if isinstance(result.names, dict):
                name = result.names.get(class_id, f"class_{class_id}")
            elif isinstance(result.names, list) and class_id < len(result.names):
                name = result.names[class_id]
            else:
                name = f"class_{class_id}"
        else:
            name = f"class_{class_id}"

        # Extract track ID if available (-1 means "untracked")
        track_id = -1
        if hasattr(result.boxes, "id") and result.boxes.id is not None:
            track_id = int(result.boxes.id[idx].cpu())

        # Extract mask; fall back to an all-zero mask if none is present
        mask = np.zeros((image.height, image.width), dtype=np.uint8)
        if result.masks is not None and idx < len(result.masks.data):
            mask_tensor = result.masks.data[idx]
            mask_np = mask_tensor.cpu().numpy()

            # Resize mask to image size if needed (model masks are often
            # produced at the network's input resolution)
            if mask_np.shape != (image.height, image.width):
                mask_np = cv2.resize(
                    mask_np.astype(np.float32),
                    (image.width, image.height),
                    interpolation=cv2.INTER_LINEAR,
                )

            # Binarize mask
            mask = (mask_np > 0.5).astype(np.uint8) * 255  # type: ignore[assignment]

        return cls(
            bbox=bbox,
            track_id=track_id,
            class_id=class_id,
            confidence=confidence,
            name=name,
            ts=image.ts,
            image=image,
            mask=mask,
        )

    def to_points_annotation(self) -> list[PointsAnnotation]:
        """Override to include mask outline."""
        annotations = super().to_points_annotation()

        # Find contours of the binary mask
        contours, _ = cv2.findContours(self.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        for contour in contours:
            # Simplify contour to reduce points (0.5% of perimeter tolerance)
            epsilon = 0.005 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)

            points = []
            for pt in approx:
                points.append(Point2(x=float(pt[0][0]), y=float(pt[0][1])))

            # A loop needs at least 3 vertices to be drawable
            if len(points) < 3:
                continue

            annotations.append(
                PointsAnnotation(
                    timestamp=to_ros_stamp(self.ts),
                    outline_color=Color.from_string(str(self.class_id), alpha=1.0, brightness=1.25),
                    fill_color=Color.from_string(str(self.track_id), alpha=0.4),
                    thickness=1.0,
                    points_length=len(points),
                    points=points,
                    type=PointsAnnotation.LINE_LOOP,
                )
            )

        return annotations
annotations -from dataclasses import dataclass +from dataclasses import dataclass, field import functools from typing import Any -from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3 +from dimos_lcm.vision_msgs import ObjectHypothesis, ObjectHypothesisWithPose + +from dimos.msgs.geometry_msgs import Pose, PoseStamped, Quaternion, Transform, Vector3 +from dimos.msgs.std_msgs import Header +from dimos.msgs.vision_msgs import Detection3D from dimos.perception.detection.type.detection2d import Detection2DBBox @@ -29,11 +33,11 @@ class Detection3DBBox(Detection2DBBox): Represents a 3D detection as an oriented bounding box in world space. """ - transform: Transform # Camera to world transform - frame_id: str # Frame ID (e.g., "world", "map") center: Vector3 # Center point in world frame size: Vector3 # Width, height, depth - orientation: tuple[float, float, float, float] # Quaternion (x, y, z, w) + transform: Transform | None = None # Camera to world transform + frame_id: str = "" # Frame ID (e.g., "world", "map") + orientation: Quaternion = field(default_factory=lambda: Quaternion(0.0, 0.0, 0.0, 1.0)) @functools.cached_property def pose(self) -> PoseStamped: @@ -48,8 +52,34 @@ def pose(self) -> PoseStamped: orientation=self.orientation, ) + def to_detection3d_msg(self) -> Detection3D: + """Convert to ROS Detection3D message.""" + msg = Detection3D() + msg.header = Header(self.ts, self.frame_id) + + # Results + msg.results = [ + ObjectHypothesisWithPose( + hypothesis=ObjectHypothesis( + class_id=str(self.class_id), + score=self.confidence, + ) + ) + ] + + # Bounding Box + msg.bbox.center = Pose( + position=self.center, + orientation=self.orientation, + ) + msg.bbox.size = self.size + + return msg + def to_repr_dict(self) -> dict[str, Any]: # Calculate distance from camera + if self.transform is None: + return super().to_repr_dict() camera_pos = self.transform.translation distance = (self.center - camera_pos).magnitude() diff --git 
a/dimos/perception/detection/type/detection3d/object.py b/dimos/perception/detection/type/detection3d/object.py new file mode 100644 index 0000000000..00d4d88661 --- /dev/null +++ b/dimos/perception/detection/type/detection3d/object.py @@ -0,0 +1,363 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +import time +from typing import TYPE_CHECKING, Any +import uuid + +import cv2 +from dimos_lcm.geometry_msgs import Pose +import numpy as np +import open3d as o3d # type: ignore[import-untyped] + +from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Vector3 +from dimos.msgs.sensor_msgs import Image, PointCloud2 +from dimos.msgs.std_msgs import Header +from dimos.msgs.vision_msgs import Detection3D as ROSDetection3D, Detection3DArray +from dimos.perception.detection.type.detection2d.seg import Detection2DSeg +from dimos.perception.detection.type.detection3d.base import Detection3D + +if TYPE_CHECKING: + from dimos_lcm.sensor_msgs import CameraInfo + + from dimos.perception.detection.type.detection2d import ImageDetections2D + + +@dataclass(kw_only=True) +class Object(Detection3D): + """3D object detection combining bounding box and pointcloud representations. + + Represents a detected object in 3D space with support for accumulating + multiple detections over time. 
+ """ + + object_id: str = field(default_factory=lambda: uuid.uuid4().hex[:8]) + center: Vector3 + size: Vector3 + pose: PoseStamped + pointcloud: PointCloud2 + camera_transform: Transform | None = None + mask: np.ndarray[Any, np.dtype[np.uint8]] | None = None + detections_count: int = 1 + + def update_object(self, other: Object) -> None: + """Update this object with data from another detection. + + Accumulates pointclouds by transforming the new pointcloud to world frame + and adding it to the existing pointcloud. Updates center and camera_transform, + and increments the detections_count. + + Args: + other: Another Object instance with newer detection data. + """ + # Accumulate pointclouds if transforms are available + if other.camera_transform is not None: + # Transform new pointcloud to world frame and add to existing + # transformed_pc = other.pointcloud.transform(other.camera_transform) + # self.pointcloud = self.pointcloud + transformed_pc + + # Recompute center from accumulated pointcloud + self.pointcloud = other.pointcloud + pc_center = other.pointcloud.center + self.center = Vector3(pc_center.x, pc_center.y, pc_center.z) + else: + # No transform available, just replace + self.pointcloud = other.pointcloud + self.center = other.center + + self.camera_transform = other.camera_transform + self.size = other.size + self.pose = other.pose + self.track_id = other.track_id + self.mask = other.mask + self.name = other.name + self.bbox = other.bbox + self.confidence = other.confidence + self.class_id = other.class_id + self.ts = other.ts + self.frame_id = other.frame_id + self.image = other.image + self.detections_count += 1 + + def get_oriented_bounding_box(self) -> Any: + """Get oriented bounding box of the pointcloud.""" + return self.pointcloud.get_oriented_bounding_box() + + def scene_entity_label(self) -> str: + """Get label for scene visualization.""" + if self.detections_count > 1: + return f"{self.name} ({self.detections_count})" + return 
f"{self.track_id}/{self.name} ({self.confidence:.0%})" + + def to_detection3d_msg(self) -> ROSDetection3D: + """Convert to ROS Detection3D message.""" + obb = self.get_oriented_bounding_box() # type: ignore[no-untyped-call] + orientation = Quaternion.from_rotation_matrix(obb.R) + + msg = ROSDetection3D() + msg.header = Header(self.ts, self.frame_id) + msg.id = str(self.track_id) + msg.bbox.center = Pose( + position=Vector3(obb.center[0], obb.center[1], obb.center[2]), + orientation=orientation, + ) + msg.bbox.size = Vector3(obb.extent[0], obb.extent[1], obb.extent[2]) + + return msg + + def agent_encode(self) -> dict[str, Any]: + """Encode for agent consumption.""" + return { + "id": self.track_id, + "name": self.name, + "detections": self.detections_count, + "last_seen": f"{round(time.time() - self.ts)}s ago", + } + + def to_dict(self) -> dict[str, Any]: + """Convert object to dictionary with all relevant data.""" + return { + "object_id": self.object_id, + "track_id": self.track_id, + "class_id": self.class_id, + "name": self.name, + "mask": self.mask, + "pointcloud": self.pointcloud.as_numpy(), + "image": self.image.as_numpy() if self.image else None, + } + + @classmethod + def from_2d_to_list( + cls, + detections_2d: ImageDetections2D[Detection2DSeg], + color_image: Image, + depth_image: Image, + camera_info: CameraInfo, + camera_transform: Transform | None = None, + depth_scale: float = 1.0, + depth_trunc: float = 10.0, + statistical_nb_neighbors: int = 10, + statistical_std_ratio: float = 0.5, + voxel_downsample: float = 0.005, + mask_erode_pixels: int = 3, + ) -> list[Object]: + """Create 3D Objects from 2D detections and RGBD images. + + Uses Open3D's optimized RGBD projection for efficient processing. + + Args: + detections_2d: 2D detections with segmentation masks + color_image: RGB color image + depth_image: Depth image (in meters if depth_scale=1.0) + camera_info: Camera intrinsics + camera_transform: Optional transform from camera frame to world frame. 
+ If provided, pointclouds will be transformed to world frame. + depth_scale: Scale factor for depth (1.0 for meters, 1000.0 for mm) + depth_trunc: Maximum depth value in meters + statistical_nb_neighbors: Neighbors for statistical outlier removal + statistical_std_ratio: Std ratio for statistical outlier removal + voxel_downsample: Voxel size (meters) for downsampling before filtering. Set <= 0 to skip. + mask_erode_pixels: Number of pixels to erode the mask by to remove + noisy depth edge points. Set to 0 to disable. + + Returns: + List of Object instances with pointclouds + """ + color_cv = color_image.to_opencv() + if color_cv.ndim == 3 and color_cv.shape[2] == 3: + color_cv = cv2.cvtColor(color_cv, cv2.COLOR_BGR2RGB) + + depth_cv = depth_image.to_opencv() + h, w = depth_cv.shape[:2] + + # Build Open3D camera intrinsics + fx, fy = camera_info.K[0], camera_info.K[4] + cx, cy = camera_info.K[2], camera_info.K[5] + intrinsic_o3d = o3d.camera.PinholeCameraIntrinsic(w, h, fx, fy, cx, cy) + + objects: list[Object] = [] + + for det in detections_2d.detections: + if isinstance(det, Detection2DSeg): + mask = det.mask + store_mask = det.mask + else: + mask = np.zeros((h, w), dtype=np.uint8) + x1, y1, x2, y2 = map(int, det.bbox) + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + mask[y1:y2, x1:x2] = 255 + store_mask = mask + + if mask_erode_pixels > 0: + mask_uint8 = mask.astype(np.uint8) + if mask_uint8.max() == 1: + mask_uint8 = mask_uint8 * 255 # type: ignore[assignment] + kernel_size = 2 * mask_erode_pixels + 1 + erode_kernel = cv2.getStructuringElement( + cv2.MORPH_ELLIPSE, (kernel_size, kernel_size) + ) + mask = cv2.erode(mask_uint8, erode_kernel) # type: ignore[assignment] + + depth_masked = depth_cv.copy() + depth_masked[mask == 0] = 0 + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + o3d.geometry.Image(color_cv.astype(np.uint8)), + o3d.geometry.Image(depth_masked.astype(np.float32)), + depth_scale=depth_scale, + 
depth_trunc=depth_trunc, + convert_rgb_to_intensity=False, + ) + pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic_o3d) + + pc0 = PointCloud2( + pcd, + frame_id=depth_image.frame_id, + ts=depth_image.ts, + ).voxel_downsample(voxel_downsample) + + pcd_filtered, _ = pc0.pointcloud.remove_statistical_outlier( + nb_neighbors=statistical_nb_neighbors, + std_ratio=statistical_std_ratio, + ) + + if len(pcd_filtered.points) < 10: + continue + + pc = PointCloud2( + pcd_filtered, + frame_id=depth_image.frame_id, + ts=depth_image.ts, + ) + + # Transform pointcloud to world frame if camera_transform is provided + if camera_transform is not None: + pc = pc.transform(camera_transform) + frame_id = camera_transform.frame_id + else: + frame_id = depth_image.frame_id + + # Compute center from pointcloud + obb = pc.pointcloud.get_oriented_bounding_box() + center = Vector3(obb.center[0], obb.center[1], obb.center[2]) + size = Vector3(obb.extent[0], obb.extent[1], obb.extent[2]) + orientation = Quaternion.from_rotation_matrix(obb.R) + pose = PoseStamped( + ts=det.ts, + frame_id=frame_id, + position=center, + orientation=orientation, + ) + + objects.append( + cls( + bbox=det.bbox, + track_id=det.track_id, + class_id=det.class_id, + confidence=det.confidence, + name=det.name, + ts=det.ts, + image=det.image, + frame_id=frame_id, + pointcloud=pc, + center=center, + size=size, + pose=pose, + camera_transform=camera_transform, + mask=store_mask, + ) + ) + + return objects + + +def aggregate_pointclouds(objects: list[Object]) -> PointCloud2: + """Aggregate all object pointclouds into a single colored pointcloud. + + Each object's points are colored based on its track_id. + + Args: + objects: List of Object instances with pointclouds + + Returns: + Combined PointCloud2 with all points colored by object (empty if no points). 
+ """ + if not objects: + return PointCloud2(pointcloud=o3d.geometry.PointCloud(), frame_id="", ts=0.0) + + all_points = [] + all_colors = [] + + for _i, obj in enumerate(objects): + points, colors = obj.pointcloud.as_numpy() + if len(points) == 0: + continue + + try: + seed = int(obj.object_id, 16) + except (ValueError, TypeError): + seed = abs(hash(obj.object_id)) + np.random.seed(abs(seed) % (2**32 - 1)) + track_color = np.random.randint(50, 255, 3) / 255.0 + + if colors is not None: + blended = np.clip(0.6 * colors + 0.4 * track_color, 0.0, 1.0) + else: + blended = np.tile(track_color, (len(points), 1)) + + all_points.append(points) + all_colors.append(blended) + + if not all_points: + return PointCloud2( + pointcloud=o3d.geometry.PointCloud(), frame_id=objects[0].frame_id, ts=objects[0].ts + ) + + combined_points = np.vstack(all_points) + combined_colors = np.vstack(all_colors) + + pc = PointCloud2.from_numpy( + combined_points, + frame_id=objects[0].frame_id, + timestamp=objects[0].ts, + ) + pcd = pc.pointcloud + pcd.colors = o3d.utility.Vector3dVector(combined_colors) + pc.pointcloud = pcd + + return pc + + +def to_detection3d_array(objects: list[Object]) -> Detection3DArray: + """Convert a list of Objects to a ROS Detection3DArray message. 
+ + Args: + objects: List of Object instances + + Returns: + Detection3DArray ROS message + """ + array = Detection3DArray() + + if objects: + array.header = Header(objects[0].ts, objects[0].frame_id) + + for obj in objects: + array.detections.append(obj.to_detection3d_msg()) + + return array diff --git a/dimos/perception/detection/type/detection3d/pointcloud.py b/dimos/perception/detection/type/detection3d/pointcloud.py index fd924a6564..7edceb17a5 100644 --- a/dimos/perception/detection/type/detection3d/pointcloud.py +++ b/dimos/perception/detection/type/detection3d/pointcloud.py @@ -14,7 +14,7 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field import functools from typing import TYPE_CHECKING, Any @@ -52,7 +52,7 @@ @dataclass class Detection3DPC(Detection3D): - pointcloud: PointCloud2 + pointcloud: PointCloud2 = field(default_factory=PointCloud2) @functools.cached_property def center(self) -> Vector3: diff --git a/dimos/perception/detection2d/utils.py b/dimos/perception/detection2d/utils.py deleted file mode 100644 index a505eef7c8..0000000000 --- a/dimos/perception/detection2d/utils.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from collections.abc import Sequence - -import cv2 -import numpy as np - - -def filter_detections( # type: ignore[no-untyped-def] - bboxes, - track_ids, - class_ids, - confidences, - names: Sequence[str], - class_filter=None, - name_filter=None, - track_id_filter=None, -): - """ - Filter detection results based on class IDs, names, and/or tracking IDs. - - Args: - bboxes: List of bounding boxes [x1, y1, x2, y2] - track_ids: List of tracking IDs - class_ids: List of class indices - confidences: List of detection confidences - names: List of class names - class_filter: List/set of class IDs to keep, or None to keep all - name_filter: List/set of class names to keep, or None to keep all - track_id_filter: List/set of track IDs to keep, or None to keep all - - Returns: - tuple: (filtered_bboxes, filtered_track_ids, filtered_class_ids, - filtered_confidences, filtered_names) - """ - # Convert filters to sets for efficient lookup - if class_filter is not None: - class_filter = set(class_filter) - if name_filter is not None: - name_filter = set(name_filter) - if track_id_filter is not None: - track_id_filter = set(track_id_filter) - - # Initialize lists for filtered results - filtered_bboxes = [] - filtered_track_ids = [] - filtered_class_ids = [] - filtered_confidences = [] - filtered_names = [] - - # Filter detections - for bbox, track_id, class_id, conf, name in zip( - bboxes, track_ids, class_ids, confidences, names, strict=False - ): - # Check if detection passes all specified filters - keep = True - - if class_filter is not None: - keep = keep and (class_id in class_filter) - - if name_filter is not None: - keep = keep and (name in name_filter) - - if track_id_filter is not None: - keep = keep and (track_id in track_id_filter) - - # If detection passes all filters, add it to results - if keep: - filtered_bboxes.append(bbox) - filtered_track_ids.append(track_id) - filtered_class_ids.append(class_id) - filtered_confidences.append(conf) - filtered_names.append(name) 
- - return ( - filtered_bboxes, - filtered_track_ids, - filtered_class_ids, - filtered_confidences, - filtered_names, - ) - - -def extract_detection_results(result, class_filter=None, name_filter=None, track_id_filter=None): # type: ignore[no-untyped-def] - """ - Extract and optionally filter detection information from a YOLO result object. - - Args: - result: Ultralytics result object - class_filter: List/set of class IDs to keep, or None to keep all - name_filter: List/set of class names to keep, or None to keep all - track_id_filter: List/set of track IDs to keep, or None to keep all - - Returns: - tuple: (bboxes, track_ids, class_ids, confidences, names) - - bboxes: list of [x1, y1, x2, y2] coordinates - - track_ids: list of tracking IDs - - class_ids: list of class indices - - confidences: list of detection confidences - - names: list of class names - """ - bboxes = [] # type: ignore[var-annotated] - track_ids = [] # type: ignore[var-annotated] - class_ids = [] # type: ignore[var-annotated] - confidences = [] # type: ignore[var-annotated] - names = [] # type: ignore[var-annotated] - - if result.boxes is None: - return bboxes, track_ids, class_ids, confidences, names - - for box in result.boxes: - # Extract bounding box coordinates - x1, y1, x2, y2 = box.xyxy[0].tolist() - - # Extract tracking ID if available - track_id = -1 - if hasattr(box, "id") and box.id is not None: - track_id = int(box.id[0].item()) - - # Extract class information - cls_idx = int(box.cls[0]) - name = result.names[cls_idx] - - # Extract confidence - conf = float(box.conf[0]) - - # Check filters before adding to results - keep = True - if class_filter is not None: - keep = keep and (cls_idx in class_filter) - if name_filter is not None: - keep = keep and (name in name_filter) - if track_id_filter is not None: - keep = keep and (track_id in track_id_filter) - - if keep: - bboxes.append([x1, y1, x2, y2]) - track_ids.append(track_id) - class_ids.append(cls_idx) - confidences.append(conf) - 
names.append(name) - - return bboxes, track_ids, class_ids, confidences, names - - -def plot_results( # type: ignore[no-untyped-def] - image, bboxes, track_ids, class_ids, confidences, names: Sequence[str], alpha: float = 0.5 -): - """ - Draw bounding boxes and labels on the image. - - Args: - image: Original input image - bboxes: List of bounding boxes [x1, y1, x2, y2] - track_ids: List of tracking IDs - class_ids: List of class indices - confidences: List of detection confidences - names: List of class names - alpha: Transparency of the overlay - - Returns: - Image with visualized detections - """ - vis_img = image.copy() - - for bbox, track_id, conf, name in zip(bboxes, track_ids, confidences, names, strict=False): - # Generate consistent color based on track_id or class name - if track_id != -1: - np.random.seed(track_id) - else: - np.random.seed(hash(name) % 100000) - color = np.random.randint(0, 255, (3,), dtype=np.uint8) - np.random.seed(None) - - # Draw bounding box - x1, y1, x2, y2 = map(int, bbox) - cv2.rectangle(vis_img, (x1, y1), (x2, y2), color.tolist(), 2) - - # Prepare label text - if track_id != -1: - label = f"ID:{track_id} {name} {conf:.2f}" - else: - label = f"{name} {conf:.2f}" - - # Calculate text size for background rectangle - (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - - # Draw background rectangle for text - cv2.rectangle(vis_img, (x1, y1 - text_h - 8), (x1 + text_w + 4, y1), color.tolist(), -1) - - # Draw text with white color for better visibility - cv2.putText( - vis_img, label, (x1 + 2, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1 - ) - - return vis_img - - -def calculate_depth_from_bbox(depth_map, bbox): # type: ignore[no-untyped-def] - """ - Calculate the average depth of an object within a bounding box. - Uses the 25th to 75th percentile range to filter outliers. 
- - Args: - depth_map: The depth map - bbox: Bounding box in format [x1, y1, x2, y2] - - Returns: - float: Average depth in meters, or None if depth estimation fails - """ - try: - # Extract region of interest from the depth map - x1, y1, x2, y2 = map(int, bbox) - roi_depth = depth_map[y1:y2, x1:x2] - - if roi_depth.size == 0: - return None - - # Calculate 25th and 75th percentile to filter outliers - p25 = np.percentile(roi_depth, 25) - p75 = np.percentile(roi_depth, 75) - - # Filter depth values within this range - filtered_depth = roi_depth[(roi_depth >= p25) & (roi_depth <= p75)] - - # Calculate average depth (convert to meters) - if filtered_depth.size > 0: - return np.mean(filtered_depth) / 1000.0 # Convert mm to meters - - return None - except Exception as e: - print(f"Error calculating depth from bbox: {e}") - return None - - -def calculate_distance_angle_from_bbox(bbox, depth: int, camera_intrinsics): # type: ignore[no-untyped-def] - """ - Calculate distance and angle to object center based on bbox and depth. 
- - Args: - bbox: Bounding box [x1, y1, x2, y2] - depth: Depth value in meters - camera_intrinsics: List [fx, fy, cx, cy] with camera parameters - - Returns: - tuple: (distance, angle) in meters and radians - """ - if camera_intrinsics is None: - raise ValueError("Camera intrinsics required for distance calculation") - - # Extract camera parameters - fx, _fy, cx, _cy = camera_intrinsics - - # Calculate center of bounding box in pixels - x1, y1, x2, y2 = bbox - center_x = (x1 + x2) / 2 - (y1 + y2) / 2 - - # Calculate normalized image coordinates - x_norm = (center_x - cx) / fx - - # Calculate angle (positive to the right) - angle = np.arctan(x_norm) - - # Calculate distance using depth and angle - distance = depth / np.cos(angle) if np.cos(angle) != 0 else depth - - return distance, angle - - -def calculate_object_size_from_bbox(bbox, depth: int, camera_intrinsics): # type: ignore[no-untyped-def] - """ - Estimate physical width and height of object in meters. - - Args: - bbox: Bounding box [x1, y1, x2, y2] - depth: Depth value in meters - camera_intrinsics: List [fx, fy, cx, cy] with camera parameters - - Returns: - tuple: (width, height) in meters - """ - if camera_intrinsics is None: - return 0.0, 0.0 - - fx, fy, _, _ = camera_intrinsics - - # Calculate bbox dimensions in pixels - x1, y1, x2, y2 = bbox - width_px = x2 - x1 - height_px = y2 - y1 - - # Convert to meters using similar triangles and depth - width_m = (width_px * depth) / fx - height_m = (height_px * depth) / fy - - return width_m, height_m diff --git a/dimos/perception/grasp_generation/__init__.py b/dimos/perception/grasp_generation/__init__.py deleted file mode 100644 index 16281fe0b6..0000000000 --- a/dimos/perception/grasp_generation/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .utils import * diff --git a/dimos/perception/grasp_generation/grasp_generation.py b/dimos/perception/grasp_generation/grasp_generation.py deleted file mode 100644 index 4f2e4b68a1..0000000000 --- 
a/dimos/perception/grasp_generation/grasp_generation.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Dimensional-hosted grasp generation for manipulation pipeline. -""" - -import asyncio - -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - -from dimos.perception.grasp_generation.utils import parse_grasp_results -from dimos.types.manipulation import ObjectData -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class HostedGraspGenerator: - """ - Dimensional-hosted grasp generator using WebSocket communication. - """ - - def __init__(self, server_url: str) -> None: - """ - Initialize Dimensional-hosted grasp generator. - - Args: - server_url: WebSocket URL for Dimensional-hosted grasp generator server - """ - self.server_url = server_url - logger.info(f"Initialized grasp generator with server: {server_url}") - - def generate_grasps_from_objects( - self, objects: list[ObjectData], full_pcd: o3d.geometry.PointCloud - ) -> list[dict]: # type: ignore[type-arg] - """ - Generate grasps from ObjectData objects using grasp generator. 
- - Args: - objects: List of ObjectData with point clouds - full_pcd: Open3D point cloud of full scene - - Returns: - Parsed grasp results as list of dictionaries - """ - try: - # Combine all point clouds - all_points = [] - all_colors = [] - valid_objects = 0 - - for obj in objects: - if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: - continue - - points = obj["point_cloud_numpy"] - if not isinstance(points, np.ndarray) or points.size == 0: - continue - - if len(points.shape) != 2 or points.shape[1] != 3: - continue - - colors = None - if "colors_numpy" in obj and obj["colors_numpy"] is not None: # type: ignore[typeddict-item] - colors = obj["colors_numpy"] # type: ignore[typeddict-item] - if isinstance(colors, np.ndarray) and colors.size > 0: - if ( - colors.shape[0] != points.shape[0] - or len(colors.shape) != 2 - or colors.shape[1] != 3 - ): - colors = None - - all_points.append(points) - if colors is not None: - all_colors.append(colors) - valid_objects += 1 - - if not all_points: - return [] - - # Combine point clouds - combined_points = np.vstack(all_points) - combined_colors = None - if len(all_colors) == valid_objects and len(all_colors) > 0: - combined_colors = np.vstack(all_colors) - - # Send grasp request - grasps = self._send_grasp_request_sync(combined_points, combined_colors) - - if not grasps: - return [] - - # Parse and return results in list of dictionaries format - return parse_grasp_results(grasps) - - except Exception as e: - logger.error(f"Grasp generation failed: {e}") - return [] - - def _send_grasp_request_sync( - self, - points: np.ndarray, # type: ignore[type-arg] - colors: np.ndarray | None, # type: ignore[type-arg] - ) -> list[dict] | None: # type: ignore[type-arg] - """Send synchronous grasp request to grasp server.""" - - try: - # Prepare colors - colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 - - # Ensure correct data types - points = points.astype(np.float32) - colors = colors.astype(np.float32) 
- - # Validate ranges - if np.any(np.isnan(points)) or np.any(np.isinf(points)): - logger.error("Points contain NaN or Inf values") - return None - if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): - logger.error("Colors contain NaN or Inf values") - return None - - colors = np.clip(colors, 0.0, 1.0) - - # Run async request in sync context - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - result = loop.run_until_complete(self._async_grasp_request(points, colors)) - return result - finally: - loop.close() - - except Exception as e: - logger.error(f"Error in synchronous grasp request: {e}") - return None - - async def _async_grasp_request( - self, - points: np.ndarray, # type: ignore[type-arg] - colors: np.ndarray, # type: ignore[type-arg] - ) -> list[dict] | None: # type: ignore[type-arg] - """Async grasp request helper.""" - import json - - import websockets - - try: - async with websockets.connect(self.server_url) as websocket: - request = { - "points": points.tolist(), - "colors": colors.tolist(), - "lims": [-1.0, 1.0, -1.0, 1.0, 0.0, 2.0], - } - - await websocket.send(json.dumps(request)) - response = await websocket.recv() - grasps = json.loads(response) - - if isinstance(grasps, dict) and "error" in grasps: - logger.error(f"Server returned error: {grasps['error']}") - return None - elif isinstance(grasps, int | float) and grasps == 0: - return None - elif not isinstance(grasps, list): - logger.error(f"Server returned unexpected response type: {type(grasps)}") - return None - elif len(grasps) == 0: - return None - - return self._convert_grasp_format(grasps) - - except Exception as e: - logger.error(f"Async grasp request failed: {e}") - return None - - def _convert_grasp_format(self, grasps: list[dict]) -> list[dict]: # type: ignore[type-arg] - """Convert Dimensional Grasp format to visualization format.""" - converted = [] - - for i, grasp in enumerate(grasps): - rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) 
- euler_angles = self._rotation_matrix_to_euler(rotation_matrix) - - converted_grasp = { - "id": f"grasp_{i}", - "score": grasp.get("score", 0.0), - "width": grasp.get("width", 0.0), - "height": grasp.get("height", 0.0), - "depth": grasp.get("depth", 0.0), - "translation": grasp.get("translation", [0, 0, 0]), - "rotation_matrix": rotation_matrix.tolist(), - "euler_angles": euler_angles, - } - converted.append(converted_grasp) - - converted.sort(key=lambda x: x["score"], reverse=True) - return converted - - def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> dict[str, float]: # type: ignore[type-arg] - """Convert rotation matrix to Euler angles (in radians).""" - sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) - - singular = sy < 1e-6 - - if not singular: - x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) - else: - x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = 0 - - return {"roll": x, "pitch": y, "yaw": z} - - def cleanup(self) -> None: - """Clean up resources.""" - logger.info("Grasp generator cleaned up") diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py deleted file mode 100644 index 492a3d1df4..0000000000 --- a/dimos/perception/grasp_generation/utils.py +++ /dev/null @@ -1,529 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utilities for grasp generation and visualization.""" - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - -from dimos.perception.common.utils import project_3d_points_to_2d - - -def create_gripper_geometry( - grasp_data: dict, # type: ignore[type-arg] - finger_length: float = 0.08, - finger_thickness: float = 0.004, -) -> list[o3d.geometry.TriangleMesh]: - """ - Create a simple fork-like gripper geometry from grasp data. - - Args: - grasp_data: Dictionary containing grasp parameters - - translation: 3D position list - - rotation_matrix: 3x3 rotation matrix defining gripper coordinate system - * X-axis: gripper width direction (opening/closing) - * Y-axis: finger length direction - * Z-axis: approach direction (toward object) - - width: Gripper opening width - finger_length: Length of gripper fingers (longer) - finger_thickness: Thickness of gripper fingers - base_height: Height of gripper base (longer) - color: RGB color for the gripper (solid blue) - - Returns: - List of Open3D TriangleMesh geometries for the gripper - """ - - translation = np.array(grasp_data["translation"]) - rotation_matrix = np.array(grasp_data["rotation_matrix"]) - - width = grasp_data.get("width", 0.04) - - # Create transformation matrix - transform = np.eye(4) - transform[:3, :3] = rotation_matrix - transform[:3, 3] = translation - - geometries = [] - - # Gripper dimensions - finger_width = 0.006 # Thickness of each finger - handle_length = 0.05 # Length of handle extending backward - - # Build gripper in local coordinate system: - # X-axis = width direction (left/right finger separation) - # Y-axis = finger length direction (fingers extend along +Y) - # Z-axis = approach direction (toward object, handle extends along -Z) - # IMPORTANT: Fingertips should be at origin (translation point) - - # Create left finger extending along +Y, positioned at 
+X - left_finger = o3d.geometry.TriangleMesh.create_box( - width=finger_width, # Thin finger - height=finger_length, # Extends along Y (finger length direction) - depth=finger_thickness, # Thin in Z direction - ) - left_finger.translate( - [ - width / 2 - finger_width / 2, # Position at +X (half width from center) - -finger_length, # Shift so fingertips are at origin - -finger_thickness / 2, # Center in Z - ] - ) - - # Create right finger extending along +Y, positioned at -X - right_finger = o3d.geometry.TriangleMesh.create_box( - width=finger_width, # Thin finger - height=finger_length, # Extends along Y (finger length direction) - depth=finger_thickness, # Thin in Z direction - ) - right_finger.translate( - [ - -width / 2 - finger_width / 2, # Position at -X (half width from center) - -finger_length, # Shift so fingertips are at origin - -finger_thickness / 2, # Center in Z - ] - ) - - # Create base connecting fingers - flat like a stickman body - base = o3d.geometry.TriangleMesh.create_box( - width=width + finger_width, # Full width plus finger thickness - height=finger_thickness, # Flat like fingers (stickman style) - depth=finger_thickness, # Thin like fingers - ) - base.translate( - [ - -width / 2 - finger_width / 2, # Start from left finger position - -finger_length - finger_thickness, # Behind fingers, adjusted for fingertips at origin - -finger_thickness / 2, # Center in Z - ] - ) - - # Create handle extending backward - flat stick like stickman arm - handle = o3d.geometry.TriangleMesh.create_box( - width=finger_width, # Same width as fingers - height=handle_length, # Extends backward along Y direction (same plane) - depth=finger_thickness, # Thin like fingers (same plane) - ) - handle.translate( - [ - -finger_width / 2, # Center in X - -finger_length - - finger_thickness - - handle_length, # Extend backward from base, adjusted for fingertips at origin - -finger_thickness / 2, # Same Z plane as other components - ] - ) - - # Use solid red color for all 
parts (user changed to red) - solid_color = [1.0, 0.0, 0.0] # Red color - - left_finger.paint_uniform_color(solid_color) - right_finger.paint_uniform_color(solid_color) - base.paint_uniform_color(solid_color) - handle.paint_uniform_color(solid_color) - - # Apply transformation to all parts - left_finger.transform(transform) - right_finger.transform(transform) - base.transform(transform) - handle.transform(transform) - - geometries.extend([left_finger, right_finger, base, handle]) - - return geometries - - -def create_all_gripper_geometries( - grasp_list: list[dict], # type: ignore[type-arg] - max_grasps: int = -1, -) -> list[o3d.geometry.TriangleMesh]: - """ - Create gripper geometries for multiple grasps. - - Args: - grasp_list: List of grasp dictionaries - max_grasps: Maximum number of grasps to visualize (-1 for all) - - Returns: - List of all gripper geometries - """ - all_geometries = [] - - grasps_to_show = grasp_list if max_grasps < 0 else grasp_list[:max_grasps] - - for grasp in grasps_to_show: - gripper_parts = create_gripper_geometry(grasp) - all_geometries.extend(gripper_parts) - - return all_geometries - - -def draw_grasps_on_image( - image: np.ndarray, # type: ignore[type-arg] - grasp_data: dict | dict[int | str, list[dict]] | list[dict], # type: ignore[type-arg] - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] # [fx, fy, cx, cy] or 3x3 matrix - max_grasps: int = -1, # -1 means show all grasps - finger_length: float = 0.08, # Match 3D gripper - finger_thickness: float = 0.004, # Match 3D gripper -) -> np.ndarray: # type: ignore[type-arg] - """ - Draw fork-like gripper visualizations on the image matching 3D gripper design. 
- - Args: - image: Base image to draw on - grasp_data: Can be: - - A single grasp dict - - A list of grasp dicts - - A dictionary mapping object IDs or "scene" to list of grasps - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - max_grasps: Maximum number of grasps to visualize (-1 for all) - finger_length: Length of gripper fingers (matches 3D design) - finger_thickness: Thickness of gripper fingers (matches 3D design) - - Returns: - Image with grasps drawn - """ - result = image.copy() - - # Convert camera intrinsics to 3x3 matrix if needed - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]) - else: - camera_matrix = np.array(camera_intrinsics) - - # Convert input to standard format - if isinstance(grasp_data, dict) and not any( - key in grasp_data for key in ["scene", 0, 1, 2, 3, 4, 5] - ): - # Single grasp - grasps_to_draw = [(grasp_data, 0)] - elif isinstance(grasp_data, list): - # List of grasps - grasps_to_draw = [(grasp, i) for i, grasp in enumerate(grasp_data)] - else: - # Dictionary of grasps by object ID - grasps_to_draw = [] - for _obj_id, grasps in grasp_data.items(): - for i, grasp in enumerate(grasps): - grasps_to_draw.append((grasp, i)) - - # Limit number of grasps if specified - if max_grasps > 0: - grasps_to_draw = grasps_to_draw[:max_grasps] - - # Define grasp colors (solid red to match 3D design) - def get_grasp_color(index: int) -> tuple: # type: ignore[type-arg] - # Use solid red color for all grasps to match 3D design - return (0, 0, 255) # Red in BGR format for OpenCV - - # Draw each grasp - for grasp, index in grasps_to_draw: - try: - color = get_grasp_color(index) - thickness = max(1, 4 - index // 3) - - # Extract grasp parameters (using translation and rotation_matrix) - if "translation" not in grasp or "rotation_matrix" not in grasp: - continue - - translation = 
np.array(grasp["translation"]) - rotation_matrix = np.array(grasp["rotation_matrix"]) - width = grasp.get("width", 0.04) - - # Match 3D gripper dimensions - finger_width = 0.006 # Thickness of each finger (matches 3D) - handle_length = 0.05 # Length of handle extending backward (matches 3D) - - # Create gripper geometry in local coordinate system matching 3D design: - # X-axis = width direction (left/right finger separation) - # Y-axis = finger length direction (fingers extend along +Y) - # Z-axis = approach direction (toward object, handle extends along -Z) - # IMPORTANT: Fingertips should be at origin (translation point) - - # Left finger extending along +Y, positioned at +X - left_finger_points = np.array( - [ - [ - width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back left - [ - width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back right - [ - width / 2 + finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front right (at origin) - [ - width / 2 - finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front left (at origin) - ] - ) - - # Right finger extending along +Y, positioned at -X - right_finger_points = np.array( - [ - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back left - [ - -width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back right - [ - -width / 2 + finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front right (at origin) - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front left (at origin) - ] - ) - - # Base connecting fingers - flat rectangle behind fingers - base_points = np.array( - [ - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length - finger_thickness, 
- -finger_thickness / 2, - ], # Back left - [ - width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length - finger_thickness, - -finger_thickness / 2, - ], # Back right - [ - width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Front right - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Front left - ] - ) - - # Handle extending backward - thin rectangle - handle_points = np.array( - [ - [ - -finger_width / 2, - -finger_length - finger_thickness - handle_length, - -finger_thickness / 2, - ], # Back left - [ - finger_width / 2, - -finger_length - finger_thickness - handle_length, - -finger_thickness / 2, - ], # Back right - [ - finger_width / 2, - -finger_length - finger_thickness, - -finger_thickness / 2, - ], # Front right - [ - -finger_width / 2, - -finger_length - finger_thickness, - -finger_thickness / 2, - ], # Front left - ] - ) - - # Transform all points to world frame - def transform_points(points): # type: ignore[no-untyped-def] - # Apply rotation and translation - world_points = (rotation_matrix @ points.T).T + translation - return world_points - - left_finger_world = transform_points(left_finger_points) # type: ignore[no-untyped-call] - right_finger_world = transform_points(right_finger_points) # type: ignore[no-untyped-call] - base_world = transform_points(base_points) # type: ignore[no-untyped-call] - handle_world = transform_points(handle_points) # type: ignore[no-untyped-call] - - # Project to 2D - left_finger_2d = project_3d_points_to_2d(left_finger_world, camera_matrix) - right_finger_2d = project_3d_points_to_2d(right_finger_world, camera_matrix) - base_2d = project_3d_points_to_2d(base_world, camera_matrix) - handle_2d = project_3d_points_to_2d(handle_world, camera_matrix) - - # Draw left finger - pts = left_finger_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw right finger - pts = 
right_finger_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw base - pts = base_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw handle - pts = handle_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw grasp center (fingertips at origin) - center_2d = project_3d_points_to_2d(translation.reshape(1, -1), camera_matrix)[0] - cv2.circle(result, tuple(center_2d.astype(int)), 3, color, -1) - - except Exception: - # Skip this grasp if there's an error - continue - - return result - - -def get_standard_coordinate_transform(): # type: ignore[no-untyped-def] - """ - Get a standard coordinate transformation matrix for consistent visualization. - - This transformation ensures that: - - X (red) axis points right - - Y (green) axis points up - - Z (blue) axis points toward viewer - - Returns: - 4x4 transformation matrix - """ - # Standard transformation matrix to ensure consistent coordinate frame orientation - transform = np.array( - [ - [1, 0, 0, 0], # X points right - [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) - [0, 0, -1, 0], # Z points toward viewer (flip depth) - [0, 0, 0, 1], - ] - ) - return transform - - -def visualize_grasps_3d( - point_cloud: o3d.geometry.PointCloud, - grasp_list: list[dict], # type: ignore[type-arg] - max_grasps: int = -1, -) -> None: - """ - Visualize grasps in 3D with point cloud. 
- - Args: - point_cloud: Open3D point cloud - grasp_list: List of grasp dictionaries - max_grasps: Maximum number of grasps to visualize - """ - # Apply standard coordinate transformation - transform = get_standard_coordinate_transform() # type: ignore[no-untyped-call] - - # Transform point cloud - pc_copy = o3d.geometry.PointCloud(point_cloud) - pc_copy.transform(transform) - geometries = [pc_copy] - - # Transform gripper geometries - gripper_geometries = create_all_gripper_geometries(grasp_list, max_grasps) - for geom in gripper_geometries: - geom.transform(transform) - geometries.extend(gripper_geometries) - - # Add transformed coordinate frame - origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - origin_frame.transform(transform) - geometries.append(origin_frame) - - o3d.visualization.draw_geometries(geometries, window_name="3D Grasp Visualization") - - -def parse_grasp_results(grasps: list[dict]) -> list[dict]: # type: ignore[type-arg] - """ - Parse grasp results into visualization format. 
- - Args: - grasps: List of grasp dictionaries - - Returns: - List of dictionaries containing: - - id: Unique grasp identifier - - score: Confidence score (float) - - width: Gripper opening width (float) - - translation: 3D position [x, y, z] - - rotation_matrix: 3x3 rotation matrix as nested list - """ - if not grasps: - return [] - - parsed_grasps = [] - - for i, grasp in enumerate(grasps): - # Extract data from each grasp - translation = grasp.get("translation", [0, 0, 0]) - rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) - score = float(grasp.get("score", 0.0)) - width = float(grasp.get("width", 0.08)) - - parsed_grasp = { - "id": f"grasp_{i}", - "score": score, - "width": width, - "translation": translation, - "rotation_matrix": rotation_matrix.tolist(), - } - parsed_grasps.append(parsed_grasp) - - return parsed_grasps - - -def create_grasp_overlay( - rgb_image: np.ndarray, # type: ignore[type-arg] - grasps: list[dict], # type: ignore[type-arg] - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] -) -> np.ndarray: # type: ignore[type-arg] - """ - Create grasp visualization overlay on RGB image. - - Args: - rgb_image: RGB input image - grasps: List of grasp dictionaries in viz format - camera_intrinsics: Camera parameters - - Returns: - RGB image with grasp overlay - """ - try: - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - result_bgr = draw_grasps_on_image( - bgr_image, - grasps, - camera_intrinsics, - max_grasps=-1, - ) - return cv2.cvtColor(result_bgr, cv2.COLOR_BGR2RGB) - except Exception: - return rgb_image.copy() diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py deleted file mode 100644 index 4d93e3ddd4..0000000000 --- a/dimos/perception/object_detection_stream.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import numpy as np -from reactivex import Observable, operators as ops - -from dimos.perception.detection2d.yolo_2d_det import ( # type: ignore[import-not-found, import-untyped] - Yolo2DDetector, -) - -try: - from dimos.perception.detection2d.detic_2d_det import ( # type: ignore[import-not-found, import-untyped] - Detic2DDetector, - ) - - DETIC_AVAILABLE = True -except (ModuleNotFoundError, ImportError): - DETIC_AVAILABLE = False - Detic2DDetector = None -from collections.abc import Callable -from typing import TYPE_CHECKING - -from dimos.models.depth.metric3d import Metric3D -from dimos.perception.common.utils import draw_object_detection_visualization -from dimos.perception.detection2d.utils import ( # type: ignore[attr-defined] - calculate_depth_from_bbox, - calculate_object_size_from_bbox, - calculate_position_rotation_from_bbox, -) -from dimos.types.vector import Vector -from dimos.utils.logging_config import setup_logger -from dimos.utils.transform_utils import transform_robot_to_map # type: ignore[attr-defined] - -if TYPE_CHECKING: - from dimos.types.manipulation import ObjectData - -# Initialize logger for the ObjectDetectionStream -logger = setup_logger() - - -class ObjectDetectionStream: - """ - A stream processor that: - 1. Detects objects using a Detector (Detic or Yolo) - 2. Estimates depth using Metric3D - 3. Calculates 3D position and dimensions using camera intrinsics - 4. 
Transforms coordinates to map frame - 5. Draws bounding boxes and segmentation masks on the frame - - Provides a stream of structured object data with position and rotation information. - """ - - def __init__( # type: ignore[no-untyped-def] - self, - camera_intrinsics=None, # [fx, fy, cx, cy] - device: str = "cuda", - gt_depth_scale: float = 1000.0, - min_confidence: float = 0.7, - class_filter=None, # Optional list of class names to filter (e.g., ["person", "car"]) - get_pose: Callable | None = None, # type: ignore[type-arg] # Optional function to transform coordinates to map frame - detector: Detic2DDetector | Yolo2DDetector | None = None, - video_stream: Observable = None, # type: ignore[assignment, type-arg] - disable_depth: bool = False, # Flag to disable monocular Metric3D depth estimation - draw_masks: bool = False, # Flag to enable drawing segmentation masks - ) -> None: - """ - Initialize the ObjectDetectionStream. - - Args: - camera_intrinsics: List [fx, fy, cx, cy] with camera parameters - device: Device to run inference on ("cuda" or "cpu") - gt_depth_scale: Ground truth depth scale for Metric3D - min_confidence: Minimum confidence for detections - class_filter: Optional list of class names to filter - get_pose: Optional function to transform pose to map coordinates - detector: Optional detector instance (Detic or Yolo) - video_stream: Observable of video frames to process (if provided, returns a stream immediately) - disable_depth: Flag to disable monocular Metric3D depth estimation - draw_masks: Flag to enable drawing segmentation masks - """ - self.min_confidence = min_confidence - self.class_filter = class_filter - self.get_pose = get_pose - self.disable_depth = disable_depth - self.draw_masks = draw_masks - # Initialize object detector - if detector is not None: - self.detector = detector - else: - if DETIC_AVAILABLE: - try: - self.detector = Detic2DDetector(vocabulary=None, threshold=min_confidence) - logger.info("Using Detic2DDetector") - except 
Exception as e: - logger.warning( - f"Failed to initialize Detic2DDetector: {e}. Falling back to Yolo2DDetector." - ) - self.detector = Yolo2DDetector() - else: - logger.info("Detic not available. Using Yolo2DDetector.") - self.detector = Yolo2DDetector() - # Set up camera intrinsics - self.camera_intrinsics = camera_intrinsics - - # Initialize depth estimation model - self.depth_model = None - if not disable_depth: - try: - self.depth_model = Metric3D(gt_depth_scale=gt_depth_scale) - - if camera_intrinsics is not None: - self.depth_model.update_intrinsic(camera_intrinsics) # type: ignore[no-untyped-call] - - # Create 3x3 camera matrix for calculations - fx, fy, cx, cy = camera_intrinsics - self.camera_matrix = np.array( - [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32 - ) - else: - raise ValueError("camera_intrinsics must be provided") - - logger.info("Depth estimation enabled with Metric3D") - except Exception as e: - logger.warning(f"Failed to initialize Metric3D depth model: {e}") - logger.warning("Falling back to disable_depth=True mode") - self.disable_depth = True - self.depth_model = None - else: - logger.info("Depth estimation disabled") - - # If video_stream is provided, create and store the stream immediately - self.stream = None - if video_stream is not None: - self.stream = self.create_stream(video_stream) - - def create_stream(self, video_stream: Observable) -> Observable: # type: ignore[type-arg] - """ - Create an Observable stream of object data from a video stream. 
- - Args: - video_stream: Observable that emits video frames - - Returns: - Observable that emits dictionaries containing object data - with position and rotation information - """ - - def process_frame(frame): # type: ignore[no-untyped-def] - # TODO: More modular detector output interface - bboxes, track_ids, class_ids, confidences, names, *mask_data = ( # type: ignore[misc] - *self.detector.process_image(frame), - [], - ) - - masks = ( - mask_data[0] # type: ignore[has-type] - if mask_data and len(mask_data[0]) == len(bboxes) # type: ignore[has-type] - else [None] * len(bboxes) # type: ignore[has-type] - ) - - # Create visualization - viz_frame = frame.copy() - - # Process detections - objects = [] - if not self.disable_depth: - depth_map = self.depth_model.infer_depth(frame) # type: ignore[union-attr] - depth_map = np.array(depth_map) - else: - depth_map = None - - for i, bbox in enumerate(bboxes): # type: ignore[has-type] - # Skip if confidence is too low - if i < len(confidences) and confidences[i] < self.min_confidence: # type: ignore[has-type] - continue - - # Skip if class filter is active and class not in filter - class_name = names[i] if i < len(names) else None # type: ignore[has-type] - if self.class_filter and class_name not in self.class_filter: - continue - - if not self.disable_depth and depth_map is not None: - # Get depth for this object - depth = calculate_depth_from_bbox(depth_map, bbox) # type: ignore[no-untyped-call] - if depth is None: - # Skip objects with invalid depth - continue - # Calculate object position and rotation - position, rotation = calculate_position_rotation_from_bbox( - bbox, depth, self.camera_intrinsics - ) - # Get object dimensions - width, height = calculate_object_size_from_bbox( - bbox, depth, self.camera_intrinsics - ) - - # Transform to map frame if a transform function is provided - try: - if self.get_pose: - # position and rotation are already Vector objects, no need to convert - robot_pose = self.get_pose() - 
position, rotation = transform_robot_to_map( - robot_pose["position"], robot_pose["rotation"], position, rotation - ) - except Exception as e: - logger.error(f"Error transforming to map frame: {e}") - position, rotation = position, rotation - - else: - depth = -1 - position = Vector(0, 0, 0) # type: ignore[arg-type] - rotation = Vector(0, 0, 0) # type: ignore[arg-type] - width = -1 - height = -1 - - # Create a properly typed ObjectData instance - object_data: ObjectData = { - "object_id": track_ids[i] if i < len(track_ids) else -1, # type: ignore[has-type] - "bbox": bbox, - "depth": depth, - "confidence": confidences[i] if i < len(confidences) else None, # type: ignore[has-type, typeddict-item] - "class_id": class_ids[i] if i < len(class_ids) else None, # type: ignore[has-type, typeddict-item] - "label": class_name, # type: ignore[typeddict-item] - "position": position, - "rotation": rotation, - "size": {"width": width, "height": height}, - "segmentation_mask": masks[i], - } - - objects.append(object_data) - - # Create visualization using common function - viz_frame = draw_object_detection_visualization( - viz_frame, objects, draw_masks=self.draw_masks, font_scale=1.5 - ) - - return {"frame": frame, "viz_frame": viz_frame, "objects": objects} - - self.stream = video_stream.pipe(ops.map(process_frame)) - - return self.stream - - def get_stream(self): # type: ignore[no-untyped-def] - """ - Returns the current detection stream if available. - Creates a new one with the provided video_stream if not already created. - - Returns: - Observable: The reactive stream of detection results - """ - if self.stream is None: - raise ValueError( - "Stream not initialized. Either provide a video_stream during initialization or call create_stream first." - ) - return self.stream - - def get_formatted_stream(self): # type: ignore[no-untyped-def] - """ - Returns a formatted stream of object detection data for better readability. 
- This is especially useful for LLMs like Claude that need structured text input. - - Returns: - Observable: A stream of formatted string representations of object data - """ - if self.stream is None: - raise ValueError( - "Stream not initialized. Either provide a video_stream during initialization or call create_stream first." - ) - - def format_detection_data(result): # type: ignore[no-untyped-def] - # Extract objects from result - objects = result.get("objects", []) - - if not objects: - return "No objects detected." - - formatted_data = "[DETECTED OBJECTS]\n" - try: - for i, obj in enumerate(objects): - pos = obj["position"] - rot = obj["rotation"] - size = obj["size"] - bbox = obj["bbox"] - - # Format each object with a multiline f-string for better readability - bbox_str = f"[{bbox[0]}, {bbox[1]}, {bbox[2]}, {bbox[3]}]" - formatted_data += ( - f"Object {i + 1}: {obj['label']}\n" - f" ID: {obj['object_id']}\n" - f" Confidence: {obj['confidence']:.2f}\n" - f" Position: x={pos.x:.2f}m, y={pos.y:.2f}m, z={pos.z:.2f}m\n" - f" Rotation: yaw={rot.z:.2f} rad\n" - f" Size: width={size['width']:.2f}m, height={size['height']:.2f}m\n" - f" Depth: {obj['depth']:.2f}m\n" - f" Bounding box: {bbox_str}\n" - "----------------------------------\n" - ) - except Exception as e: - logger.warning(f"Error formatting object {i}: {e}") - formatted_data += f"Object {i + 1}: [Error formatting data]" - formatted_data += "\n----------------------------------\n" - - return formatted_data - - # Return a new stream with the formatter applied - return self.stream.pipe(ops.map(format_detection_data)) - - def cleanup(self) -> None: - """Clean up resources.""" - pass diff --git a/dimos/perception/object_scene_registration.py b/dimos/perception/object_scene_registration.py new file mode 100644 index 0000000000..c21e31bf33 --- /dev/null +++ b/dimos/perception/object_scene_registration.py @@ -0,0 +1,247 @@ +# Copyright 2025-2026 Dimensional Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from typing import Any + +import numpy as np +from numpy.typing import NDArray + +from dimos.core import In, Out, rpc +from dimos.core.skill_module import SkillModule +from dimos.msgs.foxglove_msgs import ImageAnnotations +from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2 +from dimos.msgs.std_msgs import Header +from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray +from dimos.perception.detection.detectors.yoloe import Yoloe2DDetector, YoloePromptMode +from dimos.perception.detection.objectDB import ObjectDB +from dimos.perception.detection.type import ImageDetections2D +from dimos.perception.detection.type.detection3d.object import ( + Object, + aggregate_pointclouds, + to_detection3d_array, +) +from dimos.protocol.skill.skill import skill +from dimos.types.timestamped import align_timestamped +from dimos.utils.logging_config import setup_logger +from dimos.utils.reactive import backpressure + +logger = setup_logger() + + +class ObjectSceneRegistrationModule(SkillModule): + """Module for detecting objects in camera images using YOLO-E with 2D and 3D detection.""" + + color_image: In[Image] + depth_image: In[Image] + camera_info: In[CameraInfo] + + detections_2d: Out[Detection2DArray] + detections_3d: Out[Detection3DArray] + overlay: Out[ImageAnnotations] + pointcloud: Out[PointCloud2] + + _detector: Yoloe2DDetector | None = None + _camera_info: CameraInfo | None = None + 
_object_db: ObjectDB + + def __init__( + self, + target_frame: str = "map", + prompt_mode: YoloePromptMode = YoloePromptMode.LRPC, + ) -> None: + super().__init__() + self._target_frame = target_frame + self._prompt_mode = prompt_mode + self._object_db = ObjectDB() + + @rpc + def start(self) -> None: + super().start() + + if self._prompt_mode == YoloePromptMode.LRPC: + model_name = "yoloe-11l-seg-pf.pt" + else: + model_name = "yoloe-11l-seg.pt" + + self._detector = Yoloe2DDetector( + model_name=model_name, + prompt_mode=self._prompt_mode, + ) + + self.camera_info.subscribe(lambda msg: setattr(self, "_camera_info", msg)) + + aligned_frames = align_timestamped( + self.color_image.observable(), # type: ignore[no-untyped-call] + self.depth_image.observable(), # type: ignore[no-untyped-call] + buffer_size=2.0, + match_tolerance=0.1, + ) + backpressure(aligned_frames).subscribe(self._on_aligned_frames) + + @rpc + def stop(self) -> None: + """Stop the module and clean up resources.""" + + if self._detector: + self._detector.stop() + self._detector = None + + self._object_db.clear() + + logger.info("ObjectSceneRegistrationModule stopped") + super().stop() + + @rpc + def set_prompts( + self, + text: list[str] | None = None, + bboxes: NDArray[np.float64] | None = None, + ) -> None: + """Set prompts for detection. 
Provide either text or bboxes, not both.""" + if self._detector is not None: + self._detector.set_prompts(text=text, bboxes=bboxes) + + @rpc + def select_object(self, track_id: int) -> dict[str, Any] | None: + """Get object data by track_id and promote to permanent.""" + for obj in self._object_db.get_all_objects(): + if obj.track_id == track_id: + self._object_db.promote(obj.object_id) + return obj.to_dict() + return None + + @rpc + def get_object_track_ids(self) -> list[int]: + """Get track_ids of all permanent objects.""" + return [obj.track_id for obj in self._object_db.get_all_objects()] + + @skill() + def detect(self, *prompts: str) -> str: + """Detect objects matching the given text prompts. Returns track_ids after 2 seconds of detection. + + Do NOT call this tool multiple times for one query. Pass all objects in a single call. + For example, to detect a cup and mouse, call detect("cup", "mouse") not detect("cup") then detect("mouse"). + + Args: + prompts (str): Text descriptions of objects to detect (e.g., "person", "car", "dog") + + Returns: + str: A message containing the track_ids of detected objects + + Example: + detect("person", "car", "dog") + detect("person") + """ + if not prompts: + return "No prompts provided." + if self._detector is None: + return "Detector not initialized." + + self._detector.set_prompts(text=list(prompts)) + time.sleep(2.0) + + track_ids = self.get_object_track_ids() + if not track_ids: + return "No objects detected." + return f"Detected objects with track_ids: {track_ids}" + + @skill() + def select(self, track_id: int) -> str: + """Select an object by track_id and promote it to permanent. + + Example: + select(5) + """ + result = self.select_object(track_id) + if result is None: + return f"No object found with track_id {track_id}." 
+ return f"Selected object {track_id}: {result['name']}" + + def _on_aligned_frames(self, frames) -> None: # type: ignore[no-untyped-def] + color_msg, depth_msg = frames + self._process_images(color_msg, depth_msg) + + def _process_images(self, color_msg: Image, depth_msg: Image) -> None: + """Process synchronized color and depth images (runs in background thread).""" + if not self._detector or not self._camera_info: + return + + color_image = color_msg + depth_image = depth_msg.to_depth_meters() + + # Run 2D detection + detections_2d: ImageDetections2D[Any] = self._detector.process_image(color_image) + + detections_2d_msg = Detection2DArray( + detections_length=len(detections_2d.detections), + header=Header(color_image.ts, color_image.frame_id or ""), + detections=[det.to_ros_detection2d() for det in detections_2d.detections], + ) + self.detections_2d.publish(detections_2d_msg) + + overlay_annotations = detections_2d.to_foxglove_annotations() + self.overlay.publish(overlay_annotations) + + # Process 3D detections + self._process_3d_detections(detections_2d, color_image, depth_image) + + def _process_3d_detections( + self, + detections_2d: ImageDetections2D[Any], + color_image: Image, + depth_image: Image, + ) -> None: + """Convert 2D detections to 3D and publish.""" + if self._camera_info is None: + return + + # Look up transform from camera frame to target frame (e.g., map) + camera_transform = None + if self._target_frame != color_image.frame_id: + camera_transform = self.tf.get( + self._target_frame, + color_image.frame_id, + color_image.ts, + 0.1, + ) + if camera_transform is None: + logger.warning("Failed to lookup transform from camera frame to target frame") + return + + objects = Object.from_2d_to_list( + detections_2d=detections_2d, + color_image=color_image, + depth_image=depth_image, + camera_info=self._camera_info, + camera_transform=camera_transform, + ) + if not objects: + return + + # Add objects to spatial memory database + objects = 
self._object_db.add_objects(objects) + + detections_3d = to_detection3d_array(objects) + self.detections_3d.publish(detections_3d) + + objects_for_pc = self._object_db.get_objects() + aggregated_pc = aggregate_pointclouds(objects_for_pc) + self.pointcloud.publish(aggregated_pc) + return + + +object_scene_registration_module = ObjectSceneRegistrationModule.blueprint + +__all__ = ["ObjectSceneRegistrationModule", "object_scene_registration_module"] diff --git a/dimos/perception/object_tracker.py b/dimos/perception/object_tracker.py index 9260003ce2..54a5873435 100644 --- a/dimos/perception/object_tracker.py +++ b/dimos/perception/object_tracker.py @@ -28,7 +28,6 @@ from reactivex.disposable import Disposable from dimos.core import In, Module, ModuleConfig, Out, rpc -from dimos.manipulation.visual_servoing.utils import visualize_detections_3d from dimos.msgs.geometry_msgs import Pose, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import ( CameraInfo, @@ -540,10 +539,13 @@ def _process_tracking(self) -> None: y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2 bbox_2d = [[x1, y1, x2, y2]] - # Create visualization - viz_image = visualize_detections_3d( - frame, detections_3d, show_coordinates=True, bboxes_2d=bbox_2d - ) + # Use frame directly for visualization + viz_image = frame.copy() + + # Draw bounding boxes + for bbox in bbox_2d: + x1, y1, x2, y2 = map(int, bbox) + cv2.rectangle(viz_image, (x1, y1), (x2, y2), (0, 255, 0), 2) # Overlay REID feature matches if available if self.last_good_matches and self.last_roi_kps and self.last_roi_bbox: diff --git a/dimos/perception/object_tracker_3d.py b/dimos/perception/object_tracker_3d.py index 22846e1e2f..fa6361ac65 100644 --- a/dimos/perception/object_tracker_3d.py +++ b/dimos/perception/object_tracker_3d.py @@ -14,6 +14,7 @@ # Import LCM messages +import cv2 from dimos_lcm.sensor_msgs import CameraInfo from dimos_lcm.vision_msgs import ( Detection3D, @@ -22,7 +23,6 @@ import numpy as np from dimos.core 
import In, Out, rpc -from dimos.manipulation.visual_servoing.utils import visualize_detections_3d from dimos.msgs.geometry_msgs import Pose, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import Image, ImageFormat from dimos.msgs.std_msgs import Header @@ -145,10 +145,13 @@ def _process_tracking(self) -> None: y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2 bbox_2d = [[x1, y1, x2, y2]] - # Create 3D visualization - viz_image = visualize_detections_3d( - frame, detection_3d.detections, show_coordinates=True, bboxes_2d=bbox_2d - ) + # Use frame directly for visualization + viz_image = frame.copy() + + # Draw bounding boxes + for bbox in bbox_2d: + x1, y1, x2, y2 = map(int, bbox) + cv2.rectangle(viz_image, (x1, y1), (x2, y2), (0, 255, 0), 2) # Overlay Re-ID matches if self.last_good_matches and self.last_roi_kps and self.last_roi_bbox: diff --git a/dimos/perception/person_tracker.py b/dimos/perception/person_tracker.py deleted file mode 100644 index a138467850..0000000000 --- a/dimos/perception/person_tracker.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import cv2 -import numpy as np -from reactivex import Observable, interval, operators as ops -from reactivex.disposable import Disposable - -from dimos.core import In, Module, Out, rpc -from dimos.msgs.sensor_msgs import Image -from dimos.perception.common.ibvs import PersonDistanceEstimator -from dimos.perception.detection2d.utils import filter_detections -from dimos.perception.detection2d.yolo_2d_det import ( # type: ignore[import-not-found, import-untyped] - Yolo2DDetector, -) -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class PersonTrackingStream(Module): - """Module for person tracking with LCM input/output.""" - - # LCM inputs - video: In[Image] - - # LCM outputs - tracking_data: Out[dict] # type: ignore[type-arg] - - def __init__( # type: ignore[no-untyped-def] - self, - camera_intrinsics=None, - camera_pitch: float = 0.0, - camera_height: float = 1.0, - ) -> None: - """ - Initialize a person tracking stream using Yolo2DDetector and PersonDistanceEstimator. 
- - Args: - camera_intrinsics: List in format [fx, fy, cx, cy] where: - - fx: Focal length in x direction (pixels) - - fy: Focal length in y direction (pixels) - - cx: Principal point x-coordinate (pixels) - - cy: Principal point y-coordinate (pixels) - camera_pitch: Camera pitch angle in radians (positive is up) - camera_height: Height of the camera from the ground in meters - """ - # Call parent Module init - super().__init__() - - self.camera_intrinsics = camera_intrinsics - self.camera_pitch = camera_pitch - self.camera_height = camera_height - - self.detector = Yolo2DDetector() - - # Initialize distance estimator - if camera_intrinsics is None: - raise ValueError("Camera intrinsics are required for distance estimation") - - # Validate camera intrinsics format [fx, fy, cx, cy] - if ( - not isinstance(camera_intrinsics, list | tuple | np.ndarray) - or len(camera_intrinsics) != 4 - ): - raise ValueError("Camera intrinsics must be provided as [fx, fy, cx, cy]") - - # Convert [fx, fy, cx, cy] to 3x3 camera matrix - fx, fy, cx, cy = camera_intrinsics - K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) - - self.distance_estimator = PersonDistanceEstimator( - K=K, camera_pitch=camera_pitch, camera_height=camera_height - ) - - # For tracking latest frame data - self._latest_frame: np.ndarray | None = None # type: ignore[type-arg] - self._process_interval = 0.1 # Process at 10Hz - - # Tracking state - starts disabled - self._tracking_enabled = False - - @rpc - def start(self) -> None: - """Start the person tracking module and subscribe to LCM streams.""" - - super().start() - - # Subscribe to video stream - def set_video(image_msg: Image) -> None: - if hasattr(image_msg, "data"): - self._latest_frame = image_msg.data - else: - logger.warning("Received image message without data attribute") - - unsub = self.video.subscribe(set_video) - self._disposables.add(Disposable(unsub)) - - # Start periodic processing - unsub = 
interval(self._process_interval).subscribe(lambda _: self._process_frame()) # type: ignore[assignment] - self._disposables.add(unsub) # type: ignore[arg-type] - - logger.info("PersonTracking module started and subscribed to LCM streams") - - @rpc - def stop(self) -> None: - super().stop() - - def _process_frame(self) -> None: - """Process the latest frame if available.""" - if self._latest_frame is None: - return - - # Only process and publish if tracking is enabled - if not self._tracking_enabled: - return - - # Process frame through tracking pipeline - result = self._process_tracking(self._latest_frame) # type: ignore[no-untyped-call] - - # Publish result to LCM - if result: - self.tracking_data.publish(result) - - def _process_tracking(self, frame): # type: ignore[no-untyped-def] - """Process a single frame for person tracking.""" - # Detect people in the frame - bboxes, track_ids, class_ids, confidences, names = self.detector.process_image(frame) - - # Filter to keep only person detections using filter_detections - ( - filtered_bboxes, - filtered_track_ids, - filtered_class_ids, - filtered_confidences, - filtered_names, - ) = filter_detections( - bboxes, - track_ids, - class_ids, - confidences, - names, - class_filter=[0], # 0 is the class_id for person - name_filter=["person"], - ) - - # Create visualization - viz_frame = self.detector.visualize_results( - frame, - filtered_bboxes, - filtered_track_ids, - filtered_class_ids, - filtered_confidences, - filtered_names, - ) - - # Calculate distance and angle for each person - targets = [] - for i, bbox in enumerate(filtered_bboxes): - target_data = { - "target_id": filtered_track_ids[i] if i < len(filtered_track_ids) else -1, - "bbox": bbox, - "confidence": filtered_confidences[i] if i < len(filtered_confidences) else None, - } - - distance, angle = self.distance_estimator.estimate_distance_angle(bbox) - target_data["distance"] = distance - target_data["angle"] = angle - - # Add text to visualization - _x1, y1, 
x2, _y2 = map(int, bbox) - dist_text = f"{distance:.2f}m, {np.rad2deg(angle):.1f} deg" - - # Add black background for better visibility - text_size = cv2.getTextSize(dist_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0] - # Position at top-right corner - cv2.rectangle( - viz_frame, (x2 - text_size[0], y1 - text_size[1] - 5), (x2, y1), (0, 0, 0), -1 - ) - - # Draw text in white at top-right - cv2.putText( - viz_frame, - dist_text, - (x2 - text_size[0], y1 - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 255, 255), - 2, - ) - - targets.append(target_data) - - # Create the result dictionary - return {"frame": frame, "viz_frame": viz_frame, "targets": targets} - - @rpc - def enable_tracking(self) -> bool: - """Enable person tracking. - - Returns: - bool: True if tracking was enabled successfully - """ - self._tracking_enabled = True - logger.info("Person tracking enabled") - return True - - @rpc - def disable_tracking(self) -> bool: - """Disable person tracking. - - Returns: - bool: True if tracking was disabled successfully - """ - self._tracking_enabled = False - logger.info("Person tracking disabled") - return True - - @rpc - def is_tracking_enabled(self) -> bool: - """Check if tracking is currently enabled. - - Returns: - bool: True if tracking is enabled - """ - return self._tracking_enabled - - @rpc - def get_tracking_data(self) -> dict: # type: ignore[type-arg] - """Get the latest tracking data. - - Returns: - Dictionary containing tracking results - """ - if self._latest_frame is not None: - return self._process_tracking(self._latest_frame) # type: ignore[no-any-return, no-untyped-call] - return {"frame": None, "viz_frame": None, "targets": []} - - def create_stream(self, video_stream: Observable) -> Observable: # type: ignore[type-arg] - """ - Create an Observable stream of person tracking results from a video stream. 
- - Args: - video_stream: Observable that emits video frames - - Returns: - Observable that emits dictionaries containing tracking results and visualizations - """ - - return video_stream.pipe(ops.map(self._process_tracking)) diff --git a/dimos/perception/pointcloud/__init__.py b/dimos/perception/pointcloud/__init__.py deleted file mode 100644 index a380e2aadf..0000000000 --- a/dimos/perception/pointcloud/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .cuboid_fit import * -from .pointcloud_filtering import * -from .utils import * diff --git a/dimos/perception/pointcloud/cuboid_fit.py b/dimos/perception/pointcloud/cuboid_fit.py deleted file mode 100644 index dfec2d9297..0000000000 --- a/dimos/perception/pointcloud/cuboid_fit.py +++ /dev/null @@ -1,420 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - - -def fit_cuboid( - points: np.ndarray | o3d.geometry.PointCloud, # type: ignore[type-arg] - method: str = "minimal", -) -> dict | None: # type: ignore[type-arg] - """ - Fit a cuboid to a point cloud using Open3D's built-in methods. 
- - Args: - points: Nx3 array of points or Open3D PointCloud - method: Fitting method: - - 'minimal': Minimal oriented bounding box (best fit) - - 'oriented': PCA-based oriented bounding box - - 'axis_aligned': Axis-aligned bounding box - - Returns: - Dictionary containing: - - center: 3D center point - - dimensions: 3D dimensions (extent) - - rotation: 3x3 rotation matrix - - error: Fitting error - - bounding_box: Open3D OrientedBoundingBox object - Returns None if insufficient points or fitting fails. - - Raises: - ValueError: If method is invalid or inputs are malformed - """ - # Validate method - valid_methods = ["minimal", "oriented", "axis_aligned"] - if method not in valid_methods: - raise ValueError(f"method must be one of {valid_methods}, got '{method}'") - - # Convert to point cloud if needed - if isinstance(points, np.ndarray): - points = np.asarray(points) - if len(points.shape) != 2 or points.shape[1] != 3: - raise ValueError(f"points array must be Nx3, got shape {points.shape}") - if len(points) < 4: - return None - - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points) - elif isinstance(points, o3d.geometry.PointCloud): - pcd = points - points = np.asarray(pcd.points) - if len(points) < 4: - return None - else: - raise ValueError(f"points must be numpy array or Open3D PointCloud, got {type(points)}") - - try: - # Get bounding box based on method - if method == "minimal": - obb = pcd.get_minimal_oriented_bounding_box(robust=True) - elif method == "oriented": - obb = pcd.get_oriented_bounding_box(robust=True) - elif method == "axis_aligned": - # Convert axis-aligned to oriented format for consistency - aabb = pcd.get_axis_aligned_bounding_box() - obb = o3d.geometry.OrientedBoundingBox() - obb.center = aabb.get_center() - obb.extent = aabb.get_extent() - obb.R = np.eye(3) # Identity rotation for axis-aligned - - # Extract parameters - center = np.asarray(obb.center) - dimensions = np.asarray(obb.extent) - rotation = 
np.asarray(obb.R) - - # Calculate fitting error - error = _compute_fitting_error(points, center, dimensions, rotation) - - return { - "center": center, - "dimensions": dimensions, - "rotation": rotation, - "error": error, - "bounding_box": obb, - "method": method, - } - - except Exception as e: - # Log error but don't crash - return None for graceful handling - print(f"Warning: Cuboid fitting failed with method '{method}': {e}") - return None - - -def fit_cuboid_simple(points: np.ndarray | o3d.geometry.PointCloud) -> dict | None: # type: ignore[type-arg] - """ - Simple wrapper for minimal oriented bounding box fitting. - - Args: - points: Nx3 array of points or Open3D PointCloud - - Returns: - Dictionary with center, dimensions, rotation, and bounding_box, - or None if insufficient points - """ - return fit_cuboid(points, method="minimal") - - -def _compute_fitting_error( - points: np.ndarray, # type: ignore[type-arg] - center: np.ndarray, # type: ignore[type-arg] - dimensions: np.ndarray, # type: ignore[type-arg] - rotation: np.ndarray, # type: ignore[type-arg] -) -> float: - """ - Compute fitting error as mean squared distance from points to cuboid surface. 
- - Args: - points: Nx3 array of points - center: 3D center point - dimensions: 3D dimensions - rotation: 3x3 rotation matrix - - Returns: - Mean squared error - """ - if len(points) == 0: - return 0.0 - - # Transform points to local coordinates - local_points = (points - center) @ rotation - half_dims = dimensions / 2 - - # Calculate distance to cuboid surface - dx = np.abs(local_points[:, 0]) - half_dims[0] - dy = np.abs(local_points[:, 1]) - half_dims[1] - dz = np.abs(local_points[:, 2]) - half_dims[2] - - # Points outside: distance to nearest face - # Points inside: negative distance to nearest face - outside_dist = np.sqrt(np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2) - inside_dist = np.minimum(np.minimum(dx, dy), dz) - distances = np.where((dx > 0) | (dy > 0) | (dz > 0), outside_dist, -inside_dist) - - return float(np.mean(distances**2)) - - -def get_cuboid_corners( - center: np.ndarray, # type: ignore[type-arg] - dimensions: np.ndarray, # type: ignore[type-arg] - rotation: np.ndarray, # type: ignore[type-arg] -) -> np.ndarray: # type: ignore[type-arg] - """ - Get the 8 corners of a cuboid. 
- - Args: - center: 3D center point - dimensions: 3D dimensions - rotation: 3x3 rotation matrix - - Returns: - 8x3 array of corner coordinates - """ - half_dims = dimensions / 2 - corners_local = ( - np.array( - [ - [-1, -1, -1], # 0: left bottom back - [-1, -1, 1], # 1: left bottom front - [-1, 1, -1], # 2: left top back - [-1, 1, 1], # 3: left top front - [1, -1, -1], # 4: right bottom back - [1, -1, 1], # 5: right bottom front - [1, 1, -1], # 6: right top back - [1, 1, 1], # 7: right top front - ] - ) - * half_dims - ) - - # Apply rotation and translation - return corners_local @ rotation.T + center # type: ignore[no-any-return] - - -def visualize_cuboid_on_image( - image: np.ndarray, # type: ignore[type-arg] - cuboid_params: dict, # type: ignore[type-arg] - camera_matrix: np.ndarray, # type: ignore[type-arg] - extrinsic_rotation: np.ndarray | None = None, # type: ignore[type-arg] - extrinsic_translation: np.ndarray | None = None, # type: ignore[type-arg] - color: tuple[int, int, int] = (0, 255, 0), - thickness: int = 2, - show_dimensions: bool = True, -) -> np.ndarray: # type: ignore[type-arg] - """ - Draw a fitted cuboid on an image using camera projection. 
- - Args: - image: Input image to draw on - cuboid_params: Dictionary containing cuboid parameters - camera_matrix: Camera intrinsic matrix (3x3) - extrinsic_rotation: Optional external rotation (3x3) - extrinsic_translation: Optional external translation (3x1) - color: Line color as (B, G, R) tuple - thickness: Line thickness - show_dimensions: Whether to display dimension text - - Returns: - Image with cuboid visualization - - Raises: - ValueError: If required parameters are missing or invalid - """ - # Validate inputs - required_keys = ["center", "dimensions", "rotation"] - if not all(key in cuboid_params for key in required_keys): - raise ValueError(f"cuboid_params must contain keys: {required_keys}") - - if camera_matrix.shape != (3, 3): - raise ValueError(f"camera_matrix must be 3x3, got {camera_matrix.shape}") - - # Get corners in world coordinates - corners = get_cuboid_corners( - cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] - ) - - # Transform corners if extrinsic parameters are provided - if extrinsic_rotation is not None and extrinsic_translation is not None: - if extrinsic_rotation.shape != (3, 3): - raise ValueError(f"extrinsic_rotation must be 3x3, got {extrinsic_rotation.shape}") - if extrinsic_translation.shape not in [(3,), (3, 1)]: - raise ValueError( - f"extrinsic_translation must be (3,) or (3,1), got {extrinsic_translation.shape}" - ) - - extrinsic_translation = extrinsic_translation.flatten() - corners = (extrinsic_rotation @ corners.T).T + extrinsic_translation - - try: - # Project 3D corners to image coordinates - corners_img, _ = cv2.projectPoints( # type: ignore[call-overload] - corners.astype(np.float32), - np.zeros(3), - np.zeros(3), # No additional rotation/translation - camera_matrix.astype(np.float32), - None, # No distortion - ) - corners_img = corners_img.reshape(-1, 2).astype(int) - - # Check if corners are within image bounds - h, w = image.shape[:2] - valid_corners = ( - (corners_img[:, 0] >= 0) 
- & (corners_img[:, 0] < w) - & (corners_img[:, 1] >= 0) - & (corners_img[:, 1] < h) - ) - - if not np.any(valid_corners): - print("Warning: All cuboid corners are outside image bounds") - return image.copy() - - except Exception as e: - print(f"Warning: Failed to project cuboid corners: {e}") - return image.copy() - - # Define edges for wireframe visualization - edges = [ - # Bottom face - (0, 1), - (1, 5), - (5, 4), - (4, 0), - # Top face - (2, 3), - (3, 7), - (7, 6), - (6, 2), - # Vertical edges - (0, 2), - (1, 3), - (5, 7), - (4, 6), - ] - - # Draw edges - vis_img = image.copy() - for i, j in edges: - # Only draw edge if both corners are valid - if valid_corners[i] and valid_corners[j]: - cv2.line(vis_img, tuple(corners_img[i]), tuple(corners_img[j]), color, thickness) - - # Add dimension text if requested - if show_dimensions and np.any(valid_corners): - dims = cuboid_params["dimensions"] - dim_text = f"Dims: {dims[0]:.3f} x {dims[1]:.3f} x {dims[2]:.3f}" - - # Find a good position for text (top-left of image) - text_pos = (10, 30) - font_scale = 0.7 - - # Add background rectangle for better readability - text_size = cv2.getTextSize(dim_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 2)[0] - cv2.rectangle( - vis_img, - (text_pos[0] - 5, text_pos[1] - text_size[1] - 5), - (text_pos[0] + text_size[0] + 5, text_pos[1] + 5), - (0, 0, 0), - -1, - ) - - cv2.putText(vis_img, dim_text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, 2) - - return vis_img - - -def compute_cuboid_volume(cuboid_params: dict) -> float: # type: ignore[type-arg] - """ - Compute the volume of a cuboid. 
- - Args: - cuboid_params: Dictionary containing cuboid parameters - - Returns: - Volume in cubic units - """ - if "dimensions" not in cuboid_params: - raise ValueError("cuboid_params must contain 'dimensions' key") - - dims = cuboid_params["dimensions"] - return float(np.prod(dims)) - - -def compute_cuboid_surface_area(cuboid_params: dict) -> float: # type: ignore[type-arg] - """ - Compute the surface area of a cuboid. - - Args: - cuboid_params: Dictionary containing cuboid parameters - - Returns: - Surface area in square units - """ - if "dimensions" not in cuboid_params: - raise ValueError("cuboid_params must contain 'dimensions' key") - - dims = cuboid_params["dimensions"] - return 2.0 * (dims[0] * dims[1] + dims[1] * dims[2] + dims[2] * dims[0]) # type: ignore[no-any-return] - - -def check_cuboid_quality(cuboid_params: dict, points: np.ndarray) -> dict: # type: ignore[type-arg] - """ - Assess the quality of a cuboid fit. - - Args: - cuboid_params: Dictionary containing cuboid parameters - points: Original points used for fitting - - Returns: - Dictionary with quality metrics - """ - if len(points) == 0: - return {"error": "No points provided"} - - # Basic metrics - volume = compute_cuboid_volume(cuboid_params) - surface_area = compute_cuboid_surface_area(cuboid_params) - error = cuboid_params.get("error", 0.0) - - # Aspect ratio analysis - dims = cuboid_params["dimensions"] - aspect_ratios = [ - dims[0] / dims[1] if dims[1] > 0 else float("inf"), - dims[1] / dims[2] if dims[2] > 0 else float("inf"), - dims[2] / dims[0] if dims[0] > 0 else float("inf"), - ] - max_aspect_ratio = max(aspect_ratios) - - # Volume ratio (cuboid volume vs convex hull volume) - try: - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points) - hull, _ = pcd.compute_convex_hull() - hull_volume = hull.get_volume() - volume_ratio = volume / hull_volume if hull_volume > 0 else float("inf") - except: - volume_ratio = None - - return { - "fitting_error": error, - 
"volume": volume, - "surface_area": surface_area, - "max_aspect_ratio": max_aspect_ratio, - "volume_ratio": volume_ratio, - "num_points": len(points), - "method": cuboid_params.get("method", "unknown"), - } - - -# Backward compatibility -def visualize_fit(image, cuboid_params, camera_matrix, R=None, t=None): # type: ignore[no-untyped-def] - """ - Legacy function for backward compatibility. - Use visualize_cuboid_on_image instead. - """ - return visualize_cuboid_on_image( - image, cuboid_params, camera_matrix, R, t, show_dimensions=True - ) diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py deleted file mode 100644 index d6aa2b835f..0000000000 --- a/dimos/perception/pointcloud/pointcloud_filtering.py +++ /dev/null @@ -1,370 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] -import torch - -from dimos.perception.pointcloud.cuboid_fit import fit_cuboid -from dimos.perception.pointcloud.utils import ( - create_point_cloud_and_extract_masks, - load_camera_matrix_from_yaml, -) -from dimos.types.manipulation import ObjectData -from dimos.types.vector import Vector - - -class PointcloudFiltering: - """ - A production-ready point cloud filtering pipeline for segmented objects. 
- - This class takes segmentation results and produces clean, filtered point clouds - for each object with consistent coloring and optional outlier removal. - """ - - def __init__( - self, - color_intrinsics: str | list[float] | np.ndarray | None = None, # type: ignore[type-arg] - depth_intrinsics: str | list[float] | np.ndarray | None = None, # type: ignore[type-arg] - color_weight: float = 0.3, - enable_statistical_filtering: bool = True, - statistical_neighbors: int = 20, - statistical_std_ratio: float = 1.5, - enable_radius_filtering: bool = True, - radius_filtering_radius: float = 0.015, - radius_filtering_min_neighbors: int = 25, - enable_subsampling: bool = True, - voxel_size: float = 0.005, - max_num_objects: int = 10, - min_points_for_cuboid: int = 10, - cuboid_method: str = "oriented", - max_bbox_size_percent: float = 30.0, - ) -> None: - """ - Initialize the point cloud filtering pipeline. - - Args: - color_intrinsics: Camera intrinsics for color image - depth_intrinsics: Camera intrinsics for depth image - color_weight: Weight for blending generated color with original (0.0-1.0) - enable_statistical_filtering: Enable/disable statistical outlier filtering - statistical_neighbors: Number of neighbors for statistical filtering - statistical_std_ratio: Standard deviation ratio for statistical filtering - enable_radius_filtering: Enable/disable radius outlier filtering - radius_filtering_radius: Search radius for radius filtering (meters) - radius_filtering_min_neighbors: Min neighbors within radius - enable_subsampling: Enable/disable point cloud subsampling - voxel_size: Voxel size for downsampling (meters, when subsampling enabled) - max_num_objects: Maximum number of objects to process (top N by confidence) - min_points_for_cuboid: Minimum points required for cuboid fitting - cuboid_method: Method for cuboid fitting ('minimal', 'oriented', 'axis_aligned') - max_bbox_size_percent: Maximum percentage of image size for object bboxes (0-100) - - Raises: - 
ValueError: If invalid parameters are provided - """ - # Validate parameters - if not 0.0 <= color_weight <= 1.0: - raise ValueError(f"color_weight must be between 0.0 and 1.0, got {color_weight}") - if not 0.0 <= max_bbox_size_percent <= 100.0: - raise ValueError( - f"max_bbox_size_percent must be between 0.0 and 100.0, got {max_bbox_size_percent}" - ) - - # Store settings - self.color_weight = color_weight - self.enable_statistical_filtering = enable_statistical_filtering - self.statistical_neighbors = statistical_neighbors - self.statistical_std_ratio = statistical_std_ratio - self.enable_radius_filtering = enable_radius_filtering - self.radius_filtering_radius = radius_filtering_radius - self.radius_filtering_min_neighbors = radius_filtering_min_neighbors - self.enable_subsampling = enable_subsampling - self.voxel_size = voxel_size - self.max_num_objects = max_num_objects - self.min_points_for_cuboid = min_points_for_cuboid - self.cuboid_method = cuboid_method - self.max_bbox_size_percent = max_bbox_size_percent - - # Load camera matrices - self.color_camera_matrix = load_camera_matrix_from_yaml(color_intrinsics) - self.depth_camera_matrix = load_camera_matrix_from_yaml(depth_intrinsics) - - # Store the full point cloud - self.full_pcd = None - - def generate_color_from_id(self, object_id: int) -> np.ndarray: # type: ignore[type-arg] - """Generate a consistent color for a given object ID.""" - np.random.seed(object_id) - color = np.random.randint(0, 255, 3, dtype=np.uint8) - np.random.seed(None) - return color - - def _validate_inputs( # type: ignore[no-untyped-def] - self, - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - objects: list[ObjectData], - ): - """Validate input parameters.""" - if color_img.shape[:2] != depth_img.shape: - raise ValueError("Color and depth image dimensions don't match") - - def _prepare_masks(self, masks: list[np.ndarray], target_shape: tuple) -> list[np.ndarray]: # type: 
ignore[type-arg] - """Prepare and validate masks to match target shape.""" - processed_masks = [] - for mask in masks: - # Convert mask to numpy if it's a tensor - if hasattr(mask, "cpu"): - mask = mask.cpu().numpy() - - mask = mask.astype(bool) - - # Handle shape mismatches - if mask.shape != target_shape: - if len(mask.shape) > 2: - mask = mask[:, :, 0] - - if mask.shape != target_shape: - mask = cv2.resize( - mask.astype(np.uint8), - (target_shape[1], target_shape[0]), - interpolation=cv2.INTER_NEAREST, - ).astype(bool) - - processed_masks.append(mask) - - return processed_masks - - def _apply_color_mask( - self, - pcd: o3d.geometry.PointCloud, - rgb_color: np.ndarray, # type: ignore[type-arg] - ) -> o3d.geometry.PointCloud: - """Apply weighted color mask to point cloud.""" - if len(np.asarray(pcd.colors)) > 0: - original_colors = np.asarray(pcd.colors) - generated_color = rgb_color.astype(np.float32) / 255.0 - colored_mask = ( - 1.0 - self.color_weight - ) * original_colors + self.color_weight * generated_color - colored_mask = np.clip(colored_mask, 0.0, 1.0) - pcd.colors = o3d.utility.Vector3dVector(colored_mask) - return pcd - - def _apply_filtering(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: - """Apply optional filtering to point cloud based on enabled flags.""" - current_pcd = pcd - - # Apply statistical filtering if enabled - if self.enable_statistical_filtering: - current_pcd, _ = current_pcd.remove_statistical_outlier( - nb_neighbors=self.statistical_neighbors, std_ratio=self.statistical_std_ratio - ) - - # Apply radius filtering if enabled - if self.enable_radius_filtering: - current_pcd, _ = current_pcd.remove_radius_outlier( - nb_points=self.radius_filtering_min_neighbors, radius=self.radius_filtering_radius - ) - - return current_pcd - - def _apply_subsampling(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: - """Apply subsampling to limit point cloud size using Open3D's voxel downsampling.""" - if 
self.enable_subsampling: - return pcd.voxel_down_sample(self.voxel_size) - return pcd - - def _extract_masks_from_objects(self, objects: list[ObjectData]) -> list[np.ndarray]: # type: ignore[type-arg] - """Extract segmentation masks from ObjectData objects.""" - return [obj["segmentation_mask"] for obj in objects] - - def get_full_point_cloud(self) -> o3d.geometry.PointCloud: - """Get the full point cloud.""" - return self._apply_subsampling(self.full_pcd) - - def process_images( - self, - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - objects: list[ObjectData], - ) -> list[ObjectData]: - """ - Process color and depth images with object detection results to create filtered point clouds. - - Args: - color_img: RGB image as numpy array (H, W, 3) - depth_img: Depth image as numpy array (H, W) in meters - objects: List of ObjectData from object detection stream - - Returns: - List of updated ObjectData with pointcloud and 3D information. Each ObjectData - dictionary is enhanced with the following new fields: - - **3D Spatial Information** (added when sufficient points for cuboid fitting): - - "position": Vector(x, y, z) - 3D center position in world coordinates (meters) - - "rotation": Vector(roll, pitch, yaw) - 3D orientation as Euler angles (radians) - - "size": {"width": float, "height": float, "depth": float} - 3D bounding box dimensions (meters) - - **Point Cloud Data**: - - "point_cloud": o3d.geometry.PointCloud - Filtered Open3D point cloud with colors - - "color": np.ndarray - Consistent RGB color [R,G,B] (0-255) generated from object_id - - **Grasp Generation Arrays** (Dimensional grasp format): - - "point_cloud_numpy": np.ndarray - Nx3 XYZ coordinates as float32 (meters) - - "colors_numpy": np.ndarray - Nx3 RGB colors as float32 (0.0-1.0 range) - - Raises: - ValueError: If inputs are invalid - RuntimeError: If processing fails - """ - # Validate inputs - self._validate_inputs(color_img, depth_img, objects) 
- - if not objects: - return [] - - # Filter to top N objects by confidence - if len(objects) > self.max_num_objects: - # Sort objects by confidence (highest first), handle None confidences - sorted_objects = sorted( - objects, - key=lambda obj: obj.get("confidence", 0.0) - if obj.get("confidence") is not None - else 0.0, - reverse=True, - ) - objects = sorted_objects[: self.max_num_objects] - - # Filter out objects with bboxes too large - image_area = color_img.shape[0] * color_img.shape[1] - max_bbox_area = image_area * (self.max_bbox_size_percent / 100.0) - - filtered_objects = [] - for obj in objects: - if "bbox" in obj and obj["bbox"] is not None: - bbox = obj["bbox"] - # Calculate bbox area (assuming bbox format [x1, y1, x2, y2]) - bbox_area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - if bbox_area <= max_bbox_area: - filtered_objects.append(obj) - else: - filtered_objects.append(obj) - - objects = filtered_objects - - # Extract masks from ObjectData - masks = self._extract_masks_from_objects(objects) - - # Prepare masks - processed_masks = self._prepare_masks(masks, depth_img.shape) - - # Create point clouds efficiently - self.full_pcd, masked_pcds = create_point_cloud_and_extract_masks( - color_img, - depth_img, - processed_masks, - self.depth_camera_matrix, # type: ignore[arg-type] - depth_scale=1.0, - ) - - # Process each object and update ObjectData - updated_objects = [] - - for i, (obj, _mask, pcd) in enumerate( - zip(objects, processed_masks, masked_pcds, strict=False) - ): - # Skip empty point clouds - if len(np.asarray(pcd.points)) == 0: - continue - - # Create a copy of the object data to avoid modifying the original - updated_obj = obj.copy() - - # Generate consistent color - object_id = obj.get("object_id", i) - rgb_color = self.generate_color_from_id(object_id) - - # Apply color mask - pcd = self._apply_color_mask(pcd, rgb_color) - - # Apply subsampling to control point cloud size - pcd = self._apply_subsampling(pcd) - - # Apply filtering 
(optional based on flags) - pcd_filtered = self._apply_filtering(pcd) - - # Fit cuboid and extract 3D information - points = np.asarray(pcd_filtered.points) - if len(points) >= self.min_points_for_cuboid: - cuboid_params = fit_cuboid(points, method=self.cuboid_method) - if cuboid_params is not None: - # Update position, rotation, and size from cuboid - center = cuboid_params["center"] - dimensions = cuboid_params["dimensions"] - rotation_matrix = cuboid_params["rotation"] - - # Convert rotation matrix to euler angles (roll, pitch, yaw) - sy = np.sqrt( - rotation_matrix[0, 0] * rotation_matrix[0, 0] - + rotation_matrix[1, 0] * rotation_matrix[1, 0] - ) - singular = sy < 1e-6 - - if not singular: - roll = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) - pitch = np.arctan2(-rotation_matrix[2, 0], sy) - yaw = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) - else: - roll = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) - pitch = np.arctan2(-rotation_matrix[2, 0], sy) - yaw = 0 - - # Update position, rotation, and size from cuboid - updated_obj["position"] = Vector(center[0], center[1], center[2]) - updated_obj["rotation"] = Vector(roll, pitch, yaw) - updated_obj["size"] = { - "width": float(dimensions[0]), - "height": float(dimensions[1]), - "depth": float(dimensions[2]), - } - - # Add point cloud data to ObjectData - updated_obj["point_cloud"] = pcd_filtered - updated_obj["color"] = rgb_color - - # Extract numpy arrays for grasp generation - points_array = np.asarray(pcd_filtered.points).astype(np.float32) # Nx3 XYZ coordinates - if pcd_filtered.has_colors(): - colors_array = np.asarray(pcd_filtered.colors).astype( - np.float32 - ) # Nx3 RGB (0-1 range) - else: - # If no colors, create array of zeros - colors_array = np.zeros((len(points_array), 3), dtype=np.float32) - - updated_obj["point_cloud_numpy"] = points_array - updated_obj["colors_numpy"] = colors_array # type: ignore[typeddict-unknown-key] - - updated_objects.append(updated_obj) 
- - return updated_objects - - def cleanup(self) -> None: - """Clean up resources.""" - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/dimos/perception/pointcloud/test_pointcloud_filtering.py b/dimos/perception/pointcloud/test_pointcloud_filtering.py deleted file mode 100644 index 4ac7e5cb2d..0000000000 --- a/dimos/perception/pointcloud/test_pointcloud_filtering.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import TYPE_CHECKING - -import cv2 -import numpy as np -import open3d as o3d -import pytest - -from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering -from dimos.perception.pointcloud.utils import load_camera_matrix_from_yaml - -if TYPE_CHECKING: - from dimos.types.manipulation import ObjectData - - -class TestPointcloudFiltering: - def test_pointcloud_filtering_initialization(self) -> None: - """Test PointcloudFiltering initializes correctly with default parameters.""" - try: - filtering = PointcloudFiltering() - assert filtering is not None - assert filtering.color_weight == 0.3 - assert filtering.enable_statistical_filtering - assert filtering.enable_radius_filtering - assert filtering.enable_subsampling - except Exception as e: - pytest.skip(f"Skipping test due to initialization error: {e}") - - def test_pointcloud_filtering_with_custom_params(self) -> None: - """Test PointcloudFiltering with custom parameters.""" - try: - filtering = PointcloudFiltering( - color_weight=0.5, - enable_statistical_filtering=False, - enable_radius_filtering=False, - voxel_size=0.01, - max_num_objects=5, - ) - assert filtering.color_weight == 0.5 - assert not filtering.enable_statistical_filtering - assert not filtering.enable_radius_filtering - assert filtering.voxel_size == 0.01 - assert filtering.max_num_objects == 5 - except Exception as e: - pytest.skip(f"Skipping test due to initialization error: {e}") - - def test_pointcloud_filtering_process_images(self) -> None: - """Test PointcloudFiltering can process RGB-D images and return filtered point clouds.""" - try: - # Import data inside method to avoid pytest fixture confusion - from dimos.utils.data import get_data - - # Load test RGB-D data - data_dir = get_data("rgbd_frames") - - # Load first frame - color_path = os.path.join(data_dir, "color", "00000.png") - depth_path = os.path.join(data_dir, "depth", "00000.png") - intrinsics_path = os.path.join(data_dir, 
"color_camera_info.yaml") - - assert os.path.exists(color_path), f"Color image not found: {color_path}" - assert os.path.exists(depth_path), f"Depth image not found: {depth_path}" - assert os.path.exists(intrinsics_path), f"Intrinsics file not found: {intrinsics_path}" - - # Load images - color_img = cv2.imread(color_path) - color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) - - depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) - if depth_img.dtype == np.uint16: - depth_img = depth_img.astype(np.float32) / 1000.0 - - # Load camera intrinsics - camera_matrix = load_camera_matrix_from_yaml(intrinsics_path) - if camera_matrix is None: - pytest.skip("Failed to load camera intrinsics") - - # Create mock objects with segmentation masks - height, width = color_img.shape[:2] - - # Create simple rectangular masks for testing - mock_objects = [] - - # Object 1: Top-left quadrant - mask1 = np.zeros((height, width), dtype=bool) - mask1[height // 4 : height // 2, width // 4 : width // 2] = True - - obj1: ObjectData = { - "object_id": 1, - "confidence": 0.9, - "bbox": [width // 4, height // 4, width // 2, height // 2], - "segmentation_mask": mask1, - "name": "test_object_1", - } - mock_objects.append(obj1) - - # Object 2: Bottom-right quadrant - mask2 = np.zeros((height, width), dtype=bool) - mask2[height // 2 : 3 * height // 4, width // 2 : 3 * width // 4] = True - - obj2: ObjectData = { - "object_id": 2, - "confidence": 0.8, - "bbox": [width // 2, height // 2, 3 * width // 4, 3 * height // 4], - "segmentation_mask": mask2, - "name": "test_object_2", - } - mock_objects.append(obj2) - - # Initialize filtering with intrinsics - filtering = PointcloudFiltering( - color_intrinsics=camera_matrix, - depth_intrinsics=camera_matrix, - enable_statistical_filtering=False, # Disable for faster testing - enable_radius_filtering=False, # Disable for faster testing - voxel_size=0.01, # Larger voxel for faster processing - ) - - # Process images - results = 
filtering.process_images(color_img, depth_img, mock_objects) - - print( - f"Processing results - Input objects: {len(mock_objects)}, Output objects: {len(results)}" - ) - - # Verify results - assert isinstance(results, list), "Results should be a list" - assert len(results) <= len(mock_objects), "Should not return more objects than input" - - # Check each result object - for i, result in enumerate(results): - print(f"Object {i}: {result.get('name', 'unknown')}") - - # Verify required fields exist - assert "point_cloud" in result, "Result should contain point_cloud" - assert "color" in result, "Result should contain color" - assert "point_cloud_numpy" in result, "Result should contain point_cloud_numpy" - - # Verify point cloud is valid Open3D object - pcd = result["point_cloud"] - assert isinstance(pcd, o3d.geometry.PointCloud), ( - "point_cloud should be Open3D PointCloud" - ) - - # Verify numpy arrays - points_array = result["point_cloud_numpy"] - assert isinstance(points_array, np.ndarray), ( - "point_cloud_numpy should be numpy array" - ) - assert points_array.shape[1] == 3, "Point array should have 3 columns (x,y,z)" - assert points_array.dtype == np.float32, "Point array should be float32" - - # Verify color - color = result["color"] - assert isinstance(color, np.ndarray), "Color should be numpy array" - assert color.shape == (3,), "Color should be RGB triplet" - assert color.dtype == np.uint8, "Color should be uint8" - - # Check if 3D information was added (when enough points for cuboid fitting) - points = np.asarray(pcd.points) - if len(points) >= filtering.min_points_for_cuboid: - if "position" in result: - assert "rotation" in result, "Should have rotation if position exists" - assert "size" in result, "Should have size if position exists" - - # Verify position format - from dimos.types.vector import Vector - - position = result["position"] - assert isinstance(position, Vector), "Position should be Vector" - - # Verify size format - size = result["size"] 
- assert isinstance(size, dict), "Size should be dict" - assert "width" in size and "height" in size and "depth" in size - - print(f" - Points: {len(points)}") - print(f" - Color: {color}") - if "position" in result: - print(f" - Position: {result['position']}") - print(f" - Size: {result['size']}") - - # Test full point cloud access - full_pcd = filtering.get_full_point_cloud() - if full_pcd is not None: - assert isinstance(full_pcd, o3d.geometry.PointCloud), ( - "Full point cloud should be Open3D PointCloud" - ) - full_points = np.asarray(full_pcd.points) - print(f"Full point cloud points: {len(full_points)}") - - print("All pointcloud filtering tests passed!") - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - def test_pointcloud_filtering_empty_objects(self) -> None: - """Test PointcloudFiltering with empty object list.""" - try: - from dimos.utils.data import get_data - - # Load test data - data_dir = get_data("rgbd_frames") - color_path = os.path.join(data_dir, "color", "00000.png") - depth_path = os.path.join(data_dir, "depth", "00000.png") - - if not (os.path.exists(color_path) and os.path.exists(depth_path)): - pytest.skip("Test images not found") - - color_img = cv2.imread(color_path) - color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) - depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) - if depth_img.dtype == np.uint16: - depth_img = depth_img.astype(np.float32) / 1000.0 - - filtering = PointcloudFiltering() - - # Test with empty object list - results = filtering.process_images(color_img, depth_img, []) - - assert isinstance(results, list), "Results should be a list" - assert len(results) == 0, "Should return empty list for empty input" - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - def test_color_generation_consistency(self) -> None: - """Test that color generation is consistent for the same object ID.""" - try: - filtering = PointcloudFiltering() - - # Test color generation 
consistency - color1 = filtering.generate_color_from_id(42) - color2 = filtering.generate_color_from_id(42) - color3 = filtering.generate_color_from_id(43) - - assert np.array_equal(color1, color2), "Same ID should generate same color" - assert not np.array_equal(color1, color3), ( - "Different IDs should generate different colors" - ) - assert color1.shape == (3,), "Color should be RGB triplet" - assert color1.dtype == np.uint8, "Color should be uint8" - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - -if __name__ == "__main__": - pytest.main(["-v", __file__]) diff --git a/dimos/perception/pointcloud/utils.py b/dimos/perception/pointcloud/utils.py deleted file mode 100644 index b2bb561000..0000000000 --- a/dimos/perception/pointcloud/utils.py +++ /dev/null @@ -1,1113 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Point cloud utilities for RGBD data processing. - -This module provides efficient utilities for creating and manipulating point clouds -from RGBD images using Open3D. 
-""" - -import os -from typing import Any - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] -from scipy.spatial import cKDTree # type: ignore[import-untyped] -import yaml # type: ignore[import-untyped] - -from dimos.perception.common.utils import project_3d_points_to_2d - - -def load_camera_matrix_from_yaml( - camera_info: str | list[float] | np.ndarray | dict | None, # type: ignore[type-arg] -) -> np.ndarray | None: # type: ignore[type-arg] - """ - Load camera intrinsic matrix from various input formats. - - Args: - camera_info: Can be: - - Path to YAML file containing camera parameters - - List of [fx, fy, cx, cy] - - 3x3 numpy array (returned as-is) - - Dict with camera parameters - - None (returns None) - - Returns: - 3x3 camera intrinsic matrix or None if input is None - - Raises: - ValueError: If camera_info format is invalid or file cannot be read - FileNotFoundError: If YAML file path doesn't exist - """ - if camera_info is None: - return None - - # Handle case where camera_info is already a matrix - if isinstance(camera_info, np.ndarray) and camera_info.shape == (3, 3): - return camera_info.astype(np.float32) - - # Handle case where camera_info is [fx, fy, cx, cy] format - if isinstance(camera_info, list) and len(camera_info) == 4: - fx, fy, cx, cy = camera_info - return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) - - # Handle case where camera_info is a dict - if isinstance(camera_info, dict): - return _extract_matrix_from_dict(camera_info) - - # Handle case where camera_info is a path to a YAML file - if isinstance(camera_info, str): - if not os.path.isfile(camera_info): - raise FileNotFoundError(f"Camera info file not found: {camera_info}") - - try: - with open(camera_info) as f: - data = yaml.safe_load(f) - return _extract_matrix_from_dict(data) - except Exception as e: - raise ValueError(f"Failed to read camera info from {camera_info}: {e}") - - raise ValueError( - f"Invalid camera_info 
format. Expected str, list, dict, or numpy array, got {type(camera_info)}" - ) - - -def _extract_matrix_from_dict(data: dict) -> np.ndarray: # type: ignore[type-arg] - """Extract camera matrix from dictionary with various formats.""" - # ROS format with 'K' field (most common) - if "K" in data: - k_data = data["K"] - if len(k_data) == 9: - return np.array(k_data, dtype=np.float32).reshape(3, 3) - - # Standard format with 'camera_matrix' - if "camera_matrix" in data: - if "data" in data["camera_matrix"]: - matrix_data = data["camera_matrix"]["data"] - if len(matrix_data) == 9: - return np.array(matrix_data, dtype=np.float32).reshape(3, 3) - - # Explicit intrinsics format - if all(k in data for k in ["fx", "fy", "cx", "cy"]): - fx, fy = float(data["fx"]), float(data["fy"]) - cx, cy = float(data["cx"]), float(data["cy"]) - return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) - - # Error case - provide helpful debug info - available_keys = list(data.keys()) - if "K" in data: - k_info = f"K field length: {len(data['K']) if hasattr(data['K'], '__len__') else 'unknown'}" - else: - k_info = "K field not found" - - raise ValueError( - f"Cannot extract camera matrix from data. " - f"Available keys: {available_keys}. {k_info}. " - f"Expected formats: 'K' (9 elements), 'camera_matrix.data' (9 elements), " - f"or individual 'fx', 'fy', 'cx', 'cy' fields." - ) - - -def create_o3d_point_cloud_from_rgbd( - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - intrinsic: np.ndarray, # type: ignore[type-arg] - depth_scale: float = 1.0, - depth_trunc: float = 3.0, -) -> o3d.geometry.PointCloud: - """ - Create an Open3D point cloud from RGB and depth images. 
- - Args: - color_img: RGB image as numpy array (H, W, 3) - depth_img: Depth image as numpy array (H, W) - intrinsic: Camera intrinsic matrix (3x3 numpy array) - depth_scale: Scale factor to convert depth to meters - depth_trunc: Maximum depth in meters - - Returns: - Open3D point cloud object - - Raises: - ValueError: If input dimensions are invalid - """ - # Validate inputs - if len(color_img.shape) != 3 or color_img.shape[2] != 3: - raise ValueError(f"color_img must be (H, W, 3), got {color_img.shape}") - if len(depth_img.shape) != 2: - raise ValueError(f"depth_img must be (H, W), got {depth_img.shape}") - if color_img.shape[:2] != depth_img.shape: - raise ValueError( - f"Color and depth image dimensions don't match: {color_img.shape[:2]} vs {depth_img.shape}" - ) - if intrinsic.shape != (3, 3): - raise ValueError(f"intrinsic must be (3, 3), got {intrinsic.shape}") - - # Convert to Open3D format - color_o3d = o3d.geometry.Image(color_img.astype(np.uint8)) - - # Filter out inf and nan values from depth image - depth_filtered = depth_img.copy() - - # Create mask for valid depth values (finite, positive, non-zero) - valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) - - # Set invalid values to 0 (which Open3D treats as no depth) - depth_filtered[~valid_mask] = 0.0 - - depth_o3d = o3d.geometry.Image(depth_filtered.astype(np.float32)) - - # Create Open3D intrinsic object - height, width = color_img.shape[:2] - fx, fy = intrinsic[0, 0], intrinsic[1, 1] - cx, cy = intrinsic[0, 2], intrinsic[1, 2] - intrinsic_o3d = o3d.camera.PinholeCameraIntrinsic( - width, - height, - fx, - fy, # fx, fy - cx, - cy, # cx, cy - ) - - # Create RGBD image - rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( - color_o3d, - depth_o3d, - depth_scale=depth_scale, - depth_trunc=depth_trunc, - convert_rgb_to_intensity=False, - ) - - # Create point cloud - pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic_o3d) - - return pcd - - -def 
create_point_cloud_and_extract_masks( - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - masks: list[np.ndarray], # type: ignore[type-arg] - intrinsic: np.ndarray, # type: ignore[type-arg] - depth_scale: float = 1.0, - depth_trunc: float = 3.0, -) -> tuple[o3d.geometry.PointCloud, list[o3d.geometry.PointCloud]]: - """ - Efficiently create a point cloud once and extract multiple masked regions. - - Args: - color_img: RGB image (H, W, 3) - depth_img: Depth image (H, W) - masks: List of boolean masks, each of shape (H, W) - intrinsic: Camera intrinsic matrix (3x3 numpy array) - depth_scale: Scale factor to convert depth to meters - depth_trunc: Maximum depth in meters - - Returns: - Tuple of (full_point_cloud, list_of_masked_point_clouds) - """ - if not masks: - return o3d.geometry.PointCloud(), [] - - # Create the full point cloud - full_pcd = create_o3d_point_cloud_from_rgbd( - color_img, depth_img, intrinsic, depth_scale, depth_trunc - ) - - if len(np.asarray(full_pcd.points)) == 0: - return full_pcd, [o3d.geometry.PointCloud() for _ in masks] - - # Create pixel-to-point mapping - valid_depth_mask = np.isfinite(depth_img) & (depth_img > 0) & (depth_img <= depth_trunc) - - valid_depth = valid_depth_mask.flatten() - if not np.any(valid_depth): - return full_pcd, [o3d.geometry.PointCloud() for _ in masks] - - pixel_to_point = np.full(len(valid_depth), -1, dtype=np.int32) - pixel_to_point[valid_depth] = np.arange(np.sum(valid_depth)) - - # Extract point clouds for each mask - masked_pcds = [] - max_points = len(np.asarray(full_pcd.points)) - - for mask in masks: - if mask.shape != depth_img.shape: - masked_pcds.append(o3d.geometry.PointCloud()) - continue - - mask_flat = mask.flatten() - valid_mask_indices = mask_flat & valid_depth - point_indices = pixel_to_point[valid_mask_indices] - valid_point_indices = point_indices[point_indices >= 0] - - if len(valid_point_indices) > 0: - valid_point_indices = 
np.clip(valid_point_indices, 0, max_points - 1) - valid_point_indices = np.unique(valid_point_indices) - masked_pcd = full_pcd.select_by_index(valid_point_indices.tolist()) - else: - masked_pcd = o3d.geometry.PointCloud() - - masked_pcds.append(masked_pcd) - - return full_pcd, masked_pcds - - -def filter_point_cloud_statistical( - pcd: o3d.geometry.PointCloud, nb_neighbors: int = 20, std_ratio: float = 2.0 -) -> tuple[o3d.geometry.PointCloud, np.ndarray]: # type: ignore[type-arg] - """ - Apply statistical outlier filtering to point cloud. - - Args: - pcd: Input point cloud - nb_neighbors: Number of neighbors to analyze for each point - std_ratio: Threshold level based on standard deviation - - Returns: - Tuple of (filtered_point_cloud, outlier_indices) - """ - if len(np.asarray(pcd.points)) == 0: - return pcd, np.array([]) - - return pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio) # type: ignore[no-any-return] - - -def filter_point_cloud_radius( - pcd: o3d.geometry.PointCloud, nb_points: int = 16, radius: float = 0.05 -) -> tuple[o3d.geometry.PointCloud, np.ndarray]: # type: ignore[type-arg] - """ - Apply radius-based outlier filtering to point cloud. - - Args: - pcd: Input point cloud - nb_points: Minimum number of points within radius - radius: Search radius in meters - - Returns: - Tuple of (filtered_point_cloud, outlier_indices) - """ - if len(np.asarray(pcd.points)) == 0: - return pcd, np.array([]) - - return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius) # type: ignore[no-any-return] - - -def overlay_point_clouds_on_image( - base_image: np.ndarray, # type: ignore[type-arg] - point_clouds: list[o3d.geometry.PointCloud], - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] - colors: list[tuple[int, int, int]], - point_size: int = 2, - alpha: float = 0.7, -) -> np.ndarray: # type: ignore[type-arg] - """ - Overlay multiple colored point clouds onto an image. 
- - Args: - base_image: Base image to overlay onto (H, W, 3) - assumed to be RGB - point_clouds: List of Open3D point cloud objects - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - colors: List of RGB color tuples for each point cloud. If None, generates distinct colors. - point_size: Size of points to draw (in pixels) - alpha: Blending factor for overlay (0.0 = fully transparent, 1.0 = fully opaque) - - Returns: - Image with overlaid point clouds (H, W, 3) - """ - if len(point_clouds) == 0: - return base_image.copy() - - # Create overlay image - overlay = base_image.copy() - height, width = base_image.shape[:2] - - # Process each point cloud - for i, pcd in enumerate(point_clouds): - if pcd is None: - continue - - points_3d = np.asarray(pcd.points) - if len(points_3d) == 0: - continue - - # Project 3D points to 2D - points_2d = project_3d_points_to_2d(points_3d, camera_intrinsics) - - if len(points_2d) == 0: - continue - - # Filter points within image bounds - valid_mask = ( - (points_2d[:, 0] >= 0) - & (points_2d[:, 0] < width) - & (points_2d[:, 1] >= 0) - & (points_2d[:, 1] < height) - ) - valid_points_2d = points_2d[valid_mask] - - if len(valid_points_2d) == 0: - continue - - # Get color for this point cloud - color = colors[i % len(colors)] - - # Ensure color is a tuple of integers for OpenCV - if isinstance(color, list | tuple | np.ndarray): - color = tuple(int(c) for c in color[:3]) # type: ignore[assignment] - else: - color = (255, 255, 255) - - # Draw points on overlay - for point in valid_points_2d: - u, v = point - # Draw a small filled circle for each point - cv2.circle(overlay, (u, v), point_size, color, -1) - - # Blend overlay with base image - result = cv2.addWeighted(base_image, 1 - alpha, overlay, alpha, 0) - - return result - - -def create_point_cloud_overlay_visualization( - base_image: np.ndarray, # type: ignore[type-arg] - objects: list[dict], # type: ignore[type-arg] - intrinsics: np.ndarray, # type: 
ignore[type-arg] -) -> np.ndarray: # type: ignore[type-arg] - """ - Create a visualization showing object point clouds and bounding boxes overlaid on a base image. - - Args: - base_image: Base image to overlay onto (H, W, 3) - objects: List of object dictionaries containing 'point_cloud', 'color', 'position', 'rotation', 'size' keys - intrinsics: Camera intrinsics as [fx, fy, cx, cy] or 3x3 matrix - - Returns: - Visualization image with overlaid point clouds and bounding boxes (H, W, 3) - """ - # Extract point clouds and colors from objects - point_clouds = [] - colors = [] - for obj in objects: - if "point_cloud" in obj and obj["point_cloud"] is not None: - point_clouds.append(obj["point_cloud"]) - - # Convert color to tuple - color = obj["color"] - if isinstance(color, np.ndarray): - color = tuple(int(c) for c in color) - elif isinstance(color, list | tuple): - color = tuple(int(c) for c in color[:3]) - colors.append(color) - - # Create visualization - if point_clouds: - result = overlay_point_clouds_on_image( - base_image=base_image, - point_clouds=point_clouds, - camera_intrinsics=intrinsics, - colors=colors, - point_size=3, - alpha=0.8, - ) - else: - result = base_image.copy() - - # Draw 3D bounding boxes - height_img, width_img = result.shape[:2] - for i, obj in enumerate(objects): - if all(key in obj and obj[key] is not None for key in ["position", "rotation", "size"]): - try: - # Create and project 3D bounding box - corners_3d = create_3d_bounding_box_corners( - obj["position"], obj["rotation"], obj["size"] - ) - corners_2d = project_3d_points_to_2d(corners_3d, intrinsics) - - # Check if any corners are visible - valid_mask = ( - (corners_2d[:, 0] >= 0) - & (corners_2d[:, 0] < width_img) - & (corners_2d[:, 1] >= 0) - & (corners_2d[:, 1] < height_img) - ) - - if np.any(valid_mask): - # Get color - bbox_color = colors[i] if i < len(colors) else (255, 255, 255) - draw_3d_bounding_box_on_image(result, corners_2d, bbox_color, thickness=2) - except: - continue - 
- return result - - -def create_3d_bounding_box_corners(position, rotation, size: int): # type: ignore[no-untyped-def] - """ - Create 8 corners of a 3D bounding box from position, rotation, and size. - - Args: - position: Vector or dict with x, y, z coordinates - rotation: Vector or dict with roll, pitch, yaw angles - size: Dict with width, height, depth - - Returns: - 8x3 numpy array of corner coordinates - """ - # Convert position to numpy array - if hasattr(position, "x"): # Vector object - center = np.array([position.x, position.y, position.z]) - else: # Dictionary - center = np.array([position["x"], position["y"], position["z"]]) - - # Convert rotation (euler angles) to rotation matrix - if hasattr(rotation, "x"): # Vector object (roll, pitch, yaw) - roll, pitch, yaw = rotation.x, rotation.y, rotation.z - else: # Dictionary - roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"] - - # Create rotation matrix from euler angles (ZYX order) - cos_r, sin_r = np.cos(roll), np.sin(roll) - cos_p, sin_p = np.cos(pitch), np.sin(pitch) - cos_y, sin_y = np.cos(yaw), np.sin(yaw) - - # Rotation matrix for ZYX euler angles - R = np.array( - [ - [ - cos_y * cos_p, - cos_y * sin_p * sin_r - sin_y * cos_r, - cos_y * sin_p * cos_r + sin_y * sin_r, - ], - [ - sin_y * cos_p, - sin_y * sin_p * sin_r + cos_y * cos_r, - sin_y * sin_p * cos_r - cos_y * sin_r, - ], - [-sin_p, cos_p * sin_r, cos_p * cos_r], - ] - ) - - # Get dimensions - width = size.get("width", 0.1) # type: ignore[attr-defined] - height = size.get("height", 0.1) # type: ignore[attr-defined] - depth = size.get("depth", 0.1) # type: ignore[attr-defined] - - # Create 8 corners of the bounding box (before rotation) - corners = np.array( - [ - [-width / 2, -height / 2, -depth / 2], # 0 - [width / 2, -height / 2, -depth / 2], # 1 - [width / 2, height / 2, -depth / 2], # 2 - [-width / 2, height / 2, -depth / 2], # 3 - [-width / 2, -height / 2, depth / 2], # 4 - [width / 2, -height / 2, depth / 2], # 5 - 
[width / 2, height / 2, depth / 2], # 6 - [-width / 2, height / 2, depth / 2], # 7 - ] - ) - - # Apply rotation and translation - rotated_corners = corners @ R.T + center - - return rotated_corners - - -def draw_3d_bounding_box_on_image(image, corners_2d, color, thickness: int = 2) -> None: # type: ignore[no-untyped-def] - """ - Draw a 3D bounding box on an image using projected 2D corners. - - Args: - image: Image to draw on - corners_2d: 8x2 array of 2D corner coordinates - color: RGB color tuple - thickness: Line thickness - """ - # Define the 12 edges of a cube (connecting corner indices) - edges = [ - (0, 1), - (1, 2), - (2, 3), - (3, 0), # Bottom face - (4, 5), - (5, 6), - (6, 7), - (7, 4), # Top face - (0, 4), - (1, 5), - (2, 6), - (3, 7), # Vertical edges - ] - - # Draw each edge - for start_idx, end_idx in edges: - start_point = tuple(corners_2d[start_idx].astype(int)) - end_point = tuple(corners_2d[end_idx].astype(int)) - cv2.line(image, start_point, end_point, color, thickness) - - -def extract_and_cluster_misc_points( - full_pcd: o3d.geometry.PointCloud, - all_objects: list[dict], # type: ignore[type-arg] - eps: float = 0.03, - min_points: int = 100, - enable_filtering: bool = True, - voxel_size: float = 0.02, -) -> tuple[list[o3d.geometry.PointCloud], o3d.geometry.VoxelGrid]: - """ - Extract miscellaneous/background points and cluster them using DBSCAN. 
- - Args: - full_pcd: Complete scene point cloud - all_objects: List of objects with point clouds to subtract - eps: DBSCAN epsilon parameter (max distance between points in cluster) - min_points: DBSCAN min_samples parameter (min points to form cluster) - enable_filtering: Whether to apply statistical and radius filtering - voxel_size: Size of voxels for voxel grid generation - - Returns: - Tuple of (clustered_point_clouds, voxel_grid) - """ - if full_pcd is None or len(np.asarray(full_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - if not all_objects: - # If no objects detected, cluster the full point cloud - clusters = _cluster_point_cloud_dbscan(full_pcd, eps, min_points) - voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) - return clusters, voxel_grid - - try: - # Start with a copy of the full point cloud - misc_pcd = o3d.geometry.PointCloud(full_pcd) - - # Remove object points by combining all object point clouds - all_object_points = [] - for obj in all_objects: - if "point_cloud" in obj and obj["point_cloud"] is not None: - obj_points = np.asarray(obj["point_cloud"].points) - if len(obj_points) > 0: - all_object_points.append(obj_points) - - if not all_object_points: - # No object points to remove, cluster full point cloud - clusters = _cluster_point_cloud_dbscan(misc_pcd, eps, min_points) - voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) - return clusters, voxel_grid - - # Combine all object points - combined_obj_points = np.vstack(all_object_points) - - # For efficiency, downsample both point clouds - misc_downsampled = misc_pcd.voxel_down_sample(voxel_size=0.005) - - # Create object point cloud for efficient operations - obj_pcd = o3d.geometry.PointCloud() - obj_pcd.points = o3d.utility.Vector3dVector(combined_obj_points) - obj_downsampled = obj_pcd.voxel_down_sample(voxel_size=0.005) - - misc_points = np.asarray(misc_downsampled.points) - obj_points_down = np.asarray(obj_downsampled.points) - - if 
len(misc_points) == 0 or len(obj_points_down) == 0: - clusters = _cluster_point_cloud_dbscan(misc_downsampled, eps, min_points) - voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) - return clusters, voxel_grid - - # Build tree for object points - obj_tree = cKDTree(obj_points_down) - - # Find distances from misc points to nearest object points - distances, _ = obj_tree.query(misc_points, k=1) - - # Keep points that are far enough from any object point - threshold = 0.015 # 1.5cm threshold - keep_mask = distances > threshold - - if not np.any(keep_mask): - return [], o3d.geometry.VoxelGrid() - - # Filter misc points - misc_indices = np.where(keep_mask)[0] - final_misc_pcd = misc_downsampled.select_by_index(misc_indices) - - if len(np.asarray(final_misc_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - # Apply additional filtering if enabled - if enable_filtering: - # Apply statistical outlier filtering - filtered_misc_pcd, _ = filter_point_cloud_statistical( - final_misc_pcd, nb_neighbors=30, std_ratio=2.0 - ) - - if len(np.asarray(filtered_misc_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - # Apply radius outlier filtering - final_filtered_misc_pcd, _ = filter_point_cloud_radius( - filtered_misc_pcd, - nb_points=20, - radius=0.03, # 3cm radius - ) - - if len(np.asarray(final_filtered_misc_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - final_misc_pcd = final_filtered_misc_pcd - - # Cluster the misc points using DBSCAN - clusters = _cluster_point_cloud_dbscan(final_misc_pcd, eps, min_points) - - # Create voxel grid from all misc points (before clustering) - voxel_grid = _create_voxel_grid_from_point_cloud(final_misc_pcd, voxel_size) - - return clusters, voxel_grid - - except Exception as e: - print(f"Error in misc point extraction and clustering: {e}") - # Fallback: return downsampled full point cloud as single cluster - try: - downsampled = full_pcd.voxel_down_sample(voxel_size=0.02) - if 
len(np.asarray(downsampled.points)) > 0: - voxel_grid = _create_voxel_grid_from_point_cloud(downsampled, voxel_size) - return [downsampled], voxel_grid - else: - return [], o3d.geometry.VoxelGrid() - except: - return [], o3d.geometry.VoxelGrid() - - -def _create_voxel_grid_from_point_cloud( - pcd: o3d.geometry.PointCloud, voxel_size: float = 0.02 -) -> o3d.geometry.VoxelGrid: - """ - Create a voxel grid from a point cloud. - - Args: - pcd: Input point cloud - voxel_size: Size of each voxel - - Returns: - Open3D VoxelGrid object - """ - if len(np.asarray(pcd.points)) == 0: - return o3d.geometry.VoxelGrid() - - try: - # Create voxel grid from point cloud - voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size) - - # Color the voxels with a semi-transparent gray - for voxel in voxel_grid.get_voxels(): - voxel.color = [0.5, 0.5, 0.5] # Gray color - - print( - f"Created voxel grid with {len(voxel_grid.get_voxels())} voxels (voxel_size={voxel_size})" - ) - return voxel_grid - - except Exception as e: - print(f"Error creating voxel grid: {e}") - return o3d.geometry.VoxelGrid() - - -def _create_voxel_grid_from_clusters( - clusters: list[o3d.geometry.PointCloud], voxel_size: float = 0.02 -) -> o3d.geometry.VoxelGrid: - """ - Create a voxel grid from multiple clustered point clouds. 
- - Args: - clusters: List of clustered point clouds - voxel_size: Size of each voxel - - Returns: - Open3D VoxelGrid object - """ - if not clusters: - return o3d.geometry.VoxelGrid() - - # Combine all clusters into one point cloud - combined_points = [] - for cluster in clusters: - points = np.asarray(cluster.points) - if len(points) > 0: - combined_points.append(points) - - if not combined_points: - return o3d.geometry.VoxelGrid() - - # Create combined point cloud - all_points = np.vstack(combined_points) - combined_pcd = o3d.geometry.PointCloud() - combined_pcd.points = o3d.utility.Vector3dVector(all_points) - - return _create_voxel_grid_from_point_cloud(combined_pcd, voxel_size) - - -def _cluster_point_cloud_dbscan( - pcd: o3d.geometry.PointCloud, eps: float = 0.05, min_points: int = 50 -) -> list[o3d.geometry.PointCloud]: - """ - Cluster a point cloud using DBSCAN and return list of clustered point clouds. - - Args: - pcd: Point cloud to cluster - eps: DBSCAN epsilon parameter - min_points: DBSCAN min_samples parameter - - Returns: - List of point clouds, one for each cluster - """ - if len(np.asarray(pcd.points)) == 0: - return [] - - try: - # Apply DBSCAN clustering - labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_points)) - - # Get unique cluster labels (excluding noise points labeled as -1) - unique_labels = np.unique(labels) - cluster_pcds = [] - - for label in unique_labels: - if label == -1: # Skip noise points - continue - - # Get indices for this cluster - cluster_indices = np.where(labels == label)[0] - - if len(cluster_indices) > 0: - # Create point cloud for this cluster - cluster_pcd = pcd.select_by_index(cluster_indices) - - # Assign a random color to this cluster - cluster_color = np.random.rand(3) # Random RGB color - cluster_pcd.paint_uniform_color(cluster_color) - - cluster_pcds.append(cluster_pcd) - - print( - f"DBSCAN clustering found {len(cluster_pcds)} clusters from {len(np.asarray(pcd.points))} points" - ) - return 
cluster_pcds - - except Exception as e: - print(f"Error in DBSCAN clustering: {e}") - return [pcd] # Return original point cloud as fallback - - -def get_standard_coordinate_transform(): # type: ignore[no-untyped-def] - """ - Get a standard coordinate transformation matrix for consistent visualization. - - This transformation ensures that: - - X (red) axis points right - - Y (green) axis points up - - Z (blue) axis points toward viewer - - Returns: - 4x4 transformation matrix - """ - # Standard transformation matrix to ensure consistent coordinate frame orientation - transform = np.array( - [ - [1, 0, 0, 0], # X points right - [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) - [0, 0, -1, 0], # Z points toward viewer (flip depth) - [0, 0, 0, 1], - ] - ) - return transform - - -def visualize_clustered_point_clouds( - clustered_pcds: list[o3d.geometry.PointCloud], - window_name: str = "Clustered Point Clouds", - point_size: float = 2.0, - show_coordinate_frame: bool = True, - coordinate_frame_size: float = 0.1, -) -> None: - """ - Visualize multiple clustered point clouds with different colors. 
- - Args: - clustered_pcds: List of point clouds (already colored) - window_name: Name of the visualization window - point_size: Size of points in the visualization - show_coordinate_frame: Whether to show coordinate frame - coordinate_frame_size: Size of the coordinate frame - """ - if not clustered_pcds: - print("Warning: No clustered point clouds to visualize") - return - - # Apply standard coordinate transformation - transform = get_standard_coordinate_transform() # type: ignore[no-untyped-call] - geometries = [] - for pcd in clustered_pcds: - pcd_copy = o3d.geometry.PointCloud(pcd) - pcd_copy.transform(transform) - geometries.append(pcd_copy) - - # Add coordinate frame - if show_coordinate_frame: - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=coordinate_frame_size - ) - coordinate_frame.transform(transform) - geometries.append(coordinate_frame) - - total_points = sum(len(np.asarray(pcd.points)) for pcd in clustered_pcds) - print(f"Visualizing {len(clustered_pcds)} clusters with {total_points} total points") - - try: - vis = o3d.visualization.Visualizer() - vis.create_window(window_name=window_name, width=1280, height=720) - for geom in geometries: - vis.add_geometry(geom) - render_option = vis.get_render_option() - render_option.point_size = point_size - vis.run() - vis.destroy_window() - except Exception as e: - print(f"Failed to create interactive visualization: {e}") - o3d.visualization.draw_geometries( - geometries, window_name=window_name, width=1280, height=720 - ) - - -def visualize_pcd( - pcd: o3d.geometry.PointCloud, - window_name: str = "Point Cloud Visualization", - point_size: float = 1.0, - show_coordinate_frame: bool = True, - coordinate_frame_size: float = 0.1, -) -> None: - """ - Visualize an Open3D point cloud using Open3D's visualization window. 
- - Args: - pcd: Open3D point cloud to visualize - window_name: Name of the visualization window - point_size: Size of points in the visualization - show_coordinate_frame: Whether to show coordinate frame - coordinate_frame_size: Size of the coordinate frame - """ - if pcd is None: - print("Warning: Point cloud is None, nothing to visualize") - return - - if len(np.asarray(pcd.points)) == 0: - print("Warning: Point cloud is empty, nothing to visualize") - return - - # Apply standard coordinate transformation - transform = get_standard_coordinate_transform() # type: ignore[no-untyped-call] - pcd_copy = o3d.geometry.PointCloud(pcd) - pcd_copy.transform(transform) - geometries = [pcd_copy] - - # Add coordinate frame - if show_coordinate_frame: - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=coordinate_frame_size - ) - coordinate_frame.transform(transform) - geometries.append(coordinate_frame) - - print(f"Visualizing point cloud with {len(np.asarray(pcd.points))} points") - - try: - vis = o3d.visualization.Visualizer() - vis.create_window(window_name=window_name, width=1280, height=720) - for geom in geometries: - vis.add_geometry(geom) - render_option = vis.get_render_option() - render_option.point_size = point_size - vis.run() - vis.destroy_window() - except Exception as e: - print(f"Failed to create interactive visualization: {e}") - o3d.visualization.draw_geometries( - geometries, window_name=window_name, width=1280, height=720 - ) - - -def visualize_voxel_grid( - voxel_grid: o3d.geometry.VoxelGrid, - window_name: str = "Voxel Grid Visualization", - show_coordinate_frame: bool = True, - coordinate_frame_size: float = 0.1, -) -> None: - """ - Visualize an Open3D voxel grid using Open3D's visualization window. 
- - Args: - voxel_grid: Open3D voxel grid to visualize - window_name: Name of the visualization window - show_coordinate_frame: Whether to show coordinate frame - coordinate_frame_size: Size of the coordinate frame - """ - if voxel_grid is None: - print("Warning: Voxel grid is None, nothing to visualize") - return - - if len(voxel_grid.get_voxels()) == 0: - print("Warning: Voxel grid is empty, nothing to visualize") - return - - # VoxelGrid doesn't support transform, so we need to transform the source points instead - # For now, just visualize as-is with transformed coordinate frame - geometries = [voxel_grid] - - # Add coordinate frame - if show_coordinate_frame: - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=coordinate_frame_size - ) - coordinate_frame.transform(get_standard_coordinate_transform()) # type: ignore[no-untyped-call] - geometries.append(coordinate_frame) - - print(f"Visualizing voxel grid with {len(voxel_grid.get_voxels())} voxels") - - try: - vis = o3d.visualization.Visualizer() - vis.create_window(window_name=window_name, width=1280, height=720) - for geom in geometries: - vis.add_geometry(geom) - vis.run() - vis.destroy_window() - except Exception as e: - print(f"Failed to create interactive visualization: {e}") - o3d.visualization.draw_geometries( - geometries, window_name=window_name, width=1280, height=720 - ) - - -def combine_object_pointclouds( - point_clouds: list[np.ndarray] | list[o3d.geometry.PointCloud], # type: ignore[type-arg] - colors: list[np.ndarray] | None = None, # type: ignore[type-arg] -) -> o3d.geometry.PointCloud: - """ - Combine multiple point clouds into a single Open3D point cloud. 
- - Args: - point_clouds: List of point clouds as numpy arrays or Open3D point clouds - colors: List of colors as numpy arrays - Returns: - Combined Open3D point cloud - """ - all_points = [] - all_colors = [] - - for i, pcd in enumerate(point_clouds): - if isinstance(pcd, np.ndarray): - points = pcd[:, :3] - all_points.append(points) - if colors: - all_colors.append(colors[i]) - - elif isinstance(pcd, o3d.geometry.PointCloud): - points = np.asarray(pcd.points) - all_points.append(points) - if pcd.has_colors(): - colors = np.asarray(pcd.colors) # type: ignore[assignment] - all_colors.append(colors) # type: ignore[arg-type] - - if not all_points: - return o3d.geometry.PointCloud() - - combined_pcd = o3d.geometry.PointCloud() - combined_pcd.points = o3d.utility.Vector3dVector(np.vstack(all_points)) - - if all_colors: - combined_pcd.colors = o3d.utility.Vector3dVector(np.vstack(all_colors)) - - return combined_pcd - - -def extract_centroids_from_masks( - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - masks: list[np.ndarray], # type: ignore[type-arg] - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] -) -> list[dict[str, Any]]: - """ - Extract 3D centroids and orientations from segmentation masks. 
- - Args: - rgb_image: RGB image (H, W, 3) - depth_image: Depth image (H, W) in meters - masks: List of boolean masks (H, W) - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] or 3x3 matrix - - Returns: - List of dictionaries containing: - - centroid: 3D centroid position [x, y, z] in camera frame - - orientation: Normalized direction vector from camera to centroid - - num_points: Number of valid 3D points - - mask_idx: Index of the mask in the input list - """ - # Extract camera parameters - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - else: - fx = camera_intrinsics[0, 0] # type: ignore[call-overload] - fy = camera_intrinsics[1, 1] # type: ignore[call-overload] - cx = camera_intrinsics[0, 2] # type: ignore[call-overload] - cy = camera_intrinsics[1, 2] # type: ignore[call-overload] - - results = [] - - for mask_idx, mask in enumerate(masks): - if mask is None or mask.sum() == 0: - continue - - # Get pixel coordinates where mask is True - y_coords, x_coords = np.where(mask) - - # Get depth values at mask locations - depths = depth_image[y_coords, x_coords] - - # Convert to 3D points in camera frame - X = (x_coords - cx) * depths / fx - Y = (y_coords - cy) * depths / fy - Z = depths - - # Calculate centroid - centroid_x = np.mean(X) - centroid_y = np.mean(Y) - centroid_z = np.mean(Z) - centroid = np.array([centroid_x, centroid_y, centroid_z]) - - # Calculate orientation as normalized direction from camera origin to centroid - # Camera origin is at (0, 0, 0) - orientation = centroid / np.linalg.norm(centroid) - - results.append( - { - "centroid": centroid, - "orientation": orientation, - "num_points": int(mask.sum()), - "mask_idx": mask_idx, - } - ) - - return results diff --git a/dimos/perception/segmentation/__init__.py b/dimos/perception/segmentation/__init__.py deleted file mode 100644 index a48a76d6a4..0000000000 --- a/dimos/perception/segmentation/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ 
-from .sam_2d_seg import * -from .utils import * diff --git a/dimos/perception/segmentation/config/custom_tracker.yaml b/dimos/perception/segmentation/config/custom_tracker.yaml deleted file mode 100644 index 7a6748ebf6..0000000000 --- a/dimos/perception/segmentation/config/custom_tracker.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license - -# Default Ultralytics settings for BoT-SORT tracker when using mode="track" -# For documentation and examples see https://docs.ultralytics.com/modes/track/ -# For BoT-SORT source code see https://github.com/NirAharon/BoT-SORT - -tracker_type: botsort # tracker type, ['botsort', 'bytetrack'] -track_high_thresh: 0.4 # threshold for the first association -track_low_thresh: 0.2 # threshold for the second association -new_track_thresh: 0.5 # threshold for init new track if the detection does not match any tracks -track_buffer: 100 # buffer to calculate the time when to remove tracks -match_thresh: 0.4 # threshold for matching tracks -fuse_score: False # Whether to fuse confidence scores with the iou distances before matching -# min_box_area: 10 # threshold for min box areas(for tracker evaluation, not used for now) - -# BoT-SORT settings -gmc_method: sparseOptFlow # method of global motion compensation -# ReID model related thresh (not supported yet) -proximity_thresh: 0.6 -appearance_thresh: 0.35 -with_reid: False diff --git a/dimos/perception/segmentation/image_analyzer.py b/dimos/perception/segmentation/image_analyzer.py deleted file mode 100644 index 06db712ac7..0000000000 --- a/dimos/perception/segmentation/image_analyzer.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import base64 -import os - -import cv2 -from openai import OpenAI - -NORMAL_PROMPT = "What are in these images? Give a short word answer with at most two words, \ - if not sure, give a description of its shape or color like 'small tube', 'blue item'. \" \ - if does not look like an object, say 'unknown'. Export objects as a list of strings \ - in this exact format '['object 1', 'object 2', '...']'." - -RICH_PROMPT = ( - "What are in these images? Give a detailed description of each item, the first n images will be \ - cropped patches of the original image detected by the object detection model. \ - The last image will be the original image. Use the last image only for context, \ - do not describe objects in the last image. \ - Export the objects as a list of strings in this exact format, '['description of object 1', '...', '...']', \ - don't include anything else. " -) - - -class ImageAnalyzer: - def __init__(self) -> None: - """ - Initializes the ImageAnalyzer with OpenAI API credentials. - """ - self.client = OpenAI() - - def encode_image(self, image): # type: ignore[no-untyped-def] - """ - Encodes an image to Base64. - - Parameters: - image (numpy array): Image array (BGR format). - - Returns: - str: Base64 encoded string of the image. - """ - _, buffer = cv2.imencode(".jpg", image) - return base64.b64encode(buffer).decode("utf-8") - - def analyze_images(self, images, detail: str = "auto", prompt_type: str = "normal"): # type: ignore[no-untyped-def] - """ - Takes a list of cropped images and returns descriptions from OpenAI's Vision model. 
- - Parameters: - images (list of numpy arrays): Cropped images from the original frame. - detail (str): "low", "high", or "auto" to set image processing detail. - prompt_type (str): "normal" or "rich" to set the prompt type. - - Returns: - list of str: Descriptions of objects in each image. - """ - image_data = [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{self.encode_image(img)}", # type: ignore[no-untyped-call] - "detail": detail, - }, - } - for img in images - ] - - if prompt_type == "normal": - prompt = NORMAL_PROMPT - elif prompt_type == "rich": - prompt = RICH_PROMPT - else: - raise ValueError(f"Invalid prompt type: {prompt_type}") - - response = self.client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - { # type: ignore[list-item, misc] - "role": "user", - "content": [{"type": "text", "text": prompt}, *image_data], - } - ], - max_tokens=300, - timeout=5, - ) - - # Accessing the content of the response using dot notation - return next(choice.message.content for choice in response.choices) - - -def main() -> None: - # Define the directory containing cropped images - cropped_images_dir = "cropped_images" - if not os.path.exists(cropped_images_dir): - print(f"Directory '{cropped_images_dir}' does not exist.") - return - - # Load all images from the directory - images = [] - for filename in os.listdir(cropped_images_dir): - if filename.endswith(".jpg") or filename.endswith(".png"): - image_path = os.path.join(cropped_images_dir, filename) - image = cv2.imread(image_path) - if image is not None: - images.append(image) - else: - print(f"Warning: Could not read image {image_path}") - - if not images: - print("No valid images found in the directory.") - return - - # Initialize ImageAnalyzer - analyzer = ImageAnalyzer() - - # Analyze images - results = analyzer.analyze_images(images) - - # Split results into a list of items - object_list = [item.strip()[2:] for item in results.split("\n")] - - # Overlay text on 
images and display them - for i, (img, obj) in enumerate(zip(images, object_list, strict=False)): - if obj: # Only process non-empty lines - # Add text to image - font = cv2.FONT_HERSHEY_SIMPLEX - font_scale = 0.5 - thickness = 2 - text = obj.strip() - - # Get text size - (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, thickness) - - # Position text at top of image - x = 10 - y = text_height + 10 - - # Add white background for text - cv2.rectangle( - img, (x - 5, y - text_height - 5), (x + text_width + 5, y + 5), (255, 255, 255), -1 - ) - # Add text - cv2.putText(img, text, (x, y), font, font_scale, (0, 0, 0), thickness) - - # Save or display the image - cv2.imwrite(f"annotated_image_{i}.jpg", img) - print(f"Detected object: {obj}") - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py deleted file mode 100644 index 741f71a9ab..0000000000 --- a/dimos/perception/segmentation/sam_2d_seg.py +++ /dev/null @@ -1,366 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from collections import deque -from collections.abc import Sequence -from concurrent.futures import ThreadPoolExecutor -import os -import time - -import cv2 -import onnxruntime # type: ignore[import-untyped] -import torch -from ultralytics import FastSAM # type: ignore[attr-defined, import-not-found] - -from dimos.perception.common.detection2d_tracker import get_tracked_results, target2dTracker -from dimos.perception.segmentation.image_analyzer import ImageAnalyzer -from dimos.perception.segmentation.utils import ( - crop_images_from_bboxes, - extract_masks_bboxes_probs_names, - filter_segmentation_results, - plot_results, -) -from dimos.utils.data import get_data -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class Sam2DSegmenter: - def __init__( - self, - model_path: str = "models_fastsam", - model_name: str = "FastSAM-s.onnx", - min_analysis_interval: float = 5.0, - use_tracker: bool = True, - use_analyzer: bool = True, - use_rich_labeling: bool = False, - use_filtering: bool = True, - ) -> None: - # Use GPU if available, otherwise fall back to CPU - if torch.cuda.is_available(): - logger.info("Using CUDA for SAM 2d segmenter") - if hasattr(onnxruntime, "preload_dlls"): # Handles CUDA 11 / onnxruntime-gpu<=1.18 - onnxruntime.preload_dlls(cuda=True, cudnn=True) - self.device = "cuda" - # MacOS Metal performance shaders - elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): - logger.info("Using Metal for SAM 2d segmenter") - self.device = "mps" - else: - logger.info("Using CPU for SAM 2d segmenter") - self.device = "cpu" - - # Core components - self.model = FastSAM(get_data(model_path) / model_name) - self.use_tracker = use_tracker - self.use_analyzer = use_analyzer - self.use_rich_labeling = use_rich_labeling - self.use_filtering = use_filtering - - module_dir = os.path.dirname(__file__) - self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml") - - # Initialize tracker if 
enabled - if self.use_tracker: - self.tracker = target2dTracker( - history_size=80, - score_threshold_start=0.7, - score_threshold_stop=0.05, - min_frame_count=10, - max_missed_frames=50, - min_area_ratio=0.05, - max_area_ratio=0.4, - texture_range=(0.0, 0.35), - border_safe_distance=100, - weights={"prob": 1.0, "temporal": 3.0, "texture": 2.0, "border": 3.0, "size": 1.0}, - ) - - # Initialize analyzer components if enabled - if self.use_analyzer: - self.image_analyzer = ImageAnalyzer() - self.min_analysis_interval = min_analysis_interval - self.last_analysis_time = 0 - self.to_be_analyzed = deque() # type: ignore[var-annotated] - self.object_names = {} # type: ignore[var-annotated] - self.analysis_executor = ThreadPoolExecutor(max_workers=1) - self.current_future = None - self.current_queue_ids = None - - def process_image(self, image): # type: ignore[no-untyped-def] - """Process an image and return segmentation results.""" - results = self.model.track( - source=image, - device=self.device, - retina_masks=True, - conf=0.3, - iou=0.5, - persist=True, - verbose=False, - ) - - if len(results) > 0: - # Get initial segmentation results - masks, bboxes, track_ids, probs, names, areas = extract_masks_bboxes_probs_names( - results[0] - ) - - # Filter results - if self.use_filtering: - ( - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - filtered_texture_values, - ) = filter_segmentation_results( - image, masks, bboxes, track_ids, probs, names, areas - ) - else: - # Use original results without filtering - filtered_masks = masks - filtered_bboxes = bboxes - filtered_track_ids = track_ids - filtered_probs = probs - filtered_names = names - filtered_texture_values = [] - - if self.use_tracker: - # Update tracker with filtered results - tracked_targets = self.tracker.update( - image, - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - filtered_texture_values, - ) - - # Get tracked 
results - tracked_masks, tracked_bboxes, tracked_target_ids, tracked_probs, tracked_names = ( - get_tracked_results(tracked_targets) # type: ignore[no-untyped-call] - ) - - if self.use_analyzer: - # Update analysis queue with tracked IDs - target_id_set = set(tracked_target_ids) - - # Remove untracked objects from object_names - all_target_ids = list(self.tracker.targets.keys()) - self.object_names = { - track_id: name - for track_id, name in self.object_names.items() - if track_id in all_target_ids - } - - # Remove untracked objects from queue and results - self.to_be_analyzed = deque( - [track_id for track_id in self.to_be_analyzed if track_id in target_id_set] - ) - - # Filter out any IDs being analyzed from the to_be_analyzed queue - if self.current_queue_ids: - self.to_be_analyzed = deque( - [ - tid - for tid in self.to_be_analyzed - if tid not in self.current_queue_ids - ] - ) - - # Add new track_ids to analysis queue - for track_id in tracked_target_ids: - if ( - track_id not in self.object_names - and track_id not in self.to_be_analyzed - ): - self.to_be_analyzed.append(track_id) - - return ( - tracked_masks, - tracked_bboxes, - tracked_target_ids, - tracked_probs, - tracked_names, - ) - else: - # When tracker disabled, just use the filtered results directly - if self.use_analyzer: - # Add unanalyzed IDs to the analysis queue - for track_id in filtered_track_ids: - if ( - track_id not in self.object_names - and track_id not in self.to_be_analyzed - ): - self.to_be_analyzed.append(track_id) - - # Simply return filtered results - return ( - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - ) - return [], [], [], [], [] - - def check_analysis_status(self, tracked_target_ids): # type: ignore[no-untyped-def] - """Check if analysis is complete and prepare new queue if needed.""" - if not self.use_analyzer: - return None, None - - current_time = time.time() - - # Check if current queue analysis is complete - if 
self.current_future and self.current_future.done(): - try: - results = self.current_future.result() - if results is not None: - # Map results to track IDs - object_list = eval(results) - for track_id, result in zip(self.current_queue_ids, object_list, strict=False): - self.object_names[track_id] = result - except Exception as e: - print(f"Queue analysis failed: {e}") - self.current_future = None - self.current_queue_ids = None - self.last_analysis_time = current_time - - # If enough time has passed and we have items to analyze, start new analysis - if ( - not self.current_future - and self.to_be_analyzed - and current_time - self.last_analysis_time >= self.min_analysis_interval - ): - queue_indices = [] - queue_ids = [] - - # Collect all valid track IDs from the queue - while self.to_be_analyzed: - track_id = self.to_be_analyzed[0] - if track_id in tracked_target_ids: - bbox_idx = tracked_target_ids.index(track_id) - queue_indices.append(bbox_idx) - queue_ids.append(track_id) - self.to_be_analyzed.popleft() - - if queue_indices: - return queue_indices, queue_ids - return None, None - - def run_analysis(self, frame, tracked_bboxes, tracked_target_ids) -> None: # type: ignore[no-untyped-def] - """Run queue image analysis in background.""" - if not self.use_analyzer: - return - - queue_indices, queue_ids = self.check_analysis_status(tracked_target_ids) # type: ignore[no-untyped-call] - if queue_indices: - selected_bboxes = [tracked_bboxes[i] for i in queue_indices] - cropped_images = crop_images_from_bboxes(frame, selected_bboxes) - if cropped_images: - self.current_queue_ids = queue_ids - print(f"Analyzing objects with track_ids: {queue_ids}") - - if self.use_rich_labeling: - prompt_type = "rich" - cropped_images.append(frame) - else: - prompt_type = "normal" - - self.current_future = self.analysis_executor.submit( # type: ignore[assignment] - self.image_analyzer.analyze_images, cropped_images, prompt_type=prompt_type - ) - - def get_object_names(self, track_ids, 
tracked_names: Sequence[str]): # type: ignore[no-untyped-def] - """Get object names for the given track IDs, falling back to tracked names.""" - if not self.use_analyzer: - return tracked_names - - return [ - self.object_names.get(track_id, tracked_name) - for track_id, tracked_name in zip(track_ids, tracked_names, strict=False) - ] - - def visualize_results( # type: ignore[no-untyped-def] - self, image, masks, bboxes, track_ids, probs: Sequence[float], names: Sequence[str] - ): - """Generate an overlay visualization with segmentation results and object names.""" - return plot_results(image, masks, bboxes, track_ids, probs, names) - - def cleanup(self) -> None: - """Cleanup resources.""" - if self.use_analyzer: - self.analysis_executor.shutdown() - - -def main() -> None: - # Example usage with different configurations - cap = cv2.VideoCapture(0) - - # Example 1: Full functionality with rich labeling - segmenter = Sam2DSegmenter( - min_analysis_interval=4.0, - use_tracker=True, - use_analyzer=True, - use_rich_labeling=True, # Enable rich labeling - ) - - # Example 2: Full functionality with normal labeling - # segmenter = Sam2DSegmenter(min_analysis_interval=4.0, use_tracker=True, use_analyzer=True) - - # Example 3: Tracker only (analyzer disabled) - # segmenter = Sam2DSegmenter(use_analyzer=False) - - # Example 4: Basic segmentation only (both tracker and analyzer disabled) - # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=False) - - # Example 5: Analyzer without tracker (new capability) - # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=True) - - try: - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - break - - time.time() - - # Process image and get results - masks, bboxes, target_ids, probs, names = segmenter.process_image(frame) # type: ignore[no-untyped-call] - - # Run analysis if enabled - if segmenter.use_analyzer: - segmenter.run_analysis(frame, bboxes, target_ids) - names = segmenter.get_object_names(target_ids, 
names) - - # processing_time = time.time() - start_time - # print(f"Processing time: {processing_time:.2f}s") - - overlay = segmenter.visualize_results(frame, masks, bboxes, target_ids, probs, names) - - cv2.imshow("Segmentation", overlay) - key = cv2.waitKey(1) - if key & 0xFF == ord("q"): - break - - finally: - segmenter.cleanup() - cap.release() - cv2.destroyAllWindows() - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/segmentation/test_sam_2d_seg.py b/dimos/perception/segmentation/test_sam_2d_seg.py deleted file mode 100644 index a9222ed2f2..0000000000 --- a/dimos/perception/segmentation/test_sam_2d_seg.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -import numpy as np -import pytest -from reactivex import operators as ops - -from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter -from dimos.perception.segmentation.utils import extract_masks_bboxes_probs_names -from dimos.stream.video_provider import VideoProvider - - -@pytest.mark.heavy -class TestSam2DSegmenter: - def test_sam_segmenter_initialization(self) -> None: - """Test FastSAM segmenter initializes correctly with default model path.""" - try: - # Try to initialize with the default model path and existing device setting - segmenter = Sam2DSegmenter(use_analyzer=False) - assert segmenter is not None - assert segmenter.model is not None - except Exception as e: - # If the model file doesn't exist, the test should still pass with a warning - pytest.skip(f"Skipping test due to model initialization error: {e}") - - def test_sam_segmenter_process_image(self) -> None: - """Test FastSAM segmenter can process video frames and return segmentation masks.""" - # Import get data inside method to avoid pytest fixture confusion - from dimos.utils.data import get_data - - # Get test video path directly - video_path = get_data("assets") / "trimmed_video_office.mov" - try: - # Initialize segmenter without analyzer for faster testing - segmenter = Sam2DSegmenter(use_analyzer=False) - - # Note: conf and iou are parameters for process_image, not constructor - # We'll monkey patch the process_image method to use lower thresholds - - def patched_process_image(image): - results = segmenter.model.track( - source=image, - device=segmenter.device, - retina_masks=True, - conf=0.1, # Lower confidence threshold for testing - iou=0.5, # Lower IoU threshold - persist=True, - verbose=False, - tracker=segmenter.tracker_config - if hasattr(segmenter, "tracker_config") - else None, - ) - - if len(results) > 0: - masks, bboxes, track_ids, probs, names, _areas = ( - extract_masks_bboxes_probs_names(results[0]) - ) - return masks, bboxes, track_ids, 
probs, names - return [], [], [], [], [] - - # Replace the method - segmenter.process_image = patched_process_image - - # Create video provider and directly get a video stream observable - assert os.path.exists(video_path), f"Test video not found: {video_path}" - video_provider = VideoProvider(dev_name="test_video", video_source=video_path) - - video_stream = video_provider.capture_video_as_observable(realtime=False, fps=1) - - # Use ReactiveX operators to process the stream - def process_frame(frame): - try: - # Process frame with FastSAM - masks, bboxes, track_ids, probs, names = segmenter.process_image(frame) - print( - f"SAM results - masks: {len(masks)}, bboxes: {len(bboxes)}, track_ids: {len(track_ids)}, names: {len(names)}" - ) - - return { - "frame": frame, - "masks": masks, - "bboxes": bboxes, - "track_ids": track_ids, - "probs": probs, - "names": names, - } - except Exception as e: - print(f"Error in process_frame: {e}") - return {} - - # Create the segmentation stream using pipe and map operator - segmentation_stream = video_stream.pipe(ops.map(process_frame)) - - # Collect results from the stream - results = [] - frames_processed = 0 - target_frames = 5 - - def on_next(result) -> None: - nonlocal frames_processed, results - if not result: - return - - results.append(result) - frames_processed += 1 - - # Stop processing after target frames - if frames_processed >= target_frames: - subscription.dispose() - - def on_error(error) -> None: - pytest.fail(f"Error in segmentation stream: {error}") - - def on_completed() -> None: - pass - - # Subscribe and wait for results - subscription = segmentation_stream.subscribe( - on_next=on_next, on_error=on_error, on_completed=on_completed - ) - - # Wait for frames to be processed - timeout = 30.0 # seconds - start_time = time.time() - while frames_processed < target_frames and time.time() - start_time < timeout: - time.sleep(0.5) - - # Clean up subscription - subscription.dispose() - video_provider.dispose_all() - - # 
Check if we have results - if len(results) == 0: - pytest.skip( - "No segmentation results found, but test connection established correctly" - ) - return - - print(f"Processed {len(results)} frames with segmentation results") - - # Analyze the first result - result = results[0] - - # Check that we have a frame - assert "frame" in result, "Result doesn't contain a frame" - assert isinstance(result["frame"], np.ndarray), "Frame is not a numpy array" - - # Check that segmentation results are valid - assert isinstance(result["masks"], list) - assert isinstance(result["bboxes"], list) - assert isinstance(result["track_ids"], list) - assert isinstance(result["probs"], list) - assert isinstance(result["names"], list) - - # All result lists should be the same length - assert ( - len(result["masks"]) - == len(result["bboxes"]) - == len(result["track_ids"]) - == len(result["probs"]) - == len(result["names"]) - ) - - # If we have masks, check that they have valid shape - if result.get("masks") and len(result["masks"]) > 0: - assert result["masks"][0].shape == ( - result["frame"].shape[0], - result["frame"].shape[1], - ), "Mask shape should match image dimensions" - print(f"Found {len(result['masks'])} masks in first frame") - else: - print("No masks found in first frame, but test connection established correctly") - - # Test visualization function - if result["masks"]: - vis_frame = segmenter.visualize_results( - result["frame"], - result["masks"], - result["bboxes"], - result["track_ids"], - result["probs"], - result["names"], - ) - assert isinstance(vis_frame, np.ndarray), "Visualization output should be an image" - assert vis_frame.shape == result["frame"].shape, ( - "Visualization should have same dimensions as input frame" - ) - - # We've already tested visualization above, so no need for a duplicate test - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - -if __name__ == "__main__": - pytest.main(["-v", __file__]) diff --git 
a/dimos/perception/segmentation/utils.py b/dimos/perception/segmentation/utils.py deleted file mode 100644 index a23a256ca2..0000000000 --- a/dimos/perception/segmentation/utils.py +++ /dev/null @@ -1,343 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Sequence - -import cv2 -import numpy as np -import torch - - -class SimpleTracker: - def __init__( - self, history_size: int = 100, min_count: int = 10, count_window: int = 20 - ) -> None: - """ - Simple temporal tracker that counts appearances in a fixed window. 
- :param history_size: Number of past frames to remember - :param min_count: Minimum number of appearances required - :param count_window: Number of latest frames to consider for counting - """ - self.history = [] # type: ignore[var-annotated] - self.history_size = history_size - self.min_count = min_count - self.count_window = count_window - self.total_counts = {} # type: ignore[var-annotated] - - def update(self, track_ids): # type: ignore[no-untyped-def] - # Add new frame's track IDs to history - self.history.append(track_ids) - if len(self.history) > self.history_size: - self.history.pop(0) - - # Consider only the latest `count_window` frames for counting - recent_history = self.history[-self.count_window :] - all_tracks = np.concatenate(recent_history) if recent_history else np.array([]) - - # Compute occurrences efficiently using numpy - unique_ids, counts = np.unique(all_tracks, return_counts=True) - id_counts = dict(zip(unique_ids, counts, strict=False)) - - # Update total counts but ensure it only contains IDs within the history size - total_tracked_ids = np.concatenate(self.history) if self.history else np.array([]) - unique_total_ids, total_counts = np.unique(total_tracked_ids, return_counts=True) - self.total_counts = dict(zip(unique_total_ids, total_counts, strict=False)) - - # Return IDs that appear often enough - return [track_id for track_id, count in id_counts.items() if count >= self.min_count] - - def get_total_counts(self): # type: ignore[no-untyped-def] - """Returns the total count of each tracking ID seen over time, limited to history size.""" - return self.total_counts - - -def extract_masks_bboxes_probs_names(result, max_size: float = 0.7): # type: ignore[no-untyped-def] - """ - Extracts masks, bounding boxes, probabilities, and class names from one Ultralytics result object. 
- - Parameters: - result: Ultralytics result object - max_size: float, maximum allowed size of object relative to image (0-1) - - Returns: - tuple: (masks, bboxes, track_ids, probs, names, areas) - """ - masks = [] # type: ignore[var-annotated] - bboxes = [] # type: ignore[var-annotated] - track_ids = [] # type: ignore[var-annotated] - probs = [] # type: ignore[var-annotated] - names = [] # type: ignore[var-annotated] - areas = [] # type: ignore[var-annotated] - - if result.masks is None: - return masks, bboxes, track_ids, probs, names, areas - - total_area = result.masks.orig_shape[0] * result.masks.orig_shape[1] - - for box, mask_data in zip(result.boxes, result.masks.data, strict=False): - mask_numpy = mask_data - - # Extract bounding box - x1, y1, x2, y2 = box.xyxy[0].tolist() - - # Extract track_id if available - track_id = -1 # default if no tracking - if hasattr(box, "id") and box.id is not None: - track_id = int(box.id[0].item()) - - # Extract probability and class index - conf = float(box.conf[0]) - cls_idx = int(box.cls[0]) - area = (x2 - x1) * (y2 - y1) - - if area / total_area > max_size: - continue - - masks.append(mask_numpy) - bboxes.append([x1, y1, x2, y2]) - track_ids.append(track_id) - probs.append(conf) - names.append(result.names[cls_idx]) - areas.append(area) - - return masks, bboxes, track_ids, probs, names, areas - - -def compute_texture_map(frame, blur_size: int = 3): # type: ignore[no-untyped-def] - """ - Compute texture map using gradient statistics. - Returns high values for textured regions and low values for smooth regions. 
- - Parameters: - frame: BGR image - blur_size: Size of Gaussian blur kernel for pre-processing - - Returns: - numpy array: Texture map with values normalized to [0,1] - """ - # Convert to grayscale - if len(frame.shape) == 3: - gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) - else: - gray = frame - - # Pre-process with slight blur to reduce noise - if blur_size > 0: - gray = cv2.GaussianBlur(gray, (blur_size, blur_size), 0) - - # Compute gradients in x and y directions - grad_x = cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=3) - grad_y = cv2.Sobel(gray, cv2.CV_32F, 0, 1, ksize=3) - - # Compute gradient magnitude and direction - magnitude = np.sqrt(grad_x**2 + grad_y**2) - - # Compute local standard deviation of gradient magnitude - texture_map = cv2.GaussianBlur(magnitude, (15, 15), 0) - - # Normalize to [0,1] - texture_map = (texture_map - texture_map.min()) / (texture_map.max() - texture_map.min() + 1e-8) - - return texture_map - - -def filter_segmentation_results( # type: ignore[no-untyped-def] - frame, - masks, - bboxes, - track_ids, - probs: Sequence[float], - names: Sequence[str], - areas, - texture_threshold: float = 0.07, - size_filter: int = 800, -): - """ - Filters segmentation results using both overlap and saliency detection. - Uses mask_sum tensor for efficient overlap detection. 
- - Parameters: - masks: list of torch.Tensor containing mask data - bboxes: list of bounding boxes [x1, y1, x2, y2] - track_ids: list of tracking IDs - probs: list of confidence scores - names: list of class names - areas: list of object areas - frame: BGR image for computing saliency - texture_threshold: Average texture value required for mask to be kept - size_filter: Minimum size of the object to be kept - - Returns: - tuple: (filtered_masks, filtered_bboxes, filtered_track_ids, filtered_probs, filtered_names, filtered_texture_values, texture_map) - """ - if len(masks) <= 1: - return masks, bboxes, track_ids, probs, names, [] - - # Compute texture map once and convert to tensor - texture_map = compute_texture_map(frame) - - # Sort by area (smallest to largest) - sorted_indices = torch.tensor(areas).argsort(descending=False) - - device = masks[0].device # Get the device of the first mask - - # Create mask_sum tensor where each pixel stores the index of the mask that claims it - mask_sum = torch.zeros_like(masks[0], dtype=torch.int32) - - texture_map = torch.from_numpy(texture_map).to( - device - ) # Convert texture_map to tensor and move to device - - filtered_texture_values = [] # List to store texture values of filtered masks - - for i, idx in enumerate(sorted_indices): - mask = masks[idx] - # Compute average texture value within mask - texture_value = torch.mean(texture_map[mask > 0]) if torch.any(mask > 0) else 0 - - # Only claim pixels if mask passes texture threshold - if texture_value >= texture_threshold: - mask_sum[mask > 0] = i - filtered_texture_values.append( - texture_value.item() # type: ignore[union-attr] - ) # Store the texture value as a Python float - - # Get indices that appear in mask_sum (these are the masks we want to keep) - keep_indices, counts = torch.unique(mask_sum[mask_sum > 0], return_counts=True) - size_indices = counts > size_filter - keep_indices = keep_indices[size_indices] - - sorted_indices = sorted_indices.cpu() - keep_indices 
= keep_indices.cpu() - - # Map back to original indices and filter - final_indices = sorted_indices[keep_indices].tolist() - - filtered_masks = [masks[i] for i in final_indices] - filtered_bboxes = [bboxes[i] for i in final_indices] - filtered_track_ids = [track_ids[i] for i in final_indices] - filtered_probs = [probs[i] for i in final_indices] - filtered_names = [names[i] for i in final_indices] - - return ( - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - filtered_texture_values, - ) - - -def plot_results( # type: ignore[no-untyped-def] - image, - masks, - bboxes, - track_ids, - probs: Sequence[float], - names: Sequence[str], - alpha: float = 0.5, -): - """ - Draws bounding boxes, masks, and labels on the given image with enhanced visualization. - Includes object names in the overlay and improved text visibility. - """ - h, w = image.shape[:2] - overlay = image.copy() - - for mask, bbox, track_id, prob, name in zip( - masks, bboxes, track_ids, probs, names, strict=False - ): - # Convert mask tensor to numpy if needed - if isinstance(mask, torch.Tensor): - mask = mask.cpu().numpy() - - # Ensure mask is in proper format for OpenCV resize - if mask.dtype == bool: - mask = mask.astype(np.uint8) - elif mask.dtype != np.uint8 and mask.dtype != np.float32: - mask = mask.astype(np.float32) - - mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_LINEAR) - - # Generate consistent color based on track_id - if track_id != -1: - np.random.seed(track_id) - color = np.random.randint(0, 255, (3,), dtype=np.uint8) - np.random.seed(None) - else: - color = np.random.randint(0, 255, (3,), dtype=np.uint8) - - # Apply mask color - overlay[mask_resized > 0.5] = color - - # Draw bounding box - x1, y1, x2, y2 = map(int, bbox) - cv2.rectangle(overlay, (x1, y1), (x2, y2), color.tolist(), 2) - - # Prepare label text - label = f"ID:{track_id} {prob:.2f}" - if name: # Add object name if available - label += f" {name}" - - # Calculate 
text size for background rectangle - (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - - # Draw background rectangle for text - cv2.rectangle(overlay, (x1, y1 - text_h - 8), (x1 + text_w + 4, y1), color.tolist(), -1) - - # Draw text with white color for better visibility - cv2.putText( - overlay, - label, - (x1 + 2, y1 - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 255, 255), # White text - 1, - ) - - # Blend overlay with original image - result = cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0) - return result - - -def crop_images_from_bboxes(image, bboxes, buffer: int = 0): # type: ignore[no-untyped-def] - """ - Crops regions from an image based on bounding boxes with an optional buffer. - - Parameters: - image (numpy array): Input image. - bboxes (list of lists): List of bounding boxes [x1, y1, x2, y2]. - buffer (int): Number of pixels to expand each bounding box. - - Returns: - list of numpy arrays: Cropped image regions. - """ - height, width, _ = image.shape - cropped_images = [] - - for bbox in bboxes: - x1, y1, x2, y2 = bbox - - # Apply buffer - x1 = max(0, x1 - buffer) - y1 = max(0, y1 - buffer) - x2 = min(width, x2 + buffer) - y2 = min(height, y2 + buffer) - - cropped_image = image[int(y1) : int(y2), int(x1) : int(x2)] - cropped_images.append(cropped_image) - - return cropped_images diff --git a/dimos/robot/agilex/README.md b/dimos/robot/agilex/README.md deleted file mode 100644 index 8342a6045e..0000000000 --- a/dimos/robot/agilex/README.md +++ /dev/null @@ -1,371 +0,0 @@ -# DIMOS Manipulator Robot Development Guide - -This guide explains how to create robot classes, integrate agents, and use the DIMOS module system with LCM transport. - -## Table of Contents -1. [Robot Class Architecture](#robot-class-architecture) -2. [Module System & LCM Transport](#module-system--lcm-transport) -3. [Agent Integration](#agent-integration) -4. 
[Complete Example](#complete-example) - -## Robot Class Architecture - -### Basic Robot Class Structure - -A DIMOS robot class should follow this pattern: - -```python -from typing import Optional, List -from dimos import core -from dimos.types.robot_capabilities import RobotCapability - -class YourRobot: - """Your robot implementation.""" - - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - # Core components - self.dimos = None - self.modules = {} - self.skill_library = SkillLibrary() - - # Define capabilities - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - """Start the robot modules.""" - # Initialize DIMOS with worker count - self.dimos = core.start(2) # Number of workers needed - - # Deploy modules - # ... (see Module System section) - - def stop(self): - """Stop all modules and clean up.""" - # Stop modules - # Close DIMOS - if self.dimos: - self.dimos.close() -``` - -### Key Components Explained - -1. **Initialization**: Store references to modules, skills, and capabilities -2. **Async Start**: Modules must be deployed asynchronously -3. **Proper Cleanup**: Always stop modules before closing DIMOS - -## Module System & LCM Transport - -### Understanding DIMOS Modules - -Modules are the building blocks of DIMOS robots. 
They: -- Process data streams (inputs) -- Produce outputs -- Can be connected together -- Communicate via LCM (Lightweight Communications and Marshalling) - -### Deploying a Module - -```python -# Deploy a camera module -self.camera = self.dimos.deploy( - ZEDModule, # Module class - camera_id=0, # Module parameters - resolution="HD720", - depth_mode="NEURAL", - fps=30, - publish_rate=30.0, - frame_id="camera_frame" -) -``` - -### Setting Up LCM Transport - -LCM transport enables inter-module communication: - -```python -# Enable LCM auto-configuration -from dimos.protocol import pubsub -pubsub.lcm.autoconf() - -# Configure output transport -self.camera.color_image.transport = core.LCMTransport( - "/camera/color_image", # Topic name - Image # Message type -) -self.camera.depth_image.transport = core.LCMTransport( - "/camera/depth_image", - Image -) -``` - -### Connecting Modules - -Connect module outputs to inputs: - -```python -# Connect manipulation module to camera outputs -self.manipulation.rgb_image.connect(self.camera.color_image) -self.manipulation.depth_image.connect(self.camera.depth_image) -self.manipulation.camera_info.connect(self.camera.camera_info) -``` - -### Module Communication Pattern - -``` -┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐ -│ Camera │────────▶│ Manipulation │────────▶│ Visualization│ -│ Module │ Messages│ Module │ Messages│ Output │ -└──────────────┘ └────────────────┘ └──────────────┘ - ▲ ▲ - │ │ - └──────────────────────────┘ - Direct Connection via RPC call -``` - -## Agent Integration - -### Setting Up Agent with Robot - -The run file pattern for agent integration: - -```python -#!/usr/bin/env python3 -import asyncio -import reactivex as rx -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.web.robot_web_interface import RobotWebInterface - -def main(): - # 1. Create and start robot - robot = YourRobot() - asyncio.run(robot.start()) - - # 2. 
Set up skills - skills = robot.get_skills() - skills.add(YourSkill) - skills.create_instance("YourSkill", robot=robot) - - # 3. Set up reactive streams - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # 4. Create web interface - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream}, - audio_subject=rx.subject.Subject() - ) - - # 5. Create agent - agent = ClaudeAgent( - dev_name="your_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query="Your system prompt here", - model_name="claude-3-5-haiku-latest" - ) - - # 6. Connect agent responses - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - # 7. Run interface - web_interface.run() -``` - -### Key Integration Points - -1. **Reactive Streams**: Use RxPy for event-driven communication -2. **Web Interface**: Provides user input/output -3. **Agent**: Processes natural language and executes skills -4. 
**Skills**: Define robot capabilities as executable actions - -## Complete Example - -### Step 1: Create Robot Class (`my_robot.py`) - -```python -import asyncio -from typing import Optional, List -from dimos import core -from dimos.hardware.camera import CameraModule -from dimos.manipulation.module import ManipulationModule -from dimos.skills.skills import SkillLibrary -from dimos.types.robot_capabilities import RobotCapability -from dimos_lcm.sensor_msgs import Image, CameraInfo -from dimos.protocol import pubsub - -class MyRobot: - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - self.dimos = None - self.camera = None - self.manipulation = None - self.skill_library = SkillLibrary() - - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - # Start DIMOS - self.dimos = core.start(2) - - # Enable LCM - pubsub.lcm.autoconf() - - # Deploy camera - self.camera = self.dimos.deploy( - CameraModule, - camera_id=0, - fps=30 - ) - - # Configure camera LCM - self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image) - self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image) - self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo) - - # Deploy manipulation - self.manipulation = self.dimos.deploy(ManipulationModule) - - # Connect modules - self.manipulation.rgb_image.connect(self.camera.color_image) - self.manipulation.depth_image.connect(self.camera.depth_image) - self.manipulation.camera_info.connect(self.camera.camera_info) - - # Configure manipulation output - self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image) - - # Start modules - self.camera.start() - self.manipulation.start() - - await asyncio.sleep(2) # Allow initialization - - def get_skills(self): - return self.skill_library - - def stop(self): - if self.manipulation: - self.manipulation.stop() - if self.camera: 
- self.camera.stop() - if self.dimos: - self.dimos.close() -``` - -### Step 2: Create Run Script (`run.py`) - -```python -#!/usr/bin/env python3 -import asyncio -import os -from my_robot import MyRobot -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.skills.basic import BasicSkill -from dimos.web.robot_web_interface import RobotWebInterface -import reactivex as rx -import reactivex.operators as ops - -SYSTEM_PROMPT = """You are a helpful robot assistant.""" - -def main(): - # Check API key - if not os.getenv("ANTHROPIC_API_KEY"): - print("Please set ANTHROPIC_API_KEY") - return - - # Create robot - robot = MyRobot() - - try: - # Start robot - asyncio.run(robot.start()) - - # Set up skills - skills = robot.get_skills() - skills.add(BasicSkill) - skills.create_instance("BasicSkill", robot=robot) - - # Set up streams - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # Create web interface - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream} - ) - - # Create agent - agent = ClaudeAgent( - dev_name="my_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query=SYSTEM_PROMPT - ) - - # Connect responses - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - print("Robot ready at http://localhost:5555") - - # Run - web_interface.run() - - finally: - robot.stop() - -if __name__ == "__main__": - main() -``` - -### Step 3: Define Skills (`skills.py`) - -```python -from dimos.skills import Skill, skill - -@skill( - description="Perform a basic action", - parameters={ - "action": "The action to perform" - } -) -class BasicSkill(Skill): - def __init__(self, robot): - self.robot = robot - - def run(self, action: str): - # Implement skill logic - return f"Performed: {action}" -``` - -## Best Practices - -1. 
**Module Lifecycle**: Always start DIMOS before deploying modules -2. **LCM Topics**: Use descriptive topic names with namespaces -3. **Error Handling**: Wrap module operations in try-except blocks -4. **Resource Cleanup**: Ensure proper cleanup in stop() methods -5. **Async Operations**: Use asyncio for non-blocking operations -6. **Stream Management**: Use RxPy for reactive programming patterns - -## Debugging Tips - -1. **Check Module Status**: Print module.io().result() to see connections -2. **Monitor LCM**: Use Foxglove to visualize LCM messages -3. **Log Everything**: Use dimos.utils.logging_config.setup_logger() -4. **Test Modules Independently**: Deploy and test one module at a time - -## Common Issues - -1. **"Module not started"**: Ensure start() is called after deployment -2. **"No data received"**: Check LCM transport configuration -3. **"Connection failed"**: Verify input/output types match -4. **"Cleanup errors"**: Stop modules before closing DIMOS diff --git a/dimos/robot/agilex/README_CN.md b/dimos/robot/agilex/README_CN.md deleted file mode 100644 index a8d79ebec1..0000000000 --- a/dimos/robot/agilex/README_CN.md +++ /dev/null @@ -1,465 +0,0 @@ -# DIMOS 机械臂机器人开发指南 - -本指南介绍如何创建机器人类、集成智能体(Agent)以及使用 DIMOS 模块系统和 LCM 传输。 - -## 目录 -1. [机器人类架构](#机器人类架构) -2. [模块系统与 LCM 传输](#模块系统与-lcm-传输) -3. [智能体集成](#智能体集成) -4. [完整示例](#完整示例) - -## 机器人类架构 - -### 基本机器人类结构 - -DIMOS 机器人类应遵循以下模式: - -```python -from typing import Optional, List -from dimos import core -from dimos.types.robot_capabilities import RobotCapability - -class YourRobot: - """您的机器人实现。""" - - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - # 核心组件 - self.dimos = None - self.modules = {} - self.skill_library = SkillLibrary() - - # 定义能力 - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - """启动机器人模块。""" - # 初始化 DIMOS,指定工作线程数 - self.dimos = core.start(2) # 需要的工作线程数 - - # 部署模块 - # ... 
(参见模块系统章节) - - def stop(self): - """停止所有模块并清理资源。""" - # 停止模块 - # 关闭 DIMOS - if self.dimos: - self.dimos.close() -``` - -### 关键组件说明 - -1. **初始化**:存储模块、技能和能力的引用 -2. **异步启动**:模块必须异步部署 -3. **正确清理**:在关闭 DIMOS 之前始终停止模块 - -## 模块系统与 LCM 传输 - -### 理解 DIMOS 模块 - -模块是 DIMOS 机器人的构建块。它们: -- 处理数据流(输入) -- 产生输出 -- 可以相互连接 -- 通过 LCM(轻量级通信和编组)进行通信 - -### 部署模块 - -```python -# 部署相机模块 -self.camera = self.dimos.deploy( - ZEDModule, # 模块类 - camera_id=0, # 模块参数 - resolution="HD720", - depth_mode="NEURAL", - fps=30, - publish_rate=30.0, - frame_id="camera_frame" -) -``` - -### 设置 LCM 传输 - -LCM 传输实现模块间通信: - -```python -# 启用 LCM 自动配置 -from dimos.protocol import pubsub -pubsub.lcm.autoconf() - -# 配置输出传输 -self.camera.color_image.transport = core.LCMTransport( - "/camera/color_image", # 主题名称 - Image # 消息类型 -) -self.camera.depth_image.transport = core.LCMTransport( - "/camera/depth_image", - Image -) -``` - -### 连接模块 - -将模块输出连接到输入: - -```python -# 将操作模块连接到相机输出 -self.manipulation.rgb_image.connect(self.camera.color_image) # ROS set_callback -self.manipulation.depth_image.connect(self.camera.depth_image) -self.manipulation.camera_info.connect(self.camera.camera_info) -``` - -### 模块通信模式 - -``` -┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐ -│ 相机模块 │────────▶│ 操作模块 │────────▶│ 可视化输出 │ -│ │ 消息 │ │ 消息 │ │ -└──────────────┘ └────────────────┘ └──────────────┘ - ▲ ▲ - │ │ - └──────────────────────────┘ - 直接连接(RPC指令) -``` - -## 智能体集成 - -### 设置智能体与机器人 - -运行文件的智能体集成模式: - -```python -#!/usr/bin/env python3 -import asyncio -import reactivex as rx -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.web.robot_web_interface import RobotWebInterface - -def main(): - # 1. 创建并启动机器人 - robot = YourRobot() - asyncio.run(robot.start()) - - # 2. 设置技能 - skills = robot.get_skills() - skills.add(YourSkill) - skills.create_instance("YourSkill", robot=robot) - - # 3. 
设置响应式流 - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # 4. 创建 Web 界面 - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream}, - audio_subject=rx.subject.Subject() - ) - - # 5. 创建智能体 - agent = ClaudeAgent( - dev_name="your_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query="您的系统提示词", - model_name="claude-3-5-haiku-latest" - ) - - # 6. 连接智能体响应 - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - # 7. 运行界面 - web_interface.run() -``` - -### 关键集成点 - -1. **响应式流**:使用 RxPy 进行事件驱动通信 -2. **Web 界面**:提供用户输入/输出 -3. **智能体**:处理自然语言并执行技能 -4. **技能**:将机器人能力定义为可执行动作 - -## 完整示例 - -### 步骤 1:创建机器人类(`my_robot.py`) - -```python -import asyncio -from typing import Optional, List -from dimos import core -from dimos.hardware.camera import CameraModule -from dimos.manipulation.module import ManipulationModule -from dimos.skills.skills import SkillLibrary -from dimos.types.robot_capabilities import RobotCapability -from dimos_lcm.sensor_msgs import Image, CameraInfo -from dimos.protocol import pubsub - -class MyRobot: - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - self.dimos = None - self.camera = None - self.manipulation = None - self.skill_library = SkillLibrary() - - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - # 启动 DIMOS - self.dimos = core.start(2) - - # 启用 LCM - pubsub.lcm.autoconf() - - # 部署相机 - self.camera = self.dimos.deploy( - CameraModule, - camera_id=0, - fps=30 - ) - - # 配置相机 LCM - self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image) - self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image) - self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo) - - # 部署操作模块 - self.manipulation = 
self.dimos.deploy(ManipulationModule) - - # 连接模块 - self.manipulation.rgb_image.connect(self.camera.color_image) - self.manipulation.depth_image.connect(self.camera.depth_image) - self.manipulation.camera_info.connect(self.camera.camera_info) - - # 配置操作输出 - self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image) - - # 启动模块 - self.camera.start() - self.manipulation.start() - - await asyncio.sleep(2) # 允许初始化 - - def get_skills(self): - return self.skill_library - - def stop(self): - if self.manipulation: - self.manipulation.stop() - if self.camera: - self.camera.stop() - if self.dimos: - self.dimos.close() -``` - -### 步骤 2:创建运行脚本(`run.py`) - -```python -#!/usr/bin/env python3 -import asyncio -import os -from my_robot import MyRobot -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.skills.basic import BasicSkill -from dimos.web.robot_web_interface import RobotWebInterface -import reactivex as rx -import reactivex.operators as ops - -SYSTEM_PROMPT = """您是一个有用的机器人助手。""" - -def main(): - # 检查 API 密钥 - if not os.getenv("ANTHROPIC_API_KEY"): - print("请设置 ANTHROPIC_API_KEY") - return - - # 创建机器人 - robot = MyRobot() - - try: - # 启动机器人 - asyncio.run(robot.start()) - - # 设置技能 - skills = robot.get_skills() - skills.add(BasicSkill) - skills.create_instance("BasicSkill", robot=robot) - - # 设置流 - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # 创建 Web 界面 - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream} - ) - - # 创建智能体 - agent = ClaudeAgent( - dev_name="my_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query=SYSTEM_PROMPT - ) - - # 连接响应 - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - print("机器人就绪,访问 http://localhost:5555") - - # 运行 - web_interface.run() - - finally: - robot.stop() - -if __name__ == "__main__": - main() -``` 
- -### 步骤 3:定义技能(`skills.py`) - -```python -from dimos.skills import Skill, skill - -@skill( - description="执行一个基本动作", - parameters={ - "action": "要执行的动作" - } -) -class BasicSkill(Skill): - def __init__(self, robot): - self.robot = robot - - def run(self, action: str): - # 实现技能逻辑 - return f"已执行:{action}" -``` - -## 最佳实践 - -1. **模块生命周期**:在部署模块之前始终先启动 DIMOS -2. **LCM 主题**:使用带命名空间的描述性主题名称 -3. **错误处理**:用 try-except 块包装模块操作 -4. **资源清理**:确保在 stop() 方法中正确清理 -5. **异步操作**:使用 asyncio 进行非阻塞操作 -6. **流管理**:使用 RxPy 进行响应式编程模式 - -## 调试技巧 - -1. **检查模块状态**:打印 module.io().result() 查看连接 -2. **监控 LCM**:使用 Foxglove 可视化 LCM 消息 -3. **记录一切**:使用 dimos.utils.logging_config.setup_logger() -4. **独立测试模块**:一次部署和测试一个模块 - -## 常见问题 - -1. **"模块未启动"**:确保在部署后调用 start() -2. **"未收到数据"**:检查 LCM 传输配置 -3. **"连接失败"**:验证输入/输出类型是否匹配 -4. **"清理错误"**:在关闭 DIMOS 之前停止模块 - -## 高级主题 - -### 自定义模块开发 - -创建自定义模块的基本结构: - -```python -from dimos.core import Module, In, Out, rpc - -class CustomModule(Module): - # 定义输入 - input_data: In[DataType] - - # 定义输出 - output_data: Out[DataType] - - def __init__(self, param1, param2, **kwargs): - super().__init__(**kwargs) - self.param1 = param1 - self.param2 = param2 - - @rpc - def start(self): - """启动模块处理。""" - self.input_data.subscribe(self._process_data) - - def _process_data(self, data): - """处理输入数据。""" - # 处理逻辑 - result = self.process(data) - # 发布输出 - self.output_data.publish(result) - - @rpc - def stop(self): - """停止模块。""" - # 清理资源 - pass -``` - -### 技能开发指南 - -技能是机器人可执行的高级动作: - -```python -from dimos.skills import Skill, skill -from typing import Optional - -@skill( - description="复杂操作技能", - parameters={ - "target": "目标对象", - "location": "目标位置" - } -) -class ComplexSkill(Skill): - def __init__(self, robot, **kwargs): - super().__init__(**kwargs) - self.robot = robot - - def run(self, target: str, location: Optional[str] = None): - """执行技能逻辑。""" - try: - # 1. 感知阶段 - object_info = self.robot.detect_object(target) - - # 2. 
规划阶段 - if location: - plan = self.robot.plan_movement(object_info, location) - - # 3. 执行阶段 - result = self.robot.execute_plan(plan) - - return { - "success": True, - "message": f"成功移动 {target} 到 {location}" - } - - except Exception as e: - return { - "success": False, - "error": str(e) - } -``` - -### 性能优化 - -1. **并行处理**:使用多个工作线程处理不同模块 -2. **数据缓冲**:为高频数据流实现缓冲机制 -3. **延迟加载**:仅在需要时初始化重型模块 -4. **资源池化**:重用昂贵的资源(如神经网络模型) - -希望本指南能帮助您快速上手 DIMOS 机器人开发! diff --git a/dimos/robot/agilex/piper_arm.py b/dimos/robot/agilex/piper_arm.py deleted file mode 100644 index 29624b9a4c..0000000000 --- a/dimos/robot/agilex/piper_arm.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio - -# Import LCM message types -from dimos_lcm.sensor_msgs import CameraInfo - -from dimos import core -from dimos.hardware.sensors.camera.zed import ZEDModule -from dimos.manipulation.visual_servoing.manipulation_module import ManipulationModule -from dimos.msgs.sensor_msgs import Image -from dimos.protocol import pubsub -from dimos.robot.foxglove_bridge import FoxgloveBridge -from dimos.robot.robot import Robot -from dimos.skills.skills import SkillLibrary -from dimos.types.robot_capabilities import RobotCapability -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class PiperArmRobot(Robot): - """Piper Arm robot with ZED camera and manipulation capabilities.""" - - def __init__(self, robot_capabilities: list[RobotCapability] | None = None) -> None: - super().__init__() - self.dimos = None - self.stereo_camera = None - self.manipulation_interface = None - self.skill_library = SkillLibrary() # type: ignore[assignment] - - # Initialize capabilities - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self) -> None: - """Start the robot modules.""" - # Start Dimos - self.dimos = core.start(2) # type: ignore[assignment] # Need 2 workers for ZED and manipulation modules - self.foxglove_bridge = FoxgloveBridge() - - # Enable LCM auto-configuration - pubsub.lcm.autoconf() # type: ignore[attr-defined] - - # Deploy ZED module - logger.info("Deploying ZED module...") - self.stereo_camera = self.dimos.deploy( # type: ignore[attr-defined] - ZEDModule, - camera_id=0, - resolution="HD720", - depth_mode="NEURAL", - fps=30, - enable_tracking=False, # We don't need tracking for manipulation - publish_rate=30.0, - frame_id="zed_camera", - ) - - # Configure ZED LCM transports - self.stereo_camera.color_image.transport = core.LCMTransport("/zed/color_image", Image) # type: ignore[attr-defined] - self.stereo_camera.depth_image.transport = 
core.LCMTransport("/zed/depth_image", Image) # type: ignore[attr-defined] - self.stereo_camera.camera_info.transport = core.LCMTransport("/zed/camera_info", CameraInfo) # type: ignore[attr-defined] - - # Deploy manipulation module - logger.info("Deploying manipulation module...") - self.manipulation_interface = self.dimos.deploy(ManipulationModule) # type: ignore[attr-defined] - - # Connect manipulation inputs to ZED outputs - self.manipulation_interface.rgb_image.connect(self.stereo_camera.color_image) # type: ignore[attr-defined] - self.manipulation_interface.depth_image.connect(self.stereo_camera.depth_image) # type: ignore[attr-defined] - self.manipulation_interface.camera_info.connect(self.stereo_camera.camera_info) # type: ignore[attr-defined] - - # Configure manipulation output - self.manipulation_interface.viz_image.transport = core.LCMTransport( # type: ignore[attr-defined] - "/manipulation/viz", Image - ) - - # Print module info - logger.info("Modules configured:") - print("\nZED Module:") - print(self.stereo_camera.io()) # type: ignore[attr-defined] - print("\nManipulation Module:") - print(self.manipulation_interface.io()) # type: ignore[attr-defined] - - # Start modules - logger.info("Starting modules...") - self.foxglove_bridge.start() - self.stereo_camera.start() # type: ignore[attr-defined] - self.manipulation_interface.start() # type: ignore[attr-defined] - - # Give modules time to initialize - await asyncio.sleep(2) - - logger.info("PiperArmRobot initialized and started") - - def pick_and_place( # type: ignore[no-untyped-def] - self, pick_x: int, pick_y: int, place_x: int | None = None, place_y: int | None = None - ): - """Execute pick and place task. 
- - Args: - pick_x: X coordinate for pick location - pick_y: Y coordinate for pick location - place_x: X coordinate for place location (optional) - place_y: Y coordinate for place location (optional) - - Returns: - Result of the pick and place operation - """ - if self.manipulation_interface: - return self.manipulation_interface.pick_and_place(pick_x, pick_y, place_x, place_y) - else: - logger.error("Manipulation module not initialized") - return False - - def handle_keyboard_command(self, key: str): # type: ignore[no-untyped-def] - """Pass keyboard commands to manipulation module. - - Args: - key: Keyboard key pressed - - Returns: - Action taken or None - """ - if self.manipulation_interface: - return self.manipulation_interface.handle_keyboard_command(key) - else: - logger.error("Manipulation module not initialized") - return None - - def stop(self) -> None: - """Stop all modules and clean up.""" - logger.info("Stopping PiperArmRobot...") - - try: - if self.manipulation_interface: - self.manipulation_interface.stop() - - if self.stereo_camera: - self.stereo_camera.stop() - except Exception as e: - logger.warning(f"Error stopping modules: {e}") - - # Close dimos last to ensure workers are available for cleanup - if self.dimos: - self.dimos.close() - - logger.info("PiperArmRobot stopped") - - -async def run_piper_arm() -> None: - """Run the Piper Arm robot.""" - robot = PiperArmRobot() # type: ignore[abstract] - - await robot.start() - - # Keep the robot running - try: - while True: - await asyncio.sleep(1) - except KeyboardInterrupt: - logger.info("Keyboard interrupt received") - finally: - await robot.stop() # type: ignore[func-returns-value] - - -if __name__ == "__main__": - asyncio.run(run_piper_arm()) diff --git a/dimos/robot/agilex/run.py b/dimos/robot/agilex/run.py deleted file mode 100644 index 64e0ae5470..0000000000 --- a/dimos/robot/agilex/run.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025-2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Run script for Piper Arm robot with Claude agent integration. -Provides manipulation capabilities with natural language interface. -""" - -import asyncio -import os -import sys - -from dotenv import load_dotenv -import reactivex as rx -import reactivex.operators as ops - -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.robot.agilex.piper_arm import PiperArmRobot -from dimos.skills.kill_skill import KillSkill -from dimos.skills.manipulation.pick_and_place import PickAndPlace -from dimos.stream.audio.pipelines import stt, tts -from dimos.utils.logging_config import setup_logger -from dimos.web.robot_web_interface import RobotWebInterface - -logger = setup_logger() - -# Load environment variables -load_dotenv() - -# System prompt for the Piper Arm manipulation agent -SYSTEM_PROMPT = """You are an intelligent robotic assistant controlling a Piper Arm robot with advanced manipulation capabilities. Your primary role is to help users with pick and place tasks using natural language understanding. - -## Your Capabilities: -1. **Visual Perception**: You have access to a ZED stereo camera that provides RGB and depth information -2. **Object Manipulation**: You can pick up and place objects using a 6-DOF robotic arm with a gripper -3. 
**Language Understanding**: You use the Qwen vision-language model to identify objects and locations from natural language descriptions - -## Available Skills: -- **PickAndPlace**: Execute pick and place operations based on object and location descriptions - - Pick only: "Pick up the red mug" - - Pick and place: "Move the book to the shelf" -- **KillSkill**: Stop any currently running skill - -## Guidelines: -1. **Safety First**: Always ensure safe operation. If unsure about an object's graspability or a placement location's stability, ask for clarification -2. **Clear Communication**: Explain what you're doing and ask for confirmation when needed -3. **Error Handling**: If a task fails, explain why and suggest alternatives -4. **Precision**: When users give specific object descriptions, use them exactly as provided to the vision model - -## Interaction Examples: -- User: "Pick up the coffee mug" - You: "I'll pick up the coffee mug for you." [Execute PickAndPlace with object_query="coffee mug"] - -- User: "Put the toy on the table" - You: "I'll place the toy on the table." [Execute PickAndPlace with object_query="toy", target_query="on the table"] - -- User: "What do you see?" - -Remember: You're here to assist with manipulation tasks. 
Be helpful, precise, and always prioritize safe operation of the robot.""" - - -def main(): # type: ignore[no-untyped-def] - """Main entry point.""" - print("\n" + "=" * 60) - print("Piper Arm Robot with Claude Agent") - print("=" * 60) - print("\nThis system integrates:") - print(" - Piper Arm 6-DOF robot") - print(" - ZED stereo camera") - print(" - Claude AI for natural language understanding") - print(" - Qwen VLM for visual object detection") - print(" - Web interface with text and voice input") - print(" - Foxglove visualization via LCM") - print("\nStarting system...\n") - - # Check for API key - if not os.getenv("ANTHROPIC_API_KEY"): - print("WARNING: ANTHROPIC_API_KEY not found in environment") - print("Please set your API key in .env file or environment") - sys.exit(1) - - logger.info("Starting Piper Arm Robot with Agent") - - # Create robot instance - robot = PiperArmRobot() # type: ignore[abstract] - - try: - # Start the robot (this is async, so we need asyncio.run) - logger.info("Initializing robot...") - asyncio.run(robot.start()) - logger.info("Robot initialized successfully") - - # Set up skill library - skills = robot.get_skills() # type: ignore[no-untyped-call] - skills.add(PickAndPlace) - skills.add(KillSkill) - - # Create skill instances - skills.create_instance("PickAndPlace", robot=robot) - skills.create_instance("KillSkill", robot=robot, skill_library=skills) - - logger.info(f"Skills registered: {[skill.__name__ for skill in skills.get_class_skills()]}") - - # Set up streams for agent and web interface - agent_response_subject = rx.subject.Subject() # type: ignore[var-annotated] - agent_response_stream = agent_response_subject.pipe(ops.share()) - audio_subject = rx.subject.Subject() # type: ignore[var-annotated] - - # Set up streams for web interface - streams = {} # type: ignore[var-annotated] - - text_streams = { - "agent_responses": agent_response_stream, - } - - # Create web interface first (needed for agent) - try: - web_interface = 
RobotWebInterface( - port=5555, text_streams=text_streams, audio_subject=audio_subject, **streams - ) - logger.info("Web interface created successfully") - except Exception as e: - logger.error(f"Failed to create web interface: {e}") - raise - - # Set up speech-to-text - stt_node = stt() # type: ignore[no-untyped-call] - stt_node.consume_audio(audio_subject.pipe(ops.share())) - - # Create Claude agent - agent = ClaudeAgent( - dev_name="piper_arm_agent", - input_query_stream=web_interface.query_stream, # Use text input from web interface - # input_query_stream=stt_node.emit_text(), # Uncomment to use voice input - skills=skills, - system_query=SYSTEM_PROMPT, - model_name="claude-3-5-haiku-latest", - thinking_budget_tokens=0, - max_output_tokens_per_request=4096, - ) - - # Subscribe to agent responses - agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x)) - - # Set up text-to-speech for agent responses - tts_node = tts() # type: ignore[no-untyped-call] - tts_node.consume_text(agent.get_response_observable()) - - logger.info("=" * 60) - logger.info("Piper Arm Agent Ready!") - logger.info("Web interface available at: http://localhost:5555") - logger.info("Foxglove visualization available at: ws://localhost:8765") - logger.info("You can:") - logger.info(" - Type commands in the web interface") - logger.info(" - Use voice commands") - logger.info(" - Ask the robot to pick up objects") - logger.info(" - Ask the robot to move objects to locations") - logger.info("=" * 60) - - # Run web interface (this blocks) - web_interface.run() - - except KeyboardInterrupt: - logger.info("Keyboard interrupt received") - except Exception as e: - logger.error(f"Error running robot: {e}") - import traceback - - traceback.print_exc() - finally: - logger.info("Shutting down...") - # Stop the robot (this is also async) - robot.stop() - logger.info("Robot stopped") - - -if __name__ == "__main__": - main() # type: ignore[no-untyped-call] diff --git 
a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 05658e11c2..6dfa077498 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -49,9 +49,10 @@ # Demo blueprints "demo-camera": "dimos.hardware.sensors.camera.module:demo_camera", "demo-osm": "dimos.mapping.osm.demo_osm:demo_osm", - "demo-skill": "dimos.agents.skills.demo_skill:demo_skill", - "demo-gps-nav": "dimos.agents.skills.demo_gps_nav:demo_gps_nav_skill", - "demo-google-maps-skill": "dimos.agents.skills.demo_google_maps_skill:demo_google_maps_skill", + "demo-skill": "dimos.agents2.skills.demo_skill:demo_skill", + "demo-gps-nav": "dimos.agents2.skills.demo_gps_nav:demo_gps_nav_skill", + "demo-google-maps-skill": "dimos.agents2.skills.demo_google_maps_skill:demo_google_maps_skill", + "demo-object-scene-registration": "dimos.perception.demo_object_scene_registration:demo_object_scene_registration", "demo-remapping": "dimos.robot.unitree_webrtc.demo_remapping:remapping", "demo-remapping-transport": "dimos.robot.unitree_webrtc.demo_remapping:remapping_and_transport", "demo-error-on-name-conflicts": "dimos.robot.unitree_webrtc.demo_error_on_name_conflicts:blueprint", diff --git a/docs/depth_camera_integration.md b/docs/depth_camera_integration.md new file mode 100644 index 0000000000..1c8760d5cb --- /dev/null +++ b/docs/depth_camera_integration.md @@ -0,0 +1,147 @@ +# Depth Camera Integration Guide + +This folder contains camera drivers and modules for RGB-D (depth) cameras such as RealSense and ZED. +Use this guide to add a new depth camera, wire TF correctly, and publish the required streams. + +## Add a New Depth Camera + +1) **Create a new driver module** + - Path: `dimos/dimos/hardware/sensors/camera//camera.py` + - Export a blueprint in `/__init__.py` (match the `realsense` / `zed` pattern). 
+ +2) **Define config** + - Inherit from `ModuleConfig` and `DepthCameraConfig`: + ```python + @dataclass + class MyDepthCameraConfig(ModuleConfig, DepthCameraConfig): + width: int = 1280 + height: int = 720 + fps: int = 15 + camera_name: str = "camera" + base_frame_id: str = "base_link" + base_transform: Transform | None = field(default_factory=default_base_transform) + align_depth_to_color: bool = True + enable_depth: bool = True + enable_pointcloud: bool = False + pointcloud_fps: float = 5.0 + camera_info_fps: float = 1.0 + ``` + +3) **Implement the module** + - Inherit from `DepthCameraHardware` and `Module` (see `RealSenseCamera` / `ZEDCamera`). + - Provide these outputs (matching `RealSenseCamera` / `ZEDCamera`): + - `color_image: Out[Image]` + - `depth_image: Out[Image]` + - `pointcloud: Out[PointCloud2]` (optional, can be disabled by config) + - `camera_info: Out[CameraInfo]` + - `depth_camera_info: Out[CameraInfo]` + - Implement RPCs: + - `start()` / `stop()` + - `get_color_camera_info()` / `get_depth_camera_info()` + - `get_depth_scale()` (meters per depth unit) + +4) **Publish frames** + - Color images: `Image(format=ImageFormat.RGB, frame_id=_color_optical_frame)` + - Depth images: + - If `align_depth_to_color`: use `_color_optical_frame` + - Else: use `_depth_optical_frame` + - CameraInfo frame_id must match the image frame_id you publish. + +5) **Publish camera info** + - Build `CameraInfo` from camera intrinsics. + - Publish at `camera_info_fps`. + +6) **Publish pointcloud (optional)** + - Use `PointCloud2.from_rgbd(color_image, depth_image, camera_info, depth_scale)`. + - Publish at `pointcloud_fps`. + +## TF: Required Frames and Transforms + +Frame names are defined by the abstract depth camera spec (`dimos/dimos/hardware/sensors/camera/spec.py`). 
+Use the properties below to ensure consistent naming: + +- `_camera_link`: base link for the camera module (usually `{camera_name}_link`) +- `_color_frame`: non-optical color frame +- `_color_optical_frame`: optical color frame +- `_depth_frame`: non-optical depth frame +- `_depth_optical_frame`: optical depth frame + +Recommended transform chain (publish every frame or at your preferred TF rate): + +1) **Mounting transform** (from config): + - `base_frame_id -> _camera_link` + - Use `config.base_transform` if provided + +2) **Depth frame** + - `_camera_link -> _depth_frame` (identity unless the camera provides extrinsics) + - `_depth_frame -> _depth_optical_frame` using `OPTICAL_ROTATION` + +3) **Color frame** + - `_camera_link -> _color_frame` (from extrinsics, or identity if unavailable) + - `_color_frame -> _color_optical_frame` using `OPTICAL_ROTATION` + +Notes: +- If you align depth to color, keep TFs the same but publish depth images in `_color_optical_frame`. +- Ensure `color_image.frame_id` and `camera_info.frame_id` match; same for depth. + +## Required Streams / Topics + +Use these stream names in your module and attach transports as needed. +Default LCM topics in `realsense` / `zed` demos are shown below. 
+ +| Stream name | Type | Suggested topic | Frame_id source | +|-------------------|--------------|-------------------------|-----------------| +| `color_image` | `Image` | `/camera/color` | `_color_optical_frame` | +| `depth_image` | `Image` | `/camera/depth` | `_depth_optical_frame` or `_color_optical_frame` | +| `pointcloud` | `PointCloud2`| `/camera/pointcloud` | (derived from CameraInfo) | +| `camera_info` | `CameraInfo` | `/camera/color_info` | matches `color_image` | +| `depth_camera_info` | `CameraInfo` | `/camera/depth_info` | matches `depth_image` | + +For `ObjectSceneRegistrationModule`, the required inputs are: +- `color_image` +- `depth_image` +- `camera_info` +- TF tree resolving `target_frame` to `color_image.frame_id` + +## Object Scene Registration (Brief Overview) + +`ObjectSceneRegistrationModule` consumes synchronized RGB + depth + camera intrinsics and produces: +- 2D detections (YOLO‑E) +- 3D detections (projected via depth + intrinsics + TF) +- Overlay annotations and aggregated pointclouds + +See: +- `dimos/dimos/perception/object_scene_registration.py` +- `dimos/dimos/perception/demo_object_scene_registration.py` + +Quick wiring example: + +```python +from dimos.core.blueprints import autoconnect +from dimos.hardware.sensors.camera.realsense import realsense_camera +from dimos.perception.object_scene_registration import object_scene_registration_module + +pipeline = autoconnect( + realsense_camera(enable_pointcloud=False), + object_scene_registration_module(target_frame="world"), +) +``` + +Run the demo via CLI: +```bash +dimos run demo-object-scene-registration +``` + +## Foxglove (Viewer) + +Install Foxglove from: +- https://foxglove.dev/download + +## Modules and Skills (Short Intro) + +- **Modules** are typed components with `In[...]` / `Out[...]` streams and `start()` / `stop()` lifecycles. +- **Skills** are callable methods (decorated with `@skill`) exposed by `SkillModule` for agents. 
+ +Reference: +- Modules overview: `dimos/docs/concepts/modules.md` +- TF fundamentals: `dimos/docs/api/transforms.md`