From 4383737bfe2a5622d8d10fdc460737c7f8727849 Mon Sep 17 00:00:00 2001 From: Alex Lin Date: Tue, 15 Jul 2025 19:16:20 -0700 Subject: [PATCH 1/7] Point Cloud Filtering and Segmentation, Full 6DOF Object pose estimation, Grasp generation, ZED driver support, Hosted grasp integration --- dimos/hardware/zed_camera.py | 313 ++++ dimos/perception/common/__init__.py | 2 +- dimos/perception/common/cuboid_fit.py | 331 ---- dimos/perception/common/utils.py | 331 ++++ dimos/perception/grasp_generation/__init__.py | 1 + .../grasp_generation/grasp_generation.py | 228 +++ dimos/perception/grasp_generation/utils.py | 621 ++++++++ dimos/perception/manip_aio_pipeline.py | 590 +++++++ dimos/perception/manip_aio_processer.py | 411 +++++ dimos/perception/object_detection_stream.py | 67 +- dimos/perception/pointcloud/__init__.py | 3 + dimos/perception/pointcloud/cuboid_fit.py | 414 +++++ .../pointcloud/pointcloud_filtering.py | 674 ++++++++ dimos/perception/pointcloud/utils.py | 1367 +++++++++++++++++ dimos/perception/segmentation/sam_2d_seg.py | 17 +- dimos/perception/segmentation/utils.py | 6 + dimos/stream/stereo_camera_streams/zed.py | 164 ++ dimos/types/manipulation.py | 19 +- ...est_manipulation_perception_pipeline.py.py | 167 ++ ...test_manipulation_pipeline_single_frame.py | 248 +++ ..._manipulation_pipeline_single_frame_lcm.py | 431 ++++++ tests/test_pointcloud_filtering.py | 105 ++ tests/test_zed_setup.py | 182 +++ tests/visualization_script.py | 1041 +++++++++++++ tests/zed_neural_depth_demo.py | 450 ++++++ 25 files changed, 7783 insertions(+), 400 deletions(-) create mode 100644 dimos/hardware/zed_camera.py delete mode 100644 dimos/perception/common/cuboid_fit.py create mode 100644 dimos/perception/common/utils.py create mode 100644 dimos/perception/grasp_generation/__init__.py create mode 100644 dimos/perception/grasp_generation/grasp_generation.py create mode 100644 dimos/perception/grasp_generation/utils.py create mode 100644 dimos/perception/manip_aio_pipeline.py create 
mode 100644 dimos/perception/manip_aio_processer.py create mode 100644 dimos/perception/pointcloud/__init__.py create mode 100644 dimos/perception/pointcloud/cuboid_fit.py create mode 100644 dimos/perception/pointcloud/pointcloud_filtering.py create mode 100644 dimos/perception/pointcloud/utils.py create mode 100644 dimos/stream/stereo_camera_streams/zed.py create mode 100644 tests/test_manipulation_perception_pipeline.py.py create mode 100644 tests/test_manipulation_pipeline_single_frame.py create mode 100644 tests/test_manipulation_pipeline_single_frame_lcm.py create mode 100644 tests/test_pointcloud_filtering.py create mode 100755 tests/test_zed_setup.py create mode 100644 tests/visualization_script.py create mode 100755 tests/zed_neural_depth_demo.py diff --git a/dimos/hardware/zed_camera.py b/dimos/hardware/zed_camera.py new file mode 100644 index 0000000000..b93d2577e6 --- /dev/null +++ b/dimos/hardware/zed_camera.py @@ -0,0 +1,313 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import cv2 +import open3d as o3d +from typing import Optional, Tuple, Dict, Any +import logging + +try: + import pyzed.sl as sl +except ImportError: + sl = None + logging.warning("ZED SDK not found. 
Please install pyzed to use ZED camera functionality.") + +from dimos.hardware.stereo_camera import StereoCamera + +logger = logging.getLogger(__name__) + + +class ZEDCamera(StereoCamera): + """ZED Camera capture node with neural depth processing.""" + + def __init__( + self, + camera_id: int = 0, + resolution: sl.RESOLUTION = sl.RESOLUTION.HD720, + depth_mode: sl.DEPTH_MODE = sl.DEPTH_MODE.NEURAL, + fps: int = 30, + **kwargs, + ): + """ + Initialize ZED Camera. + + Args: + camera_id: Camera ID (0 for first ZED) + resolution: ZED camera resolution + depth_mode: Depth computation mode + fps: Camera frame rate (default: 30) + """ + if sl is None: + raise ImportError("ZED SDK not installed. Please install pyzed package.") + + super().__init__(**kwargs) + + self.camera_id = camera_id + self.resolution = resolution + self.depth_mode = depth_mode + self.fps = fps + + # Initialize ZED camera + self.zed = sl.Camera() + self.init_params = sl.InitParameters() + self.init_params.camera_resolution = resolution + self.init_params.depth_mode = depth_mode + self.init_params.coordinate_units = sl.UNIT.METER + self.init_params.camera_fps = fps + + # Set camera ID using the correct parameter name + if hasattr(self.init_params, "set_from_camera_id"): + self.init_params.set_from_camera_id(camera_id) + elif hasattr(self.init_params, "input"): + self.init_params.input.set_from_camera_id(camera_id) + + # Use enable_fill_mode instead of SENSING_MODE.STANDARD + self.runtime_params = sl.RuntimeParameters() + self.runtime_params.enable_fill_mode = True # False = STANDARD mode, True = FILL mode + + # Image containers + self.image_left = sl.Mat() + self.image_right = sl.Mat() + self.depth_map = sl.Mat() + self.point_cloud = sl.Mat() + self.confidence_map = sl.Mat() + + self.is_opened = False + + def open(self) -> bool: + """Open the ZED camera.""" + try: + err = self.zed.open(self.init_params) + if err != sl.ERROR_CODE.SUCCESS: + logger.error(f"Failed to open ZED camera: {err}") + return False 
+ + self.is_opened = True + logger.info("ZED camera opened successfully") + + # Get camera information + info = self.zed.get_camera_information() + logger.info(f"ZED Camera Model: {info.camera_model}") + logger.info(f"Serial Number: {info.serial_number}") + logger.info(f"Firmware: {info.camera_configuration.firmware_version}") + + return True + + except Exception as e: + logger.error(f"Error opening ZED camera: {e}") + return False + + def close(self): + """Close the ZED camera.""" + if self.is_opened: + self.zed.close() + self.is_opened = False + logger.info("ZED camera closed") + + def capture_frame( + self, + ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: + """ + Capture a frame from ZED camera. + + Returns: + Tuple of (left_image, right_image, depth_map) as numpy arrays + """ + if not self.is_opened: + logger.error("ZED camera not opened") + return None, None, None + + try: + # Grab frame + if self.zed.grab(self.runtime_params) == sl.ERROR_CODE.SUCCESS: + # Retrieve left image + self.zed.retrieve_image(self.image_left, sl.VIEW.LEFT) + left_img = self.image_left.get_data()[:, :, :3] # Remove alpha channel + + # Retrieve right image + self.zed.retrieve_image(self.image_right, sl.VIEW.RIGHT) + right_img = self.image_right.get_data()[:, :, :3] # Remove alpha channel + + # Retrieve depth map + self.zed.retrieve_measure(self.depth_map, sl.MEASURE.DEPTH) + depth = self.depth_map.get_data() + + return left_img, right_img, depth + else: + logger.warning("Failed to grab frame from ZED camera") + return None, None, None + + except Exception as e: + logger.error(f"Error capturing frame: {e}") + return None, None, None + + def capture_pointcloud(self) -> Optional[o3d.geometry.PointCloud]: + """ + Capture point cloud from ZED camera. 
+ + Returns: + Open3D point cloud with XYZ coordinates and RGB colors + """ + if not self.is_opened: + logger.error("ZED camera not opened") + return None + + try: + if self.zed.grab(self.runtime_params) == sl.ERROR_CODE.SUCCESS: + # Retrieve point cloud with RGBA data + self.zed.retrieve_measure(self.point_cloud, sl.MEASURE.XYZRGBA) + point_cloud_data = self.point_cloud.get_data() + + # Convert to numpy array format + height, width = point_cloud_data.shape[:2] + points = point_cloud_data.reshape(-1, 4) + + # Extract XYZ coordinates + xyz = points[:, :3] + + # Extract and unpack RGBA color data from 4th channel + rgba_packed = points[:, 3].view(np.uint32) + + # Unpack RGBA: each 32-bit value contains 4 bytes (R, G, B, A) + colors_rgba = np.zeros((len(rgba_packed), 4), dtype=np.uint8) + colors_rgba[:, 0] = rgba_packed & 0xFF # R + colors_rgba[:, 1] = (rgba_packed >> 8) & 0xFF # G + colors_rgba[:, 2] = (rgba_packed >> 16) & 0xFF # B + colors_rgba[:, 3] = (rgba_packed >> 24) & 0xFF # A + + # Extract RGB (ignore alpha) and normalize to [0, 1] + colors_rgb = colors_rgba[:, :3].astype(np.float64) / 255.0 + + # Filter out invalid points (NaN or inf) + valid = np.isfinite(xyz).all(axis=1) + valid_xyz = xyz[valid] + valid_colors = colors_rgb[valid] + + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + + if len(valid_xyz) > 0: + pcd.points = o3d.utility.Vector3dVector(valid_xyz) + pcd.colors = o3d.utility.Vector3dVector(valid_colors) + + return pcd + else: + logger.warning("Failed to grab frame for point cloud") + return None + + except Exception as e: + logger.error(f"Error capturing point cloud: {e}") + return None + + def get_camera_info(self) -> Dict[str, Any]: + """Get ZED camera information and calibration parameters.""" + if not self.is_opened: + return {} + + try: + info = self.zed.get_camera_information() + calibration = info.camera_configuration.calibration_parameters + + # In ZED SDK 4.0+, the baseline calculation has changed + # Try to get baseline 
from the stereo parameters + try: + # Method 1: Try to get from stereo parameters if available + if hasattr(calibration, "getCameraBaseline"): + baseline = calibration.getCameraBaseline() + else: + # Method 2: Calculate from left and right camera positions + # The baseline is the distance between left and right cameras + left_cam = calibration.left_cam + right_cam = calibration.right_cam + + # Try different ways to get baseline in SDK 4.0+ + if hasattr(info.camera_configuration, "calibration_parameters_raw"): + # Use raw calibration if available + raw_calib = info.camera_configuration.calibration_parameters_raw + if hasattr(raw_calib, "T"): + baseline = abs(raw_calib.T[0]) + else: + baseline = 0.12 # Default ZED-M baseline approximation + else: + # Use default baseline for ZED-M + baseline = 0.12 # ZED-M baseline is approximately 120mm + except: + baseline = 0.12 # Fallback to approximate ZED-M baseline + + return { + "model": str(info.camera_model), + "serial_number": info.serial_number, + "firmware": info.camera_configuration.firmware_version, + "resolution": { + "width": info.camera_configuration.resolution.width, + "height": info.camera_configuration.resolution.height, + }, + "fps": info.camera_configuration.fps, + "left_cam": { + "fx": calibration.left_cam.fx, + "fy": calibration.left_cam.fy, + "cx": calibration.left_cam.cx, + "cy": calibration.left_cam.cy, + "k1": calibration.left_cam.disto[0], + "k2": calibration.left_cam.disto[1], + "p1": calibration.left_cam.disto[2], + "p2": calibration.left_cam.disto[3], + "k3": calibration.left_cam.disto[4], + }, + "right_cam": { + "fx": calibration.right_cam.fx, + "fy": calibration.right_cam.fy, + "cx": calibration.right_cam.cx, + "cy": calibration.right_cam.cy, + "k1": calibration.right_cam.disto[0], + "k2": calibration.right_cam.disto[1], + "p1": calibration.right_cam.disto[2], + "p2": calibration.right_cam.disto[3], + "k3": calibration.right_cam.disto[4], + }, + "baseline": baseline, + } + except Exception as e: + 
logger.error(f"Error getting camera info: {e}") + return {} + + def calculate_intrinsics(self): + """Calculate camera intrinsics from ZED calibration.""" + info = self.get_camera_info() + if not info: + return super().calculate_intrinsics() + + left_cam = info.get("left_cam", {}) + resolution = info.get("resolution", {}) + + return { + "focal_length_x": left_cam.get("fx", 0), + "focal_length_y": left_cam.get("fy", 0), + "principal_point_x": left_cam.get("cx", 0), + "principal_point_y": left_cam.get("cy", 0), + "baseline": info.get("baseline", 0), + "resolution_width": resolution.get("width", 0), + "resolution_height": resolution.get("height", 0), + } + + def __enter__(self): + """Context manager entry.""" + if not self.open(): + raise RuntimeError("Failed to open ZED camera") + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.close() diff --git a/dimos/perception/common/__init__.py b/dimos/perception/common/__init__.py index ad815d3f46..e658a8734c 100644 --- a/dimos/perception/common/__init__.py +++ b/dimos/perception/common/__init__.py @@ -1,3 +1,3 @@ from .detection2d_tracker import target2dTracker, get_tracked_results -from .cuboid_fit import * from .ibvs import * +from .utils import * diff --git a/dimos/perception/common/cuboid_fit.py b/dimos/perception/common/cuboid_fit.py deleted file mode 100644 index 9848332c06..0000000000 --- a/dimos/perception/common/cuboid_fit.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -from sklearn.decomposition import PCA -import matplotlib.pyplot as plt -import cv2 - - -def depth_to_point_cloud(depth_image, camera_matrix, subsample_factor=4): - """ - Convert depth image to point cloud using camera intrinsics. - Subsamples points to reduce density. - - Args: - depth_image: HxW depth image in meters - camera_matrix: 3x3 camera intrinsic matrix - subsample_factor: Factor to subsample points (higher = fewer points) - - Returns: - Nx3 array of 3D points - """ - # Get focal length and principal point from camera matrix - fx = camera_matrix[0, 0] - fy = camera_matrix[1, 1] - cx = camera_matrix[0, 2] - cy = camera_matrix[1, 2] - - # Create pixel coordinate grid - rows, cols = depth_image.shape - x_grid, y_grid = np.meshgrid( - np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor) - ) - - # Flatten grid and depth - x = x_grid.flatten() - y = y_grid.flatten() - z = depth_image[y_grid, x_grid].flatten() - - # Remove points with invalid depth - valid = z > 0 - x = x[valid] - y = y[valid] - z = z[valid] - - # Convert to 3D points - X = (x - cx) * z / fx - Y = (y - cy) * z / fy - Z = z - - return np.column_stack([X, Y, Z]) - - -def fit_cuboid(points, n_iterations=5, inlier_thresh=2.0): - """ - Fit a cuboid to a point cloud using iteratively refined PCA. 
- - Args: - points: Nx3 array of points - n_iterations: Number of refinement iterations - inlier_thresh: Threshold for inlier detection in standard deviations - - Returns: - dict containing: - - center: 3D center point - - dimensions: 3D dimensions - - rotation: 3x3 rotation matrix - - error: fitting error - """ - points = np.asarray(points) - if len(points) < 4: - return None - - # Initial center estimate using median for robustness - best_error = float("inf") - best_params = None - center = np.median(points, axis=0) - current_points = points - center - - for iteration in range(n_iterations): - if len(current_points) < 4: # Need at least 4 points for PCA - break - - # Perform PCA - pca = PCA(n_components=3) - pca.fit(current_points) - - # Get rotation matrix from PCA - rotation = pca.components_ - - # Transform points to PCA space - local_points = current_points @ rotation.T - - # Initialize mask for this iteration - inlier_mask = np.ones(len(current_points), dtype=bool) - dimensions = np.zeros(3) - - # Filter points along each dimension - for dim in range(3): - points_1d = local_points[inlier_mask, dim] - if len(points_1d) < 4: - break - - median = np.median(points_1d) - mad = np.median(np.abs(points_1d - median)) - sigma = mad * 1.4826 # Convert MAD to standard deviation estimate - - # Avoid issues with constant values - if sigma < 1e-6: - continue - - # Update mask for this dimension - dim_inliers = np.abs(points_1d - median) < (inlier_thresh * sigma) - inlier_mask[inlier_mask] = dim_inliers - - # Calculate dimension based on robust statistics - valid_points = points_1d[dim_inliers] - if len(valid_points) > 0: - dimensions[dim] = np.max(valid_points) - np.min(valid_points) - - # Skip if we don't have enough inliers - if np.sum(inlier_mask) < 4: - continue - - # Calculate error for this iteration - # Mean squared distance from points to cuboid surface - half_dims = dimensions / 2 - dx = np.abs(local_points[:, 0]) - half_dims[0] - dy = np.abs(local_points[:, 1]) 
- half_dims[1] - dz = np.abs(local_points[:, 2]) - half_dims[2] - - outside_dist = np.sqrt( - np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2 - ) - inside_dist = np.minimum(np.maximum(np.maximum(dx, dy), dz), 0) - distances = outside_dist + inside_dist - error = np.mean(distances**2) - - if error < best_error: - best_error = error - best_params = { - "center": center, - "rotation": rotation, - "dimensions": dimensions, - "error": error, - } - - # Update points for next iteration - current_points = current_points[inlier_mask] - - return best_params - - -def compute_fitting_error(local_points, dimensions): - """Compute mean squared distance from points to cuboid surface.""" - half_dims = dimensions / 2 - dx = np.abs(local_points[:, 0]) - half_dims[0] - dy = np.abs(local_points[:, 1]) - half_dims[1] - dz = np.abs(local_points[:, 2]) - half_dims[2] - - outside_dist = np.sqrt(np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2) - inside_dist = np.minimum(np.maximum(np.maximum(dx, dy), dz), 0) - - distances = outside_dist + inside_dist - return np.mean(distances**2) - - -def get_cuboid_corners(center, dimensions, rotation): - """Get the 8 corners of a cuboid.""" - half_dims = dimensions / 2 - corners_local = ( - np.array( - [ - [-1, -1, -1], # 0: left bottom back - [-1, -1, 1], # 1: left bottom front - [-1, 1, -1], # 2: left top back - [-1, 1, 1], # 3: left top front - [1, -1, -1], # 4: right bottom back - [1, -1, 1], # 5: right bottom front - [1, 1, -1], # 6: right top back - [1, 1, 1], # 7: right top front - ] - ) - * half_dims - ) - - return corners_local @ rotation + center - - -def visualize_fit(image, cuboid_params, camera_matrix, R=None, t=None): - """ - Draw the fitted cuboid on the image. 
- """ - # Get corners in world coordinates - corners = get_cuboid_corners( - cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] - ) - - # Transform corners if R and t are provided - if R is not None and t is not None: - corners = (R @ corners.T).T + t - - # Project corners to image space - corners_img = ( - cv2.projectPoints( - corners, - np.zeros(3), - np.zeros(3), # Already in camera frame - camera_matrix, - None, - )[0] - .reshape(-1, 2) - .astype(int) - ) - - # Define edges for visualization - edges = [ - # Bottom face - (0, 1), - (1, 5), - (5, 4), - (4, 0), - # Top face - (2, 3), - (3, 7), - (7, 6), - (6, 2), - # Vertical edges - (0, 2), - (1, 3), - (5, 7), - (4, 6), - ] - - # Draw edges - vis_img = image.copy() - for i, j in edges: - cv2.line(vis_img, tuple(corners_img[i]), tuple(corners_img[j]), (0, 255, 0), 2) - - # Add text with dimensions - dims = cuboid_params["dimensions"] - dim_text = f"Dims: {dims[0]:.3f} x {dims[1]:.3f} x {dims[2]:.3f}" - cv2.putText(vis_img, dim_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) - - return vis_img - - -def plot_3d_fit(points, cuboid_params, title="3D Cuboid Fit"): - """Plot points and fitted cuboid in 3D.""" - fig = plt.figure(figsize=(10, 10)) - ax = fig.add_subplot(111, projection="3d") - - # Plot points - ax.scatter( - points[:, 0], points[:, 1], points[:, 2], c="b", marker=".", alpha=0.1, label="Points" - ) - - # Plot fitted cuboid - corners = get_cuboid_corners( - cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] - ) - - # Define edges - edges = [ - # Bottom face - (0, 1), - (1, 5), - (5, 4), - (4, 0), - # Top face - (2, 3), - (3, 7), - (7, 6), - (6, 2), - # Vertical edges - (0, 2), - (1, 3), - (5, 7), - (4, 6), - ] - - # Plot edges - for i, j in edges: - ax.plot3D( - [corners[i, 0], corners[j, 0]], - [corners[i, 1], corners[j, 1]], - [corners[i, 2], corners[j, 2]], - "r-", - ) - - # Set labels and title - ax.set_xlabel("X") - 
ax.set_ylabel("Y") - ax.set_zlabel("Z") - ax.set_title(title) - - # Make scaling uniform - all_points = np.vstack([points, corners]) - max_range = ( - np.array( - [ - all_points[:, 0].max() - all_points[:, 0].min(), - all_points[:, 1].max() - all_points[:, 1].min(), - all_points[:, 2].max() - all_points[:, 2].min(), - ] - ).max() - / 2.0 - ) - - mid_x = (all_points[:, 0].max() + all_points[:, 0].min()) * 0.5 - mid_y = (all_points[:, 1].max() + all_points[:, 1].min()) * 0.5 - mid_z = (all_points[:, 2].max() + all_points[:, 2].min()) * 0.5 - - ax.set_xlim(mid_x - max_range, mid_x + max_range) - ax.set_ylim(mid_y - max_range, mid_y + max_range) - ax.set_zlim(mid_z - max_range, mid_z + max_range) - - ax.set_box_aspect([1, 1, 1]) - plt.legend() - return fig, ax diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py new file mode 100644 index 0000000000..da9cc58fe0 --- /dev/null +++ b/dimos/perception/common/utils.py @@ -0,0 +1,331 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cv2 +import numpy as np +from typing import List, Tuple, Optional +from dimos.types.manipulation import ObjectData +from dimos.types.vector import Vector +from dimos.utils.logging_config import setup_logger +import torch + +logger = setup_logger("dimos.perception.common.utils") + + +def colorize_depth(depth_img: np.ndarray, max_depth: float = 5.0) -> Optional[np.ndarray]: + """ + Normalize and colorize depth image using COLORMAP_JET. + + Args: + depth_img: Depth image (H, W) in meters + max_depth: Maximum depth value for normalization + + Returns: + Colorized depth image (H, W, 3) in RGB format, or None if input is None + """ + if depth_img is None: + return None + + valid_mask = np.isfinite(depth_img) & (depth_img > 0) + depth_norm = np.zeros_like(depth_img) + depth_norm[valid_mask] = np.clip(depth_img[valid_mask] / max_depth, 0, 1) + depth_colored = cv2.applyColorMap((depth_norm * 255).astype(np.uint8), cv2.COLORMAP_JET) + depth_rgb = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB) + + # Make the depth image less bright by scaling down the values + depth_rgb = (depth_rgb * 0.6).astype(np.uint8) + + return depth_rgb + + +def draw_bounding_box( + image: np.ndarray, + bbox: List[float], + color: Tuple[int, int, int] = (0, 255, 0), + thickness: int = 2, + label: Optional[str] = None, + confidence: Optional[float] = None, + object_id: Optional[int] = None, + font_scale: float = 0.6, +) -> np.ndarray: + """ + Draw a bounding box with optional label on an image. 
+ + Args: + image: Image to draw on (H, W, 3) + bbox: Bounding box [x1, y1, x2, y2] + color: RGB color tuple for the box + thickness: Line thickness for the box + label: Optional class label + confidence: Optional confidence score + object_id: Optional object ID + font_scale: Font scale for text + + Returns: + Image with bounding box drawn + """ + x1, y1, x2, y2 = map(int, bbox) + + # Draw bounding box + cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness) + + # Create label text + text_parts = [] + if label is not None: + text_parts.append(str(label)) + if object_id is not None: + text_parts.append(f"ID: {object_id}") + if confidence is not None: + text_parts.append(f"({confidence:.2f})") + + if text_parts: + text = ", ".join(text_parts) + + # Draw text background + text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1)[0] + cv2.rectangle( + image, + (x1, y1 - text_size[1] - 5), + (x1 + text_size[0], y1), + (0, 0, 0), + -1, + ) + + # Draw text + cv2.putText( + image, + text, + (x1, y1 - 5), + cv2.FONT_HERSHEY_SIMPLEX, + font_scale, + (255, 255, 255), + 1, + ) + + return image + + +def draw_segmentation_mask( + image: np.ndarray, + mask: np.ndarray, + color: Tuple[int, int, int] = (0, 200, 200), + alpha: float = 0.5, + draw_contours: bool = True, + contour_thickness: int = 2, +) -> np.ndarray: + """ + Draw segmentation mask overlay on an image. 
+ + Args: + image: Image to draw on (H, W, 3) + mask: Segmentation mask (H, W) - boolean or uint8 + color: RGB color for the mask + alpha: Transparency factor (0.0 = transparent, 1.0 = opaque) + draw_contours: Whether to draw mask contours + contour_thickness: Thickness of contour lines + + Returns: + Image with mask overlay drawn + """ + if mask is None: + return image + + try: + # Ensure mask is uint8 + mask = mask.astype(np.uint8) + + # Create colored mask overlay + colored_mask = np.zeros_like(image) + colored_mask[mask > 0] = color + + # Apply the mask with transparency + mask_area = mask > 0 + image[mask_area] = cv2.addWeighted( + image[mask_area], 1 - alpha, colored_mask[mask_area], alpha, 0 + ) + + # Draw mask contours if requested + if draw_contours: + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(image, contours, -1, color, contour_thickness) + + except Exception as e: + logger.warning(f"Error drawing segmentation mask: {e}") + + return image + + +def draw_object_detection_visualization( + image: np.ndarray, + objects: List[ObjectData], + draw_masks: bool = False, + bbox_color: Tuple[int, int, int] = (0, 255, 0), + mask_color: Tuple[int, int, int] = (0, 200, 200), + font_scale: float = 0.6, +) -> np.ndarray: + """ + Create object detection visualization with bounding boxes and optional masks. 
+ + Args: + image: Base image to draw on (H, W, 3) + objects: List of ObjectData with detection information + draw_masks: Whether to draw segmentation masks + bbox_color: Default color for bounding boxes + mask_color: Default color for segmentation masks + font_scale: Font scale for text labels + + Returns: + Image with detection visualization + """ + viz_image = image.copy() + + for obj in objects: + try: + # Draw segmentation mask first (if enabled and available) + if draw_masks and "segmentation_mask" in obj and obj["segmentation_mask"] is not None: + viz_image = draw_segmentation_mask( + viz_image, obj["segmentation_mask"], color=mask_color, alpha=0.5 + ) + + # Draw bounding box + if "bbox" in obj and obj["bbox"] is not None: + # Use object's color if available, otherwise default + color = bbox_color + if "color" in obj and obj["color"] is not None: + obj_color = obj["color"] + if isinstance(obj_color, np.ndarray): + color = tuple(int(c) for c in obj_color) + elif isinstance(obj_color, (list, tuple)): + color = tuple(int(c) for c in obj_color[:3]) + + viz_image = draw_bounding_box( + viz_image, + obj["bbox"], + color=color, + label=obj.get("label"), + confidence=obj.get("confidence"), + object_id=obj.get("object_id"), + font_scale=font_scale, + ) + + except Exception as e: + logger.warning(f"Error drawing object visualization: {e}") + + return viz_image + + +def detection_results_to_object_data( + bboxes: List[List[float]], + track_ids: List[int], + class_ids: List[int], + confidences: List[float], + names: List[str], + masks: Optional[List[np.ndarray]] = None, + source: str = "detection", +) -> List[ObjectData]: + """ + Convert detection/segmentation results to ObjectData format. 
+ + Args: + bboxes: List of bounding boxes [x1, y1, x2, y2] + track_ids: List of tracking IDs + class_ids: List of class indices + confidences: List of detection confidences + names: List of class names + masks: Optional list of segmentation masks + source: Source type ("detection" or "segmentation") + + Returns: + List of ObjectData dictionaries + """ + objects = [] + + for i in range(len(bboxes)): + # Calculate basic properties from bbox + bbox = bboxes[i] + width = bbox[2] - bbox[0] + height = bbox[3] - bbox[1] + center_x = bbox[0] + width / 2 + center_y = bbox[1] + height / 2 + + # Create ObjectData + object_data: ObjectData = { + "object_id": track_ids[i] if i < len(track_ids) else i, + "bbox": bbox, + "depth": -1.0, # Will be populated by depth estimation or point cloud processing + "confidence": confidences[i] if i < len(confidences) else 1.0, + "class_id": class_ids[i] if i < len(class_ids) else 0, + "label": names[i] if i < len(names) else f"{source}_object", + "movement_tolerance": 1.0, # Default to freely movable + "segmentation_mask": masks[i].cpu().numpy() + if masks and i < len(masks) and isinstance(masks[i], torch.Tensor) + else masks[i] + if masks and i < len(masks) + else None, + # Initialize 3D properties (will be populated by point cloud processing) + "position": Vector(0, 0, 0), + "rotation": Vector(0, 0, 0), + "size": { + "width": 0.0, + "height": 0.0, + "depth": 0.0, + }, + } + objects.append(object_data) + + return objects + + +def combine_object_data( + list1: List[ObjectData], list2: List[ObjectData], overlap_threshold: float = 0.8 +) -> List[ObjectData]: + """ + Combine two ObjectData lists, removing duplicates based on segmentation mask overlap. 
+ """ + combined = list1.copy() + used_ids = set(obj.get("object_id", 0) for obj in list1) + next_id = max(used_ids) + 1 if used_ids else 1 + + for obj2 in list2: + obj_copy = obj2.copy() + + # Handle duplicate object_id + if obj_copy.get("object_id", 0) in used_ids: + obj_copy["object_id"] = next_id + next_id += 1 + used_ids.add(obj_copy["object_id"]) + + # Check mask overlap + mask2 = obj2.get("segmentation_mask") + if mask2 is None or np.sum(mask2 > 0) == 0: + combined.append(obj_copy) + continue + + mask2_area = np.sum(mask2 > 0) + is_duplicate = False + + for obj1 in list1: + mask1 = obj1.get("segmentation_mask") + if mask1 is None: + continue + + intersection = np.sum((mask1 > 0) & (mask2 > 0)) + if intersection / mask2_area >= overlap_threshold: + is_duplicate = True + break + + if not is_duplicate: + combined.append(obj_copy) + + return combined diff --git a/dimos/perception/grasp_generation/__init__.py b/dimos/perception/grasp_generation/__init__.py new file mode 100644 index 0000000000..16281fe0b6 --- /dev/null +++ b/dimos/perception/grasp_generation/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/dimos/perception/grasp_generation/grasp_generation.py b/dimos/perception/grasp_generation/grasp_generation.py new file mode 100644 index 0000000000..89e7a0036c --- /dev/null +++ b/dimos/perception/grasp_generation/grasp_generation.py @@ -0,0 +1,228 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Dimensional-hosted grasp generation for manipulation pipeline. +""" + +import asyncio +import numpy as np +import open3d as o3d +from typing import Dict, List, Optional + +from dimos.types.manipulation import ObjectData +from dimos.utils.logging_config import setup_logger +from dimos.perception.grasp_generation.utils import parse_grasp_results + +logger = setup_logger("dimos.perception.grasp_generation") + + +class HostedGraspGenerator: + """ + Dimensional-hosted grasp generator using WebSocket communication. + """ + + def __init__(self, server_url: str): + """ + Initialize Dimensional-hosted grasp generator. + + Args: + server_url: WebSocket URL for Dimensional-hosted grasp generator server + """ + self.server_url = server_url + logger.info(f"Initialized grasp generator with server: {server_url}") + + def generate_grasps_from_objects( + self, objects: List[ObjectData], full_pcd: o3d.geometry.PointCloud + ) -> List[Dict]: + """ + Generate grasps from ObjectData objects using grasp generator. 
+ + Args: + objects: List of ObjectData with point clouds + full_pcd: Open3D point cloud of full scene + + Returns: + Parsed grasp results as list of dictionaries + """ + try: + # Combine all point clouds + all_points = [] + all_colors = [] + valid_objects = 0 + + for obj in objects: + if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: + continue + + points = obj["point_cloud_numpy"] + if not isinstance(points, np.ndarray) or points.size == 0: + continue + + if len(points.shape) != 2 or points.shape[1] != 3: + continue + + colors = None + if "colors_numpy" in obj and obj["colors_numpy"] is not None: + colors = obj["colors_numpy"] + if isinstance(colors, np.ndarray) and colors.size > 0: + if ( + colors.shape[0] != points.shape[0] + or len(colors.shape) != 2 + or colors.shape[1] != 3 + ): + colors = None + + all_points.append(points) + if colors is not None: + all_colors.append(colors) + valid_objects += 1 + + if not all_points: + return [] + + # Combine point clouds + combined_points = np.vstack(all_points) + combined_colors = None + if len(all_colors) == valid_objects and len(all_colors) > 0: + combined_colors = np.vstack(all_colors) + + # Send grasp request + grasps = self._send_grasp_request_sync(combined_points, combined_colors) + + if not grasps: + return [] + + # Parse and return results in list of dictionaries format + return parse_grasp_results(grasps) + + except Exception as e: + logger.error(f"Grasp generation failed: {e}") + return [] + + def _send_grasp_request_sync( + self, points: np.ndarray, colors: Optional[np.ndarray] + ) -> Optional[List[Dict]]: + """Send synchronous grasp request to grasp server.""" + + try: + # Prepare colors + colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 + + # Ensure correct data types + points = points.astype(np.float32) + colors = colors.astype(np.float32) + + # Validate ranges + if np.any(np.isnan(points)) or np.any(np.isinf(points)): + logger.error("Points contain NaN or Inf values") + 
return None + if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): + logger.error("Colors contain NaN or Inf values") + return None + + colors = np.clip(colors, 0.0, 1.0) + + # Run async request in sync context + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + result = loop.run_until_complete(self._async_grasp_request(points, colors)) + return result + finally: + loop.close() + + except Exception as e: + logger.error(f"Error in synchronous grasp request: {e}") + return None + + async def _async_grasp_request( + self, points: np.ndarray, colors: np.ndarray + ) -> Optional[List[Dict]]: + """Async grasp request helper.""" + import json + import websockets + + try: + async with websockets.connect(self.server_url) as websocket: + request = { + "points": points.tolist(), + "colors": colors.tolist(), + "lims": [-1.0, 1.0, -1.0, 1.0, 0.0, 2.0], + } + + await websocket.send(json.dumps(request)) + response = await websocket.recv() + grasps = json.loads(response) + + if isinstance(grasps, dict) and "error" in grasps: + logger.error(f"Server returned error: {grasps['error']}") + return None + elif isinstance(grasps, (int, float)) and grasps == 0: + return None + elif not isinstance(grasps, list): + logger.error(f"Server returned unexpected response type: {type(grasps)}") + return None + elif len(grasps) == 0: + return None + + return self._convert_grasp_format(grasps) + + except Exception as e: + logger.error(f"Async grasp request failed: {e}") + return None + + def _convert_grasp_format(self, grasps: List[dict]) -> List[dict]: + """Convert Dimensional Grasp format to visualization format.""" + converted = [] + + for i, grasp in enumerate(grasps): + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) + euler_angles = self._rotation_matrix_to_euler(rotation_matrix) + + converted_grasp = { + "id": f"grasp_{i}", + "score": grasp.get("score", 0.0), + "width": grasp.get("width", 0.0), + "height": grasp.get("height", 0.0), + "depth": 
grasp.get("depth", 0.0), + "translation": grasp.get("translation", [0, 0, 0]), + "rotation_matrix": rotation_matrix.tolist(), + "euler_angles": euler_angles, + } + converted.append(converted_grasp) + + converted.sort(key=lambda x: x["score"], reverse=True) + return converted + + def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> Dict[str, float]: + """Convert rotation matrix to Euler angles (in radians).""" + sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) + + singular = sy < 1e-6 + + if not singular: + x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) + else: + x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = 0 + + return {"roll": x, "pitch": y, "yaw": z} + + def cleanup(self): + """Clean up resources.""" + logger.info("Grasp generator cleaned up") diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py new file mode 100644 index 0000000000..5a79a7d3bc --- /dev/null +++ b/dimos/perception/grasp_generation/utils.py @@ -0,0 +1,621 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for grasp generation and visualization.""" + +import numpy as np +import open3d as o3d +import cv2 +from typing import List, Dict, Tuple, Optional, Union + + +def project_3d_points_to_2d( + points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] +) -> np.ndarray: + """ + Project 3D points to 2D image coordinates using camera intrinsics. + + Args: + points_3d: Nx3 array of 3D points (X, Y, Z) + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + + Returns: + Nx2 array of 2D image coordinates (u, v) + """ + if len(points_3d) == 0: + return np.zeros((0, 2), dtype=np.int32) + + # Filter out points with zero or negative depth + valid_mask = points_3d[:, 2] > 0 + if not np.any(valid_mask): + return np.zeros((0, 2), dtype=np.int32) + + valid_points = points_3d[valid_mask] + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + camera_matrix = np.array(camera_intrinsics) + fx = camera_matrix[0, 0] + fy = camera_matrix[1, 1] + cx = camera_matrix[0, 2] + cy = camera_matrix[1, 2] + + # Project to image coordinates + u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx + v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy + + # Round to integer pixel coordinates + points_2d = np.column_stack([u, v]).astype(np.int32) + + return points_2d + + +def euler_to_rotation_matrix(roll: float, pitch: float, yaw: float) -> np.ndarray: + """ + Convert Euler angles to rotation matrix. 
+ + Args: + roll: Roll angle in radians + pitch: Pitch angle in radians + yaw: Yaw angle in radians + + Returns: + 3x3 rotation matrix + """ + Rx = np.array([[1, 0, 0], [0, np.cos(roll), -np.sin(roll)], [0, np.sin(roll), np.cos(roll)]]) + + Ry = np.array( + [[np.cos(pitch), 0, np.sin(pitch)], [0, 1, 0], [-np.sin(pitch), 0, np.cos(pitch)]] + ) + + Rz = np.array([[np.cos(yaw), -np.sin(yaw), 0], [np.sin(yaw), np.cos(yaw), 0], [0, 0, 1]]) + + # Combined rotation matrix + R = Rz @ Ry @ Rx + + return R + + +def create_gripper_geometry( + grasp_data: dict, + finger_length: float = 0.08, + finger_thickness: float = 0.004, +) -> List[o3d.geometry.TriangleMesh]: + """ + Create a simple fork-like gripper geometry from grasp data. + + Args: + grasp_data: Dictionary containing grasp parameters + - translation: 3D position list + - rotation_matrix: 3x3 rotation matrix defining gripper coordinate system + * X-axis: gripper width direction (opening/closing) + * Y-axis: finger length direction + * Z-axis: approach direction (toward object) + - width: Gripper opening width + finger_length: Length of gripper fingers (longer) + finger_thickness: Thickness of gripper fingers + base_height: Height of gripper base (longer) + color: RGB color for the gripper (solid blue) + + Returns: + List of Open3D TriangleMesh geometries for the gripper + """ + + translation = np.array(grasp_data["translation"]) + rotation_matrix = np.array(grasp_data["rotation_matrix"]) + + width = grasp_data.get("width", 0.04) + + # Create transformation matrix + transform = np.eye(4) + transform[:3, :3] = rotation_matrix + transform[:3, 3] = translation + + geometries = [] + + # Gripper dimensions + finger_width = 0.006 # Thickness of each finger + handle_length = 0.05 # Length of handle extending backward + + # Build gripper in local coordinate system: + # X-axis = width direction (left/right finger separation) + # Y-axis = finger length direction (fingers extend along +Y) + # Z-axis = approach direction (toward 
object, handle extends along -Z) + # IMPORTANT: Fingertips should be at origin (translation point) + + # Create left finger extending along +Y, positioned at +X + left_finger = o3d.geometry.TriangleMesh.create_box( + width=finger_width, # Thin finger + height=finger_length, # Extends along Y (finger length direction) + depth=finger_thickness, # Thin in Z direction + ) + left_finger.translate( + [ + width / 2 - finger_width / 2, # Position at +X (half width from center) + -finger_length, # Shift so fingertips are at origin + -finger_thickness / 2, # Center in Z + ] + ) + + # Create right finger extending along +Y, positioned at -X + right_finger = o3d.geometry.TriangleMesh.create_box( + width=finger_width, # Thin finger + height=finger_length, # Extends along Y (finger length direction) + depth=finger_thickness, # Thin in Z direction + ) + right_finger.translate( + [ + -width / 2 - finger_width / 2, # Position at -X (half width from center) + -finger_length, # Shift so fingertips are at origin + -finger_thickness / 2, # Center in Z + ] + ) + + # Create base connecting fingers - flat like a stickman body + base = o3d.geometry.TriangleMesh.create_box( + width=width + finger_width, # Full width plus finger thickness + height=finger_thickness, # Flat like fingers (stickman style) + depth=finger_thickness, # Thin like fingers + ) + base.translate( + [ + -width / 2 - finger_width / 2, # Start from left finger position + -finger_length - finger_thickness, # Behind fingers, adjusted for fingertips at origin + -finger_thickness / 2, # Center in Z + ] + ) + + # Create handle extending backward - flat stick like stickman arm + handle = o3d.geometry.TriangleMesh.create_box( + width=finger_width, # Same width as fingers + height=handle_length, # Extends backward along Y direction (same plane) + depth=finger_thickness, # Thin like fingers (same plane) + ) + handle.translate( + [ + -finger_width / 2, # Center in X + -finger_length + - finger_thickness + - handle_length, # Extend 
backward from base, adjusted for fingertips at origin + -finger_thickness / 2, # Same Z plane as other components + ] + ) + + # Use solid red color for all parts (user changed to red) + solid_color = [1.0, 0.0, 0.0] # Red color + + left_finger.paint_uniform_color(solid_color) + right_finger.paint_uniform_color(solid_color) + base.paint_uniform_color(solid_color) + handle.paint_uniform_color(solid_color) + + # Apply transformation to all parts + left_finger.transform(transform) + right_finger.transform(transform) + base.transform(transform) + handle.transform(transform) + + geometries.extend([left_finger, right_finger, base, handle]) + + return geometries + + +def create_all_gripper_geometries( + grasp_list: List[dict], max_grasps: int = -1 +) -> List[o3d.geometry.TriangleMesh]: + """ + Create gripper geometries for multiple grasps. + + Args: + grasp_list: List of grasp dictionaries + max_grasps: Maximum number of grasps to visualize (-1 for all) + + Returns: + List of all gripper geometries + """ + all_geometries = [] + + grasps_to_show = grasp_list if max_grasps < 0 else grasp_list[:max_grasps] + + for grasp in grasps_to_show: + gripper_parts = create_gripper_geometry(grasp) + all_geometries.extend(gripper_parts) + + return all_geometries + + +def draw_grasps_on_image( + image: np.ndarray, + grasp_data: Union[dict, Dict[Union[int, str], List[dict]], List[dict]], + camera_intrinsics: Union[List[float], np.ndarray], # [fx, fy, cx, cy] or 3x3 matrix + max_grasps: int = -1, # -1 means show all grasps + finger_length: float = 0.08, # Match 3D gripper + finger_thickness: float = 0.004, # Match 3D gripper +) -> np.ndarray: + """ + Draw fork-like gripper visualizations on the image matching 3D gripper design. 
+ + Args: + image: Base image to draw on + grasp_data: Can be: + - A single grasp dict + - A list of grasp dicts + - A dictionary mapping object IDs or "scene" to list of grasps + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + max_grasps: Maximum number of grasps to visualize (-1 for all) + finger_length: Length of gripper fingers (matches 3D design) + finger_thickness: Thickness of gripper fingers (matches 3D design) + + Returns: + Image with grasps drawn + """ + result = image.copy() + + # Convert camera intrinsics to 3x3 matrix if needed + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]) + else: + camera_matrix = np.array(camera_intrinsics) + + # Convert input to standard format + if isinstance(grasp_data, dict) and not any( + key in grasp_data for key in ["scene", 0, 1, 2, 3, 4, 5] + ): + # Single grasp + grasps_to_draw = [(grasp_data, 0)] + elif isinstance(grasp_data, list): + # List of grasps + grasps_to_draw = [(grasp, i) for i, grasp in enumerate(grasp_data)] + else: + # Dictionary of grasps by object ID + grasps_to_draw = [] + for obj_id, grasps in grasp_data.items(): + for i, grasp in enumerate(grasps): + grasps_to_draw.append((grasp, i)) + + # Limit number of grasps if specified + if max_grasps > 0: + grasps_to_draw = grasps_to_draw[:max_grasps] + + # Define grasp colors (solid red to match 3D design) + def get_grasp_color(index: int) -> tuple: + # Use solid red color for all grasps to match 3D design + return (0, 0, 255) # Red in BGR format for OpenCV + + # Draw each grasp + for grasp, index in grasps_to_draw: + try: + color = get_grasp_color(index) + thickness = max(1, 4 - index // 3) + + # Extract grasp parameters (using translation and rotation_matrix) + if "translation" not in grasp or "rotation_matrix" not in grasp: + continue + + translation = np.array(grasp["translation"]) + rotation_matrix = 
np.array(grasp["rotation_matrix"]) + width = grasp.get("width", 0.04) + + # Match 3D gripper dimensions + finger_width = 0.006 # Thickness of each finger (matches 3D) + handle_length = 0.05 # Length of handle extending backward (matches 3D) + + # Create gripper geometry in local coordinate system matching 3D design: + # X-axis = width direction (left/right finger separation) + # Y-axis = finger length direction (fingers extend along +Y) + # Z-axis = approach direction (toward object, handle extends along -Z) + # IMPORTANT: Fingertips should be at origin (translation point) + + # Left finger extending along +Y, positioned at +X + left_finger_points = np.array( + [ + [ + width / 2 - finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back left + [ + width / 2 + finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back right + [ + width / 2 + finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front right (at origin) + [ + width / 2 - finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front left (at origin) + ] + ) + + # Right finger extending along +Y, positioned at -X + right_finger_points = np.array( + [ + [ + -width / 2 - finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back left + [ + -width / 2 + finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back right + [ + -width / 2 + finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front right (at origin) + [ + -width / 2 - finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front left (at origin) + ] + ) + + # Base connecting fingers - flat rectangle behind fingers + base_points = np.array( + [ + [ + -width / 2 - finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Back left + [ + width / 2 + finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Back right + [ + width / 2 + finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Front right + [ + -width / 2 - 
finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Front left + ] + ) + + # Handle extending backward - thin rectangle + handle_points = np.array( + [ + [ + -finger_width / 2, + -finger_length - finger_thickness - handle_length, + -finger_thickness / 2, + ], # Back left + [ + finger_width / 2, + -finger_length - finger_thickness - handle_length, + -finger_thickness / 2, + ], # Back right + [ + finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Front right + [ + -finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Front left + ] + ) + + # Transform all points to world frame + def transform_points(points): + # Apply rotation and translation + world_points = (rotation_matrix @ points.T).T + translation + return world_points + + left_finger_world = transform_points(left_finger_points) + right_finger_world = transform_points(right_finger_points) + base_world = transform_points(base_points) + handle_world = transform_points(handle_points) + + # Project to 2D + left_finger_2d = project_3d_points_to_2d(left_finger_world, camera_matrix) + right_finger_2d = project_3d_points_to_2d(right_finger_world, camera_matrix) + base_2d = project_3d_points_to_2d(base_world, camera_matrix) + handle_2d = project_3d_points_to_2d(handle_world, camera_matrix) + + # Draw left finger + pts = left_finger_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw right finger + pts = right_finger_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw base + pts = base_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw handle + pts = handle_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw grasp center (fingertips at origin) + center_2d = project_3d_points_to_2d(translation.reshape(1, -1), camera_matrix)[0] + cv2.circle(result, tuple(center_2d.astype(int)), 3, color, -1) + + except 
Exception as e: + # Skip this grasp if there's an error + continue + + return result + + +def get_standard_coordinate_transform(): + """ + Get a standard coordinate transformation matrix for consistent visualization. + + This transformation ensures that: + - X (red) axis points right + - Y (green) axis points up + - Z (blue) axis points toward viewer + + Returns: + 4x4 transformation matrix + """ + # Standard transformation matrix to ensure consistent coordinate frame orientation + transform = np.array( + [ + [1, 0, 0, 0], # X points right + [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) + [0, 0, -1, 0], # Z points toward viewer (flip depth) + [0, 0, 0, 1], + ] + ) + return transform + + +def visualize_grasps_3d( + point_cloud: o3d.geometry.PointCloud, + grasp_list: List[dict], + max_grasps: int = -1, +): + """ + Visualize grasps in 3D with point cloud. + + Args: + point_cloud: Open3D point cloud + grasp_list: List of grasp dictionaries + max_grasps: Maximum number of grasps to visualize + """ + # Apply standard coordinate transformation + transform = get_standard_coordinate_transform() + + # Transform point cloud + pc_copy = o3d.geometry.PointCloud(point_cloud) + pc_copy.transform(transform) + geometries = [pc_copy] + + # Transform gripper geometries + gripper_geometries = create_all_gripper_geometries(grasp_list, max_grasps) + for geom in gripper_geometries: + geom.transform(transform) + geometries.extend(gripper_geometries) + + # Add transformed coordinate frame + origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) + origin_frame.transform(transform) + geometries.append(origin_frame) + + o3d.visualization.draw_geometries(geometries, window_name="3D Grasp Visualization") + + +def rotation_matrix_to_euler(rotation_matrix: np.ndarray) -> Tuple[float, float, float]: + """ + Convert 3x3 rotation matrix to Euler angles (roll, pitch, yaw). 
+ + Args: + rotation_matrix: 3x3 rotation matrix + + Returns: + Tuple of (roll, pitch, yaw) in radians + """ + sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) + singular = sy < 1e-6 + + if not singular: + x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) # roll + y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch + z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) # yaw + else: + x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) # roll + y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch + z = 0 # yaw + + return x, y, z + + +def parse_grasp_results(grasps: List[Dict]) -> List[Dict]: + """ + Parse grasp results into visualization format. + + Args: + grasps: List of grasp dictionaries + + Returns: + List of dictionaries containing: + - id: Unique grasp identifier + - score: Confidence score (float) + - width: Gripper opening width (float) + - translation: 3D position [x, y, z] + - rotation_matrix: 3x3 rotation matrix as nested list + """ + if not grasps: + return [] + + parsed_grasps = [] + + for i, grasp in enumerate(grasps): + # Extract data from each grasp + translation = grasp.get("translation", [0, 0, 0]) + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) + score = float(grasp.get("score", 0.0)) + width = float(grasp.get("width", 0.08)) + + parsed_grasp = { + "id": f"grasp_{i}", + "score": score, + "width": width, + "translation": translation, + "rotation_matrix": rotation_matrix.tolist(), + } + parsed_grasps.append(parsed_grasp) + + return parsed_grasps + + +def create_grasp_overlay( + rgb_image: np.ndarray, + grasps: List[Dict], + camera_intrinsics: Union[List[float], np.ndarray], +) -> np.ndarray: + """ + Create grasp visualization overlay on RGB image. 
+ + Args: + rgb_image: RGB input image + grasps: List of grasp dictionaries in viz format + camera_intrinsics: Camera parameters + + Returns: + RGB image with grasp overlay + """ + try: + bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + result_bgr = draw_grasps_on_image( + bgr_image, + grasps, + camera_intrinsics, + max_grasps=-1, + ) + return cv2.cvtColor(result_bgr, cv2.COLOR_BGR2RGB) + except Exception as e: + return rgb_image.copy() diff --git a/dimos/perception/manip_aio_pipeline.py b/dimos/perception/manip_aio_pipeline.py new file mode 100644 index 0000000000..7c69e562cf --- /dev/null +++ b/dimos/perception/manip_aio_pipeline.py @@ -0,0 +1,590 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Asynchronous, reactive manipulation pipeline for realtime detection, filtering, and grasp generation. 
+""" + +import asyncio +import json +import logging +import threading +import time +import traceback +import websockets +from typing import Dict, List, Optional, Any +import numpy as np +import reactivex as rx +import reactivex.operators as ops +from dimos.utils.logging_config import setup_logger +from dimos.perception.detection2d.detic_2d_det import Detic2DDetector +from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering +from dimos.perception.object_detection_stream import ObjectDetectionStream +from dimos.perception.grasp_generation.utils import draw_grasps_on_image +from dimos.perception.pointcloud.utils import create_point_cloud_overlay_visualization +from dimos.perception.common.utils import colorize_depth +from dimos.utils.logging_config import setup_logger +import cv2 + +logger = setup_logger("dimos.perception.manip_aio_pipeline") + + +class ManipulationPipeline: + """ + Clean separated stream pipeline with frame buffering. + + - Object detection runs independently on RGB stream + - Point cloud processing subscribes to both detection and ZED streams separately + - Simple frame buffering to match RGB+depth+objects + """ + + def __init__( + self, + camera_intrinsics: List[float], # [fx, fy, cx, cy] + min_confidence: float = 0.6, + max_objects: int = 10, + vocabulary: Optional[str] = None, + grasp_server_url: Optional[str] = None, + enable_grasp_generation: bool = False, + ): + """ + Initialize the manipulation pipeline. 
+ + Args: + camera_intrinsics: [fx, fy, cx, cy] camera parameters + min_confidence: Minimum detection confidence threshold + max_objects: Maximum number of objects to process + vocabulary: Optional vocabulary for Detic detector + grasp_server_url: Optional WebSocket URL for Dimensional Grasp server + enable_grasp_generation: Whether to enable async grasp generation + """ + self.camera_intrinsics = camera_intrinsics + self.min_confidence = min_confidence + + # Grasp generation settings + self.grasp_server_url = grasp_server_url + self.enable_grasp_generation = enable_grasp_generation + + # Asyncio event loop for WebSocket communication + self.grasp_loop = None + self.grasp_loop_thread = None + + # Storage for grasp results and filtered objects + self.latest_grasps: List[dict] = [] # Simplified: just a list of grasps + self.grasps_consumed = False + self.latest_filtered_objects = [] + self.latest_rgb_for_grasps = None # Store RGB image for grasp overlay + self.grasp_lock = threading.Lock() + + # Track pending requests - simplified to single task + self.grasp_task: Optional[asyncio.Task] = None + + # Reactive subjects for streaming filtered objects and grasps + self.filtered_objects_subject = rx.subject.Subject() + self.grasps_subject = rx.subject.Subject() + self.grasp_overlay_subject = rx.subject.Subject() # Add grasp overlay subject + + # Initialize grasp client if enabled + if self.enable_grasp_generation and self.grasp_server_url: + self._start_grasp_loop() + + # Initialize object detector + self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) + + # Initialize point cloud processor + self.pointcloud_filter = PointcloudFiltering( + color_intrinsics=camera_intrinsics, + depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics + max_num_objects=max_objects, + ) + + logger.info(f"Initialized ManipulationPipeline with confidence={min_confidence}") + + def create_streams(self, zed_stream: rx.Observable) -> Dict[str, rx.Observable]: + 
""" + Create streams using exact old main logic. + """ + # Create ZED streams (from old main) + zed_frame_stream = zed_stream.pipe(ops.share()) + + # RGB stream for object detection (from old main) + video_stream = zed_frame_stream.pipe( + ops.map(lambda x: x.get("rgb") if x is not None else None), + ops.filter(lambda x: x is not None), + ops.share(), + ) + object_detector = ObjectDetectionStream( + camera_intrinsics=self.camera_intrinsics, + min_confidence=self.min_confidence, + class_filter=None, + detector=self.detector, + video_stream=video_stream, + disable_depth=True, + ) + + # Store latest frames for point cloud processing (from old main) + latest_rgb = None + latest_depth = None + latest_point_cloud_overlay = None + frame_lock = threading.Lock() + + # Subscribe to combined ZED frames (from old main) + def on_zed_frame(zed_data): + nonlocal latest_rgb, latest_depth + if zed_data is not None: + with frame_lock: + latest_rgb = zed_data.get("rgb") + latest_depth = zed_data.get("depth") + + # Depth stream for point cloud filtering (from old main) + def get_depth_or_overlay(zed_data): + if zed_data is None: + return None + + # Check if we have a point cloud overlay available + with frame_lock: + overlay = latest_point_cloud_overlay + + if overlay is not None: + return overlay + else: + # Return regular colorized depth + return colorize_depth(zed_data.get("depth"), max_depth=10.0) + + depth_stream = zed_frame_stream.pipe( + ops.map(get_depth_or_overlay), ops.filter(lambda x: x is not None), ops.share() + ) + + # Process object detection results with point cloud filtering (from old main) + def on_detection_next(result): + nonlocal latest_point_cloud_overlay + if "objects" in result and result["objects"]: + # Get latest RGB and depth frames + with frame_lock: + rgb = latest_rgb + depth = latest_depth + + if rgb is not None and depth is not None: + try: + filtered_objects = self.pointcloud_filter.process_images( + rgb, depth, result["objects"] + ) + + if 
filtered_objects: + # Store filtered objects + with self.grasp_lock: + self.latest_filtered_objects = filtered_objects + self.filtered_objects_subject.on_next(filtered_objects) + + # Create base image (colorized depth) + base_image = colorize_depth(depth, max_depth=10.0) + + # Create point cloud overlay visualization + overlay_viz = create_point_cloud_overlay_visualization( + base_image=base_image, + objects=filtered_objects, + intrinsics=self.camera_intrinsics, + ) + + # Store the overlay for the stream + with frame_lock: + latest_point_cloud_overlay = overlay_viz + + # Request grasps if enabled + if self.enable_grasp_generation and len(filtered_objects) > 0: + # Save RGB image for later grasp overlay + with frame_lock: + self.latest_rgb_for_grasps = rgb.copy() + + task = self.request_scene_grasps(filtered_objects) + if task: + # Check for results after a delay + def check_grasps_later(): + time.sleep(2.0) # Wait for grasp processing + # Wait for task to complete + if hasattr(self, "grasp_task") and self.grasp_task: + try: + result = self.grasp_task.result( + timeout=3.0 + ) # Get result with timeout + except Exception as e: + logger.warning(f"Grasp task failed or timeout: {e}") + + # Try to get latest grasps and create overlay + with self.grasp_lock: + grasps = self.latest_grasps + + if grasps and hasattr(self, "latest_rgb_for_grasps"): + # Create grasp overlay on the saved RGB image + try: + bgr_image = cv2.cvtColor( + self.latest_rgb_for_grasps, cv2.COLOR_RGB2BGR + ) + result_bgr = draw_grasps_on_image( + bgr_image, + grasps, + self.camera_intrinsics, + max_grasps=-1, # Show all grasps + ) + result_rgb = cv2.cvtColor( + result_bgr, cv2.COLOR_BGR2RGB + ) + + # Emit grasp overlay immediately + self.grasp_overlay_subject.on_next(result_rgb) + + except Exception as e: + logger.error(f"Error creating grasp overlay: {e}") + + # Emit grasps to stream + self.grasps_subject.on_next(grasps) + + threading.Thread(target=check_grasps_later, daemon=True).start() + else: + 
logger.warning("Failed to create grasp task") + except Exception as e: + logger.error(f"Error in point cloud filtering: {e}") + with frame_lock: + latest_point_cloud_overlay = None + + def on_error(error): + logger.error(f"Error in stream: {error}") + + def on_completed(): + logger.info("Stream completed") + + def start_subscriptions(): + """Start subscriptions in background thread (from old main)""" + # Subscribe to combined ZED frames + zed_frame_stream.subscribe(on_next=on_zed_frame) + + # Start subscriptions in background thread (from old main) + subscription_thread = threading.Thread(target=start_subscriptions, daemon=True) + subscription_thread.start() + time.sleep(2) # Give subscriptions time to start + + # Subscribe to object detection stream (from old main) + object_detector.get_stream().subscribe( + on_next=on_detection_next, on_error=on_error, on_completed=on_completed + ) + + # Create visualization stream for web interface (from old main) + viz_stream = object_detector.get_stream().pipe( + ops.map(lambda x: x["viz_frame"] if x is not None else None), + ops.filter(lambda x: x is not None), + ) + + # Create filtered objects stream + filtered_objects_stream = self.filtered_objects_subject + + # Create grasps stream + grasps_stream = self.grasps_subject + + # Create grasp overlay subject for immediate emission + grasp_overlay_stream = self.grasp_overlay_subject + + return { + "detection_viz": viz_stream, + "pointcloud_viz": depth_stream, + "objects": object_detector.get_stream().pipe(ops.map(lambda x: x.get("objects", []))), + "filtered_objects": filtered_objects_stream, + "grasps": grasps_stream, + "grasp_overlay": grasp_overlay_stream, + } + + def _start_grasp_loop(self): + """Start asyncio event loop in a background thread for WebSocket communication.""" + + def run_loop(): + self.grasp_loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.grasp_loop) + self.grasp_loop.run_forever() + + self.grasp_loop_thread = threading.Thread(target=run_loop, 
daemon=True) + self.grasp_loop_thread.start() + + # Wait for loop to start + while self.grasp_loop is None: + time.sleep(0.01) + + async def _send_grasp_request( + self, points: np.ndarray, colors: Optional[np.ndarray] + ) -> Optional[List[dict]]: + """Send grasp request to Dimensional Grasp server.""" + try: + # Comprehensive client-side validation to prevent server errors + + # Validate points array + if points is None: + logger.error("Points array is None") + return None + if not isinstance(points, np.ndarray): + logger.error(f"Points is not numpy array: {type(points)}") + return None + if points.size == 0: + logger.error("Points array is empty") + return None + if len(points.shape) != 2 or points.shape[1] != 3: + logger.error(f"Points has invalid shape {points.shape}, expected (N, 3)") + return None + if points.shape[0] < 100: # Minimum points for stable grasp detection + logger.error(f"Insufficient points for grasp detection: {points.shape[0]} < 100") + return None + + # Validate and prepare colors + if colors is not None: + if not isinstance(colors, np.ndarray): + colors = None + elif colors.size == 0: + colors = None + elif len(colors.shape) != 2 or colors.shape[1] != 3: + colors = None + elif colors.shape[0] != points.shape[0]: + colors = None + + # If no valid colors, create default colors (required by server) + if colors is None: + # Create default white colors for all points + colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 + + # Ensure data types are correct (server expects float32) + points = points.astype(np.float32) + colors = colors.astype(np.float32) + + # Validate ranges (basic sanity checks) + if np.any(np.isnan(points)) or np.any(np.isinf(points)): + logger.error("Points contain NaN or Inf values") + return None + if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): + logger.error("Colors contain NaN or Inf values") + return None + + # Clamp color values to valid range [0, 1] + colors = np.clip(colors, 0.0, 1.0) + + async with 
websockets.connect(self.grasp_server_url) as websocket: + request = { + "points": points.tolist(), + "colors": colors.tolist(), # Always send colors array + "lims": [-0.19, 0.12, 0.02, 0.15, 0.0, 1.0], # Default workspace limits + } + + await websocket.send(json.dumps(request)) + + response = await websocket.recv() + grasps = json.loads(response) + + # Handle server response validation + if isinstance(grasps, dict) and "error" in grasps: + logger.error(f"Server returned error: {grasps['error']}") + return None + elif isinstance(grasps, (int, float)) and grasps == 0: + return None + elif not isinstance(grasps, list): + logger.error( + f"Server returned unexpected response type: {type(grasps)}, value: {grasps}" + ) + return None + elif len(grasps) == 0: + return None + + converted_grasps = self._convert_grasp_format(grasps) + with self.grasp_lock: + self.latest_grasps = converted_grasps + self.grasps_consumed = False # Reset consumed flag + + # Emit to reactive stream + self.grasps_subject.on_next(self.latest_grasps) + + return converted_grasps + except websockets.exceptions.ConnectionClosed as e: + logger.error(f"WebSocket connection closed: {e}") + except websockets.exceptions.WebSocketException as e: + logger.error(f"WebSocket error: {e}") + except json.JSONDecodeError as e: + logger.error(f"Failed to parse server response as JSON: {e}") + except Exception as e: + logger.error(f"Error requesting grasps: {e}") + + return None + + def request_scene_grasps(self, objects: List[dict]) -> Optional[asyncio.Task]: + """Request grasps for entire scene by combining all object point clouds.""" + if not self.grasp_loop or not objects: + return None + + all_points = [] + all_colors = [] + valid_objects = 0 + + for i, obj in enumerate(objects): + # Validate point cloud data + if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: + continue + + points = obj["point_cloud_numpy"] + if not isinstance(points, np.ndarray) or points.size == 0: + continue + + # Ensure 
points have correct shape (N, 3) + if len(points.shape) != 2 or points.shape[1] != 3: + continue + + # Validate colors if present + colors = None + if "colors_numpy" in obj and obj["colors_numpy"] is not None: + colors = obj["colors_numpy"] + if isinstance(colors, np.ndarray) and colors.size > 0: + # Ensure colors match points count and have correct shape + if colors.shape[0] != points.shape[0]: + colors = None # Ignore colors for this object + elif len(colors.shape) != 2 or colors.shape[1] != 3: + colors = None # Ignore colors for this object + + all_points.append(points) + if colors is not None: + all_colors.append(colors) + valid_objects += 1 + + if not all_points: + return None + + try: + combined_points = np.vstack(all_points) + + # Only combine colors if ALL objects have valid colors + combined_colors = None + if len(all_colors) == valid_objects and len(all_colors) > 0: + combined_colors = np.vstack(all_colors) + + # Validate final combined data + if combined_points.size == 0: + logger.warning("Combined point cloud is empty") + return None + + if combined_colors is not None and combined_colors.shape[0] != combined_points.shape[0]: + logger.warning( + f"Color/point count mismatch: {combined_colors.shape[0]} colors vs {combined_points.shape[0]} points, dropping colors" + ) + combined_colors = None + + except Exception as e: + logger.error(f"Failed to combine point clouds: {e}") + return None + + try: + # Check if there's already a grasp task running + if hasattr(self, "grasp_task") and self.grasp_task and not self.grasp_task.done(): + return self.grasp_task + + task = asyncio.run_coroutine_threadsafe( + self._send_grasp_request(combined_points, combined_colors), self.grasp_loop + ) + + self.grasp_task = task + return task + except Exception as e: + logger.warning("Failed to create grasp task") + return None + + def get_latest_grasps(self, timeout: float = 5.0) -> Optional[List[dict]]: + """Get latest grasp results, waiting for new ones if current ones have been 
consumed.""" + # Mark current grasps as consumed and get a reference + with self.grasp_lock: + current_grasps = self.latest_grasps + self.grasps_consumed = True + + # If we already have grasps and they haven't been consumed, return them + if current_grasps is not None and not getattr(self, "grasps_consumed", False): + return current_grasps + + # Wait for new grasps + start_time = time.time() + while time.time() - start_time < timeout: + with self.grasp_lock: + # Check if we have new grasps (different from what we marked as consumed) + if self.latest_grasps is not None and not getattr(self, "grasps_consumed", False): + return self.latest_grasps + time.sleep(0.1) # Check every 100ms + + return None # Timeout reached + + def clear_grasps(self) -> None: + """Clear all stored grasp results.""" + with self.grasp_lock: + self.latest_grasps = [] + + def _prepare_colors(self, colors: Optional[np.ndarray]) -> Optional[np.ndarray]: + """Prepare colors array, converting from various formats if needed.""" + if colors is None: + return None + + if colors.max() > 1.0: + colors = colors / 255.0 + + return colors + + def _convert_grasp_format(self, grasps: List[dict]) -> List[dict]: + """Convert Grasp format to our visualization format.""" + converted = [] + + for i, grasp in enumerate(grasps): + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) + euler_angles = self._rotation_matrix_to_euler(rotation_matrix) + + converted_grasp = { + "id": f"grasp_{i}", + "score": grasp.get("score", 0.0), + "width": grasp.get("width", 0.0), + "height": grasp.get("height", 0.0), + "depth": grasp.get("depth", 0.0), + "translation": grasp.get("translation", [0, 0, 0]), + "rotation_matrix": rotation_matrix.tolist(), + "euler_angles": euler_angles, + } + converted.append(converted_grasp) + + converted.sort(key=lambda x: x["score"], reverse=True) + + return converted + + def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> Dict[str, float]: + """Convert rotation matrix 
to Euler angles (in radians).""" + sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) + + singular = sy < 1e-6 + + if not singular: + x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) + else: + x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = 0 + + return {"roll": x, "pitch": y, "yaw": z} + + def cleanup(self): + """Clean up resources.""" + if hasattr(self.detector, "cleanup"): + self.detector.cleanup() + + if self.grasp_loop and self.grasp_loop_thread: + self.grasp_loop.call_soon_threadsafe(self.grasp_loop.stop) + self.grasp_loop_thread.join(timeout=1.0) + + if hasattr(self.pointcloud_filter, "cleanup"): + self.pointcloud_filter.cleanup() + logger.info("ManipulationPipeline cleaned up") diff --git a/dimos/perception/manip_aio_processer.py b/dimos/perception/manip_aio_processer.py new file mode 100644 index 0000000000..2d80b527ce --- /dev/null +++ b/dimos/perception/manip_aio_processer.py @@ -0,0 +1,411 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Sequential manipulation processor for single-frame processing without reactive streams. 
+""" + +import logging +import time +from typing import Dict, List, Optional, Any, Tuple +import numpy as np +import cv2 + +from dimos.utils.logging_config import setup_logger +from dimos.perception.detection2d.detic_2d_det import Detic2DDetector +from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering +from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter +from dimos.perception.grasp_generation.grasp_generation import HostedGraspGenerator +from dimos.perception.grasp_generation.utils import create_grasp_overlay +from dimos.perception.pointcloud.utils import ( + create_point_cloud_overlay_visualization, + extract_and_cluster_misc_points, + overlay_point_clouds_on_image, +) +from dimos.perception.common.utils import ( + colorize_depth, + detection_results_to_object_data, + combine_object_data, +) + +logger = setup_logger("dimos.perception.manip_aio_processor") + + +class ManipulationProcessor: + """ + Sequential manipulation processor for single-frame processing. + + Processes RGB-D frames through object detection, point cloud filtering, + and grasp generation in a single thread without reactive streams. + """ + + def __init__( + self, + camera_intrinsics: List[float], # [fx, fy, cx, cy] + min_confidence: float = 0.6, + max_objects: int = 20, + vocabulary: Optional[str] = None, + enable_grasp_generation: bool = False, + grasp_server_url: Optional[str] = None, # Required when enable_grasp_generation=True + enable_segmentation: bool = True, + ): + """ + Initialize the manipulation processor. 
+ + Args: + camera_intrinsics: [fx, fy, cx, cy] camera parameters + min_confidence: Minimum detection confidence threshold + max_objects: Maximum number of objects to process + vocabulary: Optional vocabulary for Detic detector + enable_grasp_generation: Whether to enable grasp generation + grasp_server_url: WebSocket URL for Dimensional Grasp server (required when enable_grasp_generation=True) + enable_segmentation: Whether to enable semantic segmentation + segmentation_model: Segmentation model to use (SAM 2 or FastSAM) + """ + self.camera_intrinsics = camera_intrinsics + self.min_confidence = min_confidence + self.max_objects = max_objects + self.enable_grasp_generation = enable_grasp_generation + self.grasp_server_url = grasp_server_url + self.enable_segmentation = enable_segmentation + + # Validate grasp generation requirements + if enable_grasp_generation and not grasp_server_url: + raise ValueError("grasp_server_url is required when enable_grasp_generation=True") + + # Initialize object detector + self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) + + # Initialize point cloud processor + self.pointcloud_filter = PointcloudFiltering( + color_intrinsics=camera_intrinsics, + depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics + max_num_objects=max_objects, + ) + + # Initialize semantic segmentation + self.segmenter = None + if self.enable_segmentation: + self.segmenter = Sam2DSegmenter( + device="cuda", + use_tracker=False, # Disable tracker for simple segmentation + use_analyzer=False, # Disable analyzer for simple segmentation + ) + + # Initialize grasp generator if enabled + self.grasp_generator = None + if self.enable_grasp_generation: + try: + self.grasp_generator = HostedGraspGenerator(server_url=grasp_server_url) + logger.info("Hosted grasp generator initialized successfully") + except Exception as e: + logger.error(f"Failed to initialize hosted grasp generator: {e}") + self.grasp_generator = None + 
self.enable_grasp_generation = False + + logger.info( + f"Initialized ManipulationProcessor with confidence={min_confidence}, " + f"grasp_generation={enable_grasp_generation}" + ) + + def process_frame( + self, rgb_image: np.ndarray, depth_image: np.ndarray, generate_grasps: bool = None + ) -> Dict[str, Any]: + """ + Process a single RGB-D frame through the complete pipeline. + + Args: + rgb_image: RGB image (H, W, 3) + depth_image: Depth image (H, W) in meters + generate_grasps: Override grasp generation setting for this frame + + Returns: + Dictionary containing: + - detection_viz: Visualization of object detection + - pointcloud_viz: Visualization of point cloud overlay + - segmentation_viz: Visualization of semantic segmentation (if enabled) + - detection2d_objects: Raw detection results as ObjectData + - segmentation2d_objects: Raw segmentation results as ObjectData (if enabled) + - detected_objects: Detection (Object Detection) objects with point clouds filtered + - all_objects: Combined objects with intelligent duplicate removal + - full_pointcloud: Complete scene point cloud (if point cloud processing enabled) + - misc_clusters: List of clustered background/miscellaneous point clouds (DBSCAN) + - misc_voxel_grid: Open3D voxel grid approximating all misc/background points + - misc_pointcloud_viz: Visualization of misc/background cluster overlay + - grasps: Grasp results (list of dictionaries, if enabled) + - grasp_overlay: Grasp visualization overlay (if enabled) + - processing_time: Total processing time + """ + start_time = time.time() + results = {} + + try: + # Step 1: Object Detection + step_start = time.time() + detection_results = self.run_object_detection(rgb_image) + results["detection2d_objects"] = detection_results.get("objects", []) + results["detection_viz"] = detection_results.get("viz_frame") + detection_time = time.time() - step_start + + # Step 2: Semantic Segmentation (if enabled) + segmentation_time = 0 + if self.enable_segmentation: + 
step_start = time.time() + segmentation_results = self.run_segmentation(rgb_image) + results["segmentation2d_objects"] = segmentation_results.get("objects", []) + results["segmentation_viz"] = segmentation_results.get("viz_frame") + segmentation_time = time.time() - step_start + + # Step 3: Point Cloud Processing + pointcloud_time = 0 + detection2d_objects = results.get("detection2d_objects", []) + segmentation2d_objects = results.get("segmentation2d_objects", []) + + # Process detection objects if available + detected_objects = [] + if detection2d_objects: + step_start = time.time() + detected_objects = self.run_pointcloud_filtering( + rgb_image, depth_image, detection2d_objects + ) + pointcloud_time += time.time() - step_start + + # Process segmentation objects if available + segmentation_filtered_objects = [] + if segmentation2d_objects: + step_start = time.time() + segmentation_filtered_objects = self.run_pointcloud_filtering( + rgb_image, depth_image, segmentation2d_objects + ) + pointcloud_time += time.time() - step_start + + # Combine all objects using intelligent duplicate removal + all_objects = combine_object_data( + detected_objects, segmentation_filtered_objects, overlap_threshold=0.8 + ) + + # Get full point cloud + full_pcd = self.pointcloud_filter.get_full_point_cloud() + + # Extract misc/background points and create voxel grid + misc_start = time.time() + misc_clusters, misc_voxel_grid = extract_and_cluster_misc_points( + full_pcd, + all_objects, + eps=0.03, + min_points=100, + enable_filtering=True, + voxel_size=0.02, + ) + misc_time = time.time() - misc_start + + # Store results + results.update( + { + "detected_objects": detected_objects, + "all_objects": all_objects, + "full_pointcloud": full_pcd, + "misc_clusters": misc_clusters, + "misc_voxel_grid": misc_voxel_grid, + } + ) + + # Create point cloud visualizations + base_image = colorize_depth(depth_image, max_depth=10.0) + + # Create visualizations + results["pointcloud_viz"] = ( + 
create_point_cloud_overlay_visualization( + base_image=base_image, + objects=all_objects, + intrinsics=self.camera_intrinsics, + ) + if all_objects + else base_image + ) + + results["detected_pointcloud_viz"] = ( + create_point_cloud_overlay_visualization( + base_image=base_image, + objects=detected_objects, + intrinsics=self.camera_intrinsics, + ) + if detected_objects + else base_image + ) + + if misc_clusters: + # Generate consistent colors for clusters + cluster_colors = [ + tuple((np.random.RandomState(i + 100).rand(3) * 255).astype(int)) + for i in range(len(misc_clusters)) + ] + results["misc_pointcloud_viz"] = overlay_point_clouds_on_image( + base_image=base_image, + point_clouds=misc_clusters, + camera_intrinsics=self.camera_intrinsics, + colors=cluster_colors, + point_size=2, + alpha=0.6, + ) + else: + results["misc_pointcloud_viz"] = base_image + + # Step 4: Grasp Generation (if enabled) + should_generate_grasps = ( + generate_grasps if generate_grasps is not None else self.enable_grasp_generation + ) + + if should_generate_grasps and all_objects and full_pcd: + grasps = self.run_grasp_generation(all_objects, full_pcd) + results["grasps"] = grasps + if grasps: + results["grasp_overlay"] = create_grasp_overlay( + rgb_image, grasps, self.camera_intrinsics + ) + + except Exception as e: + logger.error(f"Error processing frame: {e}") + results["error"] = str(e) + + # Add timing information + total_time = time.time() - start_time + results.update( + { + "processing_time": total_time, + "timing_breakdown": { + "detection": detection_time if "detection_time" in locals() else 0, + "segmentation": segmentation_time if "segmentation_time" in locals() else 0, + "pointcloud": pointcloud_time if "pointcloud_time" in locals() else 0, + "misc_extraction": misc_time if "misc_time" in locals() else 0, + "total": total_time, + }, + } + ) + + return results + + def run_object_detection(self, rgb_image: np.ndarray) -> Dict[str, Any]: + """Run object detection on RGB 
image.""" + try: + # Convert RGB to BGR for Detic detector + bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + # Use process_image method from Detic detector + bboxes, track_ids, class_ids, confidences, names, masks = self.detector.process_image( + bgr_image + ) + + # Convert to ObjectData format using utility function + objects = detection_results_to_object_data( + bboxes=bboxes, + track_ids=track_ids, + class_ids=class_ids, + confidences=confidences, + names=names, + masks=masks, + source="detection", + ) + + # Create visualization using detector's built-in method + viz_frame = self.detector.visualize_results( + rgb_image, bboxes, track_ids, class_ids, confidences, names + ) + + return {"objects": objects, "viz_frame": viz_frame} + + except Exception as e: + logger.error(f"Object detection failed: {e}") + return {"objects": [], "viz_frame": rgb_image.copy()} + + def run_pointcloud_filtering( + self, rgb_image: np.ndarray, depth_image: np.ndarray, objects: List[Dict] + ) -> List[Dict]: + """Run point cloud filtering on detected objects.""" + try: + filtered_objects = self.pointcloud_filter.process_images( + rgb_image, depth_image, objects + ) + return filtered_objects if filtered_objects else [] + except Exception as e: + logger.error(f"Point cloud filtering failed: {e}") + return [] + + def run_segmentation(self, rgb_image: np.ndarray) -> Dict[str, Any]: + """Run semantic segmentation on RGB image.""" + if not self.segmenter: + return {"objects": [], "viz_frame": rgb_image.copy()} + + try: + # Convert RGB to BGR for segmenter + bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + # Get segmentation results + masks, bboxes, track_ids, probs, names = self.segmenter.process_image(bgr_image) + + # Convert to ObjectData format using utility function + objects = detection_results_to_object_data( + bboxes=bboxes, + track_ids=track_ids, + class_ids=list(range(len(bboxes))), # Use indices as class IDs for segmentation + confidences=probs, + names=names, + 
masks=masks, + source="segmentation", + ) + + # Create visualization + if masks: + viz_bgr = self.segmenter.visualize_results( + bgr_image, masks, bboxes, track_ids, probs, names + ) + # Convert back to RGB + viz_frame = cv2.cvtColor(viz_bgr, cv2.COLOR_BGR2RGB) + else: + viz_frame = rgb_image.copy() + + return {"objects": objects, "viz_frame": viz_frame} + + except Exception as e: + logger.error(f"Segmentation failed: {e}") + return {"objects": [], "viz_frame": rgb_image.copy()} + + def run_grasp_generation(self, filtered_objects: List[Dict], full_pcd) -> Optional[List[Dict]]: + """Run grasp generation using the configured generator.""" + if not self.grasp_generator: + logger.warning("Grasp generation requested but no generator available") + return None + + try: + # Generate grasps using the configured generator + grasps = self.grasp_generator.generate_grasps_from_objects(filtered_objects, full_pcd) + + # Return parsed results directly (list of grasp dictionaries) + return grasps + + except Exception as e: + logger.error(f"Grasp generation failed: {e}") + return None + + def cleanup(self): + """Clean up resources.""" + if hasattr(self.detector, "cleanup"): + self.detector.cleanup() + if hasattr(self.pointcloud_filter, "cleanup"): + self.pointcloud_filter.cleanup() + if self.segmenter and hasattr(self.segmenter, "cleanup"): + self.segmenter.cleanup() + if self.grasp_generator and hasattr(self.grasp_generator, "cleanup"): + self.grasp_generator.cleanup() + logger.info("ManipulationProcessor cleaned up") diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py index 3284d99f8b..4fb8fc2691 100644 --- a/dimos/perception/object_detection_stream.py +++ b/dimos/perception/object_detection_stream.py @@ -33,6 +33,7 @@ calculate_object_size_from_bbox, calculate_position_rotation_from_bbox, ) +from dimos.perception.common.utils import draw_object_detection_visualization from dimos.types.vector import Vector from typing import 
Optional, Union, Callable from dimos.types.manipulation import ObjectData @@ -234,68 +235,10 @@ def process_frame(frame): objects.append(object_data) - # Add visualization - x1, y1, x2, y2 = map(int, bbox) - color = (0, 255, 0) # Green for detected objects - mask_color = (0, 200, 200) # Yellow-green for masks - - # Draw segmentation mask if available and valid - try: - if self.draw_masks and object_data["segmentation_mask"] is not None: - # Create a colored mask overlay - mask = object_data["segmentation_mask"].astype(np.uint8) - colored_mask = np.zeros_like(viz_frame) - colored_mask[mask > 0] = mask_color - - # Apply the mask with transparency - alpha = 0.5 # transparency factor - mask_area = mask > 0 - viz_frame[mask_area] = cv2.addWeighted( - viz_frame[mask_area], 1 - alpha, colored_mask[mask_area], alpha, 0 - ) - - # Draw mask contour - contours, _ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE - ) - cv2.drawContours(viz_frame, contours, -1, mask_color, 2) - except Exception as e: - logger.warning(f"Error drawing segmentation mask: {e}") - - # Draw bounding box with metadata - try: - cv2.rectangle(viz_frame, (x1, y1), (x2, y2), color, 1) - - # Add text for class only (removed position data) - # Handle possible None values for class_name or track_ids[i] - class_text = class_name if class_name is not None else "Unknown" - id_text = ( - track_ids[i] if i < len(track_ids) and track_ids[i] is not None else "?" 
- ) - text = f"{class_text}, ID: {id_text}" - - # Draw text background with smaller font - text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.3, 1)[0] - cv2.rectangle( - viz_frame, - (x1, y1 - text_size[1] - 5), - (x1 + text_size[0], y1), - (0, 0, 0), - -1, - ) - - # Draw text with smaller font - cv2.putText( - viz_frame, - text, - (x1, y1 - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.3, - (255, 255, 255), - 1, - ) - except Exception as e: - logger.warning(f"Error drawing bounding box or text: {e}") + # Create visualization using common function + viz_frame = draw_object_detection_visualization( + viz_frame, objects, draw_masks=self.draw_masks, font_scale=1.5 + ) return {"frame": frame, "viz_frame": viz_frame, "objects": objects} diff --git a/dimos/perception/pointcloud/__init__.py b/dimos/perception/pointcloud/__init__.py new file mode 100644 index 0000000000..1f282bb738 --- /dev/null +++ b/dimos/perception/pointcloud/__init__.py @@ -0,0 +1,3 @@ +from .utils import * +from .cuboid_fit import * +from .pointcloud_filtering import * diff --git a/dimos/perception/pointcloud/cuboid_fit.py b/dimos/perception/pointcloud/cuboid_fit.py new file mode 100644 index 0000000000..d567f40395 --- /dev/null +++ b/dimos/perception/pointcloud/cuboid_fit.py @@ -0,0 +1,414 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def fit_cuboid(
    points: Union[np.ndarray, o3d.geometry.PointCloud], method: str = "minimal"
) -> Optional[Dict]:
    """
    Fit a cuboid to a point cloud using Open3D's built-in bounding boxes.

    Args:
        points: Nx3 array of points or Open3D PointCloud.
        method: Fitting method:
            - 'minimal': Minimal oriented bounding box (best fit)
            - 'oriented': PCA-based oriented bounding box
            - 'axis_aligned': Axis-aligned bounding box

    Returns:
        Dictionary with keys ``center``, ``dimensions``, ``rotation``,
        ``error``, ``bounding_box`` (Open3D OrientedBoundingBox) and
        ``method``, or None if there are fewer than 4 points or Open3D
        fitting fails.

    Raises:
        ValueError: If method is invalid or inputs are malformed.
    """
    valid_methods = ["minimal", "oriented", "axis_aligned"]
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}, got '{method}'")

    # Normalize input to both an Nx3 array and an Open3D point cloud.
    if isinstance(points, np.ndarray):
        points = np.asarray(points)
        if len(points.shape) != 2 or points.shape[1] != 3:
            raise ValueError(f"points array must be Nx3, got shape {points.shape}")
        if len(points) < 4:
            return None  # a cuboid needs at least 4 non-degenerate points
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(points)
    elif isinstance(points, o3d.geometry.PointCloud):
        pcd = points
        points = np.asarray(pcd.points)
        if len(points) < 4:
            return None
    else:
        raise ValueError(f"points must be numpy array or Open3D PointCloud, got {type(points)}")

    try:
        if method == "minimal":
            obb = pcd.get_minimal_oriented_bounding_box(robust=True)
        elif method == "oriented":
            obb = pcd.get_oriented_bounding_box(robust=True)
        else:  # axis_aligned: wrap the AABB as an OBB for a uniform return type
            aabb = pcd.get_axis_aligned_bounding_box()
            obb = o3d.geometry.OrientedBoundingBox()
            obb.center = aabb.get_center()
            obb.extent = aabb.get_extent()
            obb.R = np.eye(3)  # identity rotation for axis-aligned

        center = np.asarray(obb.center)
        dimensions = np.asarray(obb.extent)
        rotation = np.asarray(obb.R)
        error = _compute_fitting_error(points, center, dimensions, rotation)

        return {
            "center": center,
            "dimensions": dimensions,
            "rotation": rotation,
            "error": error,
            "bounding_box": obb,
            "method": method,
        }

    except Exception as e:
        # Log error but don't crash - return None for graceful handling.
        print(f"Warning: Cuboid fitting failed with method '{method}': {e}")
        return None


def fit_cuboid_simple(points: Union[np.ndarray, o3d.geometry.PointCloud]) -> Optional[Dict]:
    """Convenience wrapper: fit with the minimal oriented bounding box.

    Returns the same dictionary as :func:`fit_cuboid`, or None on failure.
    """
    return fit_cuboid(points, method="minimal")


def _compute_fitting_error(
    points: np.ndarray, center: np.ndarray, dimensions: np.ndarray, rotation: np.ndarray
) -> float:
    """
    Compute fitting error as mean squared distance from points to the cuboid surface.

    Args:
        points: Nx3 array of points.
        center: 3D center point.
        dimensions: 3D dimensions (full extents).
        rotation: 3x3 rotation matrix (columns are the box's local axes).

    Returns:
        Mean squared point-to-surface distance (0.0 for no points).
    """
    if len(points) == 0:
        return 0.0

    # World -> local box frame: R^T (p - c), written as row-vector matmul.
    local_points = (points - center) @ rotation
    half_dims = dimensions / 2

    # Per-axis signed excess beyond each face (negative while inside).
    dx = np.abs(local_points[:, 0]) - half_dims[0]
    dy = np.abs(local_points[:, 1]) - half_dims[1]
    dz = np.abs(local_points[:, 2]) - half_dims[2]

    # Outside points: Euclidean distance to the nearest face/edge/corner.
    outside_dist = np.sqrt(np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2)

    # Bug fix: an interior point's distance to the surface is the gap to its
    # *nearest* face, i.e. -max(dx, dy, dz). The old code used -min(...),
    # which measured the farthest face and overstated the error for any
    # off-center interior point.
    inside_dist = -np.maximum(np.maximum(dx, dy), dz)
    distances = np.where((dx > 0) | (dy > 0) | (dz > 0), outside_dist, inside_dist)

    return float(np.mean(distances**2))
+ + Args: + center: 3D center point + dimensions: 3D dimensions + rotation: 3x3 rotation matrix + + Returns: + 8x3 array of corner coordinates + """ + half_dims = dimensions / 2 + corners_local = ( + np.array( + [ + [-1, -1, -1], # 0: left bottom back + [-1, -1, 1], # 1: left bottom front + [-1, 1, -1], # 2: left top back + [-1, 1, 1], # 3: left top front + [1, -1, -1], # 4: right bottom back + [1, -1, 1], # 5: right bottom front + [1, 1, -1], # 6: right top back + [1, 1, 1], # 7: right top front + ] + ) + * half_dims + ) + + # Apply rotation and translation + return corners_local @ rotation.T + center + + +def visualize_cuboid_on_image( + image: np.ndarray, + cuboid_params: Dict, + camera_matrix: np.ndarray, + extrinsic_rotation: Optional[np.ndarray] = None, + extrinsic_translation: Optional[np.ndarray] = None, + color: Tuple[int, int, int] = (0, 255, 0), + thickness: int = 2, + show_dimensions: bool = True, +) -> np.ndarray: + """ + Draw a fitted cuboid on an image using camera projection. 
+ + Args: + image: Input image to draw on + cuboid_params: Dictionary containing cuboid parameters + camera_matrix: Camera intrinsic matrix (3x3) + extrinsic_rotation: Optional external rotation (3x3) + extrinsic_translation: Optional external translation (3x1) + color: Line color as (B, G, R) tuple + thickness: Line thickness + show_dimensions: Whether to display dimension text + + Returns: + Image with cuboid visualization + + Raises: + ValueError: If required parameters are missing or invalid + """ + # Validate inputs + required_keys = ["center", "dimensions", "rotation"] + if not all(key in cuboid_params for key in required_keys): + raise ValueError(f"cuboid_params must contain keys: {required_keys}") + + if camera_matrix.shape != (3, 3): + raise ValueError(f"camera_matrix must be 3x3, got {camera_matrix.shape}") + + # Get corners in world coordinates + corners = get_cuboid_corners( + cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] + ) + + # Transform corners if extrinsic parameters are provided + if extrinsic_rotation is not None and extrinsic_translation is not None: + if extrinsic_rotation.shape != (3, 3): + raise ValueError(f"extrinsic_rotation must be 3x3, got {extrinsic_rotation.shape}") + if extrinsic_translation.shape not in [(3,), (3, 1)]: + raise ValueError( + f"extrinsic_translation must be (3,) or (3,1), got {extrinsic_translation.shape}" + ) + + extrinsic_translation = extrinsic_translation.flatten() + corners = (extrinsic_rotation @ corners.T).T + extrinsic_translation + + try: + # Project 3D corners to image coordinates + corners_img, _ = cv2.projectPoints( + corners.astype(np.float32), + np.zeros(3), + np.zeros(3), # No additional rotation/translation + camera_matrix.astype(np.float32), + None, # No distortion + ) + corners_img = corners_img.reshape(-1, 2).astype(int) + + # Check if corners are within image bounds + h, w = image.shape[:2] + valid_corners = ( + (corners_img[:, 0] >= 0) + & (corners_img[:, 0] < w) + 
& (corners_img[:, 1] >= 0) + & (corners_img[:, 1] < h) + ) + + if not np.any(valid_corners): + print("Warning: All cuboid corners are outside image bounds") + return image.copy() + + except Exception as e: + print(f"Warning: Failed to project cuboid corners: {e}") + return image.copy() + + # Define edges for wireframe visualization + edges = [ + # Bottom face + (0, 1), + (1, 5), + (5, 4), + (4, 0), + # Top face + (2, 3), + (3, 7), + (7, 6), + (6, 2), + # Vertical edges + (0, 2), + (1, 3), + (5, 7), + (4, 6), + ] + + # Draw edges + vis_img = image.copy() + for i, j in edges: + # Only draw edge if both corners are valid + if valid_corners[i] and valid_corners[j]: + cv2.line(vis_img, tuple(corners_img[i]), tuple(corners_img[j]), color, thickness) + + # Add dimension text if requested + if show_dimensions and np.any(valid_corners): + dims = cuboid_params["dimensions"] + dim_text = f"Dims: {dims[0]:.3f} x {dims[1]:.3f} x {dims[2]:.3f}" + + # Find a good position for text (top-left of image) + text_pos = (10, 30) + font_scale = 0.7 + + # Add background rectangle for better readability + text_size = cv2.getTextSize(dim_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 2)[0] + cv2.rectangle( + vis_img, + (text_pos[0] - 5, text_pos[1] - text_size[1] - 5), + (text_pos[0] + text_size[0] + 5, text_pos[1] + 5), + (0, 0, 0), + -1, + ) + + cv2.putText(vis_img, dim_text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, 2) + + return vis_img + + +def compute_cuboid_volume(cuboid_params: Dict) -> float: + """ + Compute the volume of a cuboid. + + Args: + cuboid_params: Dictionary containing cuboid parameters + + Returns: + Volume in cubic units + """ + if "dimensions" not in cuboid_params: + raise ValueError("cuboid_params must contain 'dimensions' key") + + dims = cuboid_params["dimensions"] + return float(np.prod(dims)) + + +def compute_cuboid_surface_area(cuboid_params: Dict) -> float: + """ + Compute the surface area of a cuboid. 
+ + Args: + cuboid_params: Dictionary containing cuboid parameters + + Returns: + Surface area in square units + """ + if "dimensions" not in cuboid_params: + raise ValueError("cuboid_params must contain 'dimensions' key") + + dims = cuboid_params["dimensions"] + return 2.0 * (dims[0] * dims[1] + dims[1] * dims[2] + dims[2] * dims[0]) + + +def check_cuboid_quality(cuboid_params: Dict, points: np.ndarray) -> Dict: + """ + Assess the quality of a cuboid fit. + + Args: + cuboid_params: Dictionary containing cuboid parameters + points: Original points used for fitting + + Returns: + Dictionary with quality metrics + """ + if len(points) == 0: + return {"error": "No points provided"} + + # Basic metrics + volume = compute_cuboid_volume(cuboid_params) + surface_area = compute_cuboid_surface_area(cuboid_params) + error = cuboid_params.get("error", 0.0) + + # Aspect ratio analysis + dims = cuboid_params["dimensions"] + aspect_ratios = [ + dims[0] / dims[1] if dims[1] > 0 else float("inf"), + dims[1] / dims[2] if dims[2] > 0 else float("inf"), + dims[2] / dims[0] if dims[0] > 0 else float("inf"), + ] + max_aspect_ratio = max(aspect_ratios) + + # Volume ratio (cuboid volume vs convex hull volume) + try: + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points) + hull, _ = pcd.compute_convex_hull() + hull_volume = hull.get_volume() + volume_ratio = volume / hull_volume if hull_volume > 0 else float("inf") + except: + volume_ratio = None + + return { + "fitting_error": error, + "volume": volume, + "surface_area": surface_area, + "max_aspect_ratio": max_aspect_ratio, + "volume_ratio": volume_ratio, + "num_points": len(points), + "method": cuboid_params.get("method", "unknown"), + } + + +# Backward compatibility +def visualize_fit(image, cuboid_params, camera_matrix, R=None, t=None): + """ + Legacy function for backward compatibility. + Use visualize_cuboid_on_image instead. 
+ """ + return visualize_cuboid_on_image( + image, cuboid_params, camera_matrix, R, t, show_dimensions=True + ) diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py new file mode 100644 index 0000000000..64fd8953a6 --- /dev/null +++ b/dimos/perception/pointcloud/pointcloud_filtering.py @@ -0,0 +1,674 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import cv2 +import os +import torch +import open3d as o3d +import argparse +import pickle +from typing import Dict, List, Optional, Union +import time +from dimos.types.manipulation import ObjectData +from dimos.types.vector import Vector +from dimos.perception.pointcloud.utils import ( + load_camera_matrix_from_yaml, + create_point_cloud_and_extract_masks, + o3d_point_cloud_to_numpy, +) +from dimos.perception.pointcloud.cuboid_fit import fit_cuboid + + +class PointcloudFiltering: + """ + A production-ready point cloud filtering pipeline for segmented objects. + + This class takes segmentation results and produces clean, filtered point clouds + for each object with consistent coloring and optional outlier removal. 
+ """ + + def __init__( + self, + color_intrinsics: Optional[Union[str, List[float], np.ndarray]] = None, + depth_intrinsics: Optional[Union[str, List[float], np.ndarray]] = None, + color_weight: float = 0.3, + enable_statistical_filtering: bool = True, + statistical_neighbors: int = 20, + statistical_std_ratio: float = 1.5, + enable_radius_filtering: bool = True, + radius_filtering_radius: float = 0.015, + radius_filtering_min_neighbors: int = 25, + enable_subsampling: bool = True, + voxel_size: float = 0.005, + max_num_objects: int = 10, + min_points_for_cuboid: int = 10, + cuboid_method: str = "oriented", + max_bbox_size_percent: float = 30.0, + ): + """ + Initialize the point cloud filtering pipeline. + + Args: + color_intrinsics: Camera intrinsics for color image + depth_intrinsics: Camera intrinsics for depth image + color_weight: Weight for blending generated color with original (0.0-1.0) + enable_statistical_filtering: Enable/disable statistical outlier filtering + statistical_neighbors: Number of neighbors for statistical filtering + statistical_std_ratio: Standard deviation ratio for statistical filtering + enable_radius_filtering: Enable/disable radius outlier filtering + radius_filtering_radius: Search radius for radius filtering (meters) + radius_filtering_min_neighbors: Min neighbors within radius + enable_subsampling: Enable/disable point cloud subsampling + voxel_size: Voxel size for downsampling (meters, when subsampling enabled) + max_num_objects: Maximum number of objects to process (top N by confidence) + min_points_for_cuboid: Minimum points required for cuboid fitting + cuboid_method: Method for cuboid fitting ('minimal', 'oriented', 'axis_aligned') + max_bbox_size_percent: Maximum percentage of image size for object bboxes (0-100) + + Raises: + ValueError: If invalid parameters are provided + """ + # Validate parameters + if not 0.0 <= color_weight <= 1.0: + raise ValueError(f"color_weight must be between 0.0 and 1.0, got {color_weight}") + 
if not 0.0 <= max_bbox_size_percent <= 100.0: + raise ValueError( + f"max_bbox_size_percent must be between 0.0 and 100.0, got {max_bbox_size_percent}" + ) + + # Store settings + self.color_weight = color_weight + self.enable_statistical_filtering = enable_statistical_filtering + self.statistical_neighbors = statistical_neighbors + self.statistical_std_ratio = statistical_std_ratio + self.enable_radius_filtering = enable_radius_filtering + self.radius_filtering_radius = radius_filtering_radius + self.radius_filtering_min_neighbors = radius_filtering_min_neighbors + self.enable_subsampling = enable_subsampling + self.voxel_size = voxel_size + self.max_num_objects = max_num_objects + self.min_points_for_cuboid = min_points_for_cuboid + self.cuboid_method = cuboid_method + self.max_bbox_size_percent = max_bbox_size_percent + + # Load camera matrices + self.color_camera_matrix = load_camera_matrix_from_yaml(color_intrinsics) + self.depth_camera_matrix = load_camera_matrix_from_yaml(depth_intrinsics) + + # Store the full point cloud + self.full_pcd = None + + def generate_color_from_id(self, object_id: int) -> np.ndarray: + """Generate a consistent color for a given object ID.""" + np.random.seed(object_id) + color = np.random.randint(0, 255, 3, dtype=np.uint8) + np.random.seed(None) + return color + + def _validate_inputs( + self, color_img: np.ndarray, depth_img: np.ndarray, objects: List[ObjectData] + ): + """Validate input parameters.""" + if color_img.shape[:2] != depth_img.shape: + raise ValueError("Color and depth image dimensions don't match") + + def _prepare_masks(self, masks: List[np.ndarray], target_shape: tuple) -> List[np.ndarray]: + """Prepare and validate masks to match target shape.""" + processed_masks = [] + for mask in masks: + # Convert mask to numpy if it's a tensor + if hasattr(mask, "cpu"): + mask = mask.cpu().numpy() + + mask = mask.astype(bool) + + # Handle shape mismatches + if mask.shape != target_shape: + if len(mask.shape) > 2: + mask = 
mask[:, :, 0] + + if mask.shape != target_shape: + mask = cv2.resize( + mask.astype(np.uint8), + (target_shape[1], target_shape[0]), + interpolation=cv2.INTER_NEAREST, + ).astype(bool) + + processed_masks.append(mask) + + return processed_masks + + def _apply_color_mask( + self, pcd: o3d.geometry.PointCloud, rgb_color: np.ndarray + ) -> o3d.geometry.PointCloud: + """Apply weighted color mask to point cloud.""" + if len(np.asarray(pcd.colors)) > 0: + original_colors = np.asarray(pcd.colors) + generated_color = rgb_color.astype(np.float32) / 255.0 + colored_mask = ( + 1.0 - self.color_weight + ) * original_colors + self.color_weight * generated_color + colored_mask = np.clip(colored_mask, 0.0, 1.0) + pcd.colors = o3d.utility.Vector3dVector(colored_mask) + return pcd + + def _apply_filtering(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: + """Apply optional filtering to point cloud based on enabled flags.""" + current_pcd = pcd + + # Apply statistical filtering if enabled + if self.enable_statistical_filtering: + current_pcd, _ = current_pcd.remove_statistical_outlier( + nb_neighbors=self.statistical_neighbors, std_ratio=self.statistical_std_ratio + ) + + # Apply radius filtering if enabled + if self.enable_radius_filtering: + current_pcd, _ = current_pcd.remove_radius_outlier( + nb_points=self.radius_filtering_min_neighbors, radius=self.radius_filtering_radius + ) + + return current_pcd + + def _apply_subsampling(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: + """Apply subsampling to limit point cloud size using Open3D's voxel downsampling.""" + if self.enable_subsampling: + return pcd.voxel_down_sample(self.voxel_size) + return pcd + + def _extract_masks_from_objects(self, objects: List[ObjectData]) -> List[np.ndarray]: + """Extract segmentation masks from ObjectData objects.""" + return [obj["segmentation_mask"] for obj in objects] + + def get_full_point_cloud(self) -> o3d.geometry.PointCloud: + """Get the full point cloud.""" + 
return self._apply_subsampling(self.full_pcd) + + def process_images( + self, color_img: np.ndarray, depth_img: np.ndarray, objects: List[ObjectData] + ) -> List[ObjectData]: + """ + Process color and depth images with object detection results to create filtered point clouds. + + Args: + color_img: RGB image as numpy array (H, W, 3) + depth_img: Depth image as numpy array (H, W) in meters + objects: List of ObjectData from object detection stream + + Returns: + List of updated ObjectData with pointcloud and 3D information. Each ObjectData + dictionary is enhanced with the following new fields: + + **3D Spatial Information** (added when sufficient points for cuboid fitting): + - "position": Vector(x, y, z) - 3D center position in world coordinates (meters) + - "rotation": Vector(roll, pitch, yaw) - 3D orientation as Euler angles (radians) + - "size": {"width": float, "height": float, "depth": float} - 3D bounding box dimensions (meters) + + **Point Cloud Data**: + - "point_cloud": o3d.geometry.PointCloud - Filtered Open3D point cloud with colors + - "color": np.ndarray - Consistent RGB color [R,G,B] (0-255) generated from object_id + + **Grasp Generation Arrays** (Dimensional grasp format): + - "point_cloud_numpy": np.ndarray - Nx3 XYZ coordinates as float32 (meters) + - "colors_numpy": np.ndarray - Nx3 RGB colors as float32 (0.0-1.0 range) + + Raises: + ValueError: If inputs are invalid + RuntimeError: If processing fails + """ + # Validate inputs + self._validate_inputs(color_img, depth_img, objects) + + if not objects: + return [] + + # Filter to top N objects by confidence + if len(objects) > self.max_num_objects: + # Sort objects by confidence (highest first), handle None confidences + sorted_objects = sorted( + objects, + key=lambda obj: obj.get("confidence", 0.0) + if obj.get("confidence") is not None + else 0.0, + reverse=True, + ) + objects = sorted_objects[: self.max_num_objects] + + # Filter out objects with bboxes too large + image_area = 
color_img.shape[0] * color_img.shape[1] + max_bbox_area = image_area * (self.max_bbox_size_percent / 100.0) + + filtered_objects = [] + for obj in objects: + if "bbox" in obj and obj["bbox"] is not None: + bbox = obj["bbox"] + # Calculate bbox area (assuming bbox format [x1, y1, x2, y2]) + bbox_area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if bbox_area <= max_bbox_area: + filtered_objects.append(obj) + else: + filtered_objects.append(obj) + + objects = filtered_objects + + # Extract masks from ObjectData + masks = self._extract_masks_from_objects(objects) + + # Prepare masks + processed_masks = self._prepare_masks(masks, depth_img.shape) + + # Create point clouds efficiently + self.full_pcd, masked_pcds = create_point_cloud_and_extract_masks( + color_img, depth_img, processed_masks, self.depth_camera_matrix, depth_scale=1.0 + ) + + # Process each object and update ObjectData + updated_objects = [] + + for i, (obj, mask, pcd) in enumerate(zip(objects, processed_masks, masked_pcds)): + # Skip empty point clouds + if len(np.asarray(pcd.points)) == 0: + continue + + # Create a copy of the object data to avoid modifying the original + updated_obj = obj.copy() + + # Generate consistent color + object_id = obj.get("object_id", i) + rgb_color = self.generate_color_from_id(object_id) + + # Apply color mask + pcd = self._apply_color_mask(pcd, rgb_color) + + # Apply subsampling to control point cloud size + pcd = self._apply_subsampling(pcd) + + # Apply filtering (optional based on flags) + pcd_filtered = self._apply_filtering(pcd) + + # Fit cuboid and extract 3D information + points = np.asarray(pcd_filtered.points) + if len(points) >= self.min_points_for_cuboid: + cuboid_params = fit_cuboid(points, method=self.cuboid_method) + if cuboid_params is not None: + # Update position, rotation, and size from cuboid + center = cuboid_params["center"] + dimensions = cuboid_params["dimensions"] + rotation_matrix = cuboid_params["rotation"] + + # Convert rotation matrix to euler 
angles (roll, pitch, yaw) + sy = np.sqrt( + rotation_matrix[0, 0] * rotation_matrix[0, 0] + + rotation_matrix[1, 0] * rotation_matrix[1, 0] + ) + singular = sy < 1e-6 + + if not singular: + roll = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) + pitch = np.arctan2(-rotation_matrix[2, 0], sy) + yaw = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) + else: + roll = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) + pitch = np.arctan2(-rotation_matrix[2, 0], sy) + yaw = 0 + + # Update position, rotation, and size from cuboid + updated_obj["position"] = Vector(center[0], center[1], center[2]) + updated_obj["rotation"] = Vector(roll, pitch, yaw) + updated_obj["size"] = { + "width": float(dimensions[0]), + "height": float(dimensions[1]), + "depth": float(dimensions[2]), + } + + # Add point cloud data to ObjectData + updated_obj["point_cloud"] = pcd_filtered + updated_obj["color"] = rgb_color + + # Extract numpy arrays for grasp generation + points_array = np.asarray(pcd_filtered.points).astype(np.float32) # Nx3 XYZ coordinates + if pcd_filtered.has_colors(): + colors_array = np.asarray(pcd_filtered.colors).astype( + np.float32 + ) # Nx3 RGB (0-1 range) + else: + # If no colors, create array of zeros + colors_array = np.zeros((len(points_array), 3), dtype=np.float32) + + updated_obj["point_cloud_numpy"] = points_array + updated_obj["colors_numpy"] = colors_array + + updated_objects.append(updated_obj) + + return updated_objects + + def cleanup(self): + """Clean up resources.""" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +def create_test_pipeline(data_dir: str) -> tuple: + """ + Create a test pipeline with default settings. 
def create_test_pipeline(data_dir: str) -> tuple:
    """
    Create a test pipeline with default settings.

    Args:
        data_dir: Directory containing camera info files

    Returns:
        Tuple of (filter_pipeline, color_info_path, depth_info_path)
    """
    color_info_path = os.path.join(data_dir, "color_camera_info.yaml")
    depth_info_path = os.path.join(data_dir, "depth_camera_info.yaml")

    # Default pipeline with subsampling disabled by default
    filter_pipeline = PointcloudFiltering(
        color_intrinsics=color_info_path,
        depth_intrinsics=depth_info_path,
    )

    return filter_pipeline, color_info_path, depth_info_path


def load_test_images(data_dir: str) -> tuple:
    """
    Load the first available test images from data directory.

    Args:
        data_dir: Directory containing color and depth subdirectories

    Returns:
        Tuple of (color_img, depth_img) — color as RGB, depth in meters

    Raises:
        FileNotFoundError: If no color/depth images exist or fail to load
    """

    def find_first_image(directory):
        """Find the first image file in the given directory (sorted order)."""
        if not os.path.exists(directory):
            return None

        image_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
        for filename in sorted(os.listdir(directory)):
            if any(filename.lower().endswith(ext) for ext in image_extensions):
                return os.path.join(directory, filename)
        return None

    color_dir = os.path.join(data_dir, "color")
    depth_dir = os.path.join(data_dir, "depth")

    color_img_path = find_first_image(color_dir)
    depth_img_path = find_first_image(depth_dir)

    if not color_img_path or not depth_img_path:
        raise FileNotFoundError(f"Could not find color or depth images in {data_dir}")

    # Load color image (OpenCV reads BGR; convert to RGB for the pipeline).
    color_img = cv2.imread(color_img_path)
    if color_img is None:
        raise FileNotFoundError(f"Could not load color image from {color_img_path}")
    color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB)

    # Load depth image unchanged so 16-bit data is preserved.
    depth_img = cv2.imread(depth_img_path, cv2.IMREAD_UNCHANGED)
    if depth_img is None:
        raise FileNotFoundError(f"Could not load depth image from {depth_img_path}")

    # uint16 depth is assumed to be in millimeters; convert to meters.
    if depth_img.dtype == np.uint16:
        depth_img = depth_img.astype(np.float32) / 1000.0

    return color_img, depth_img


def run_segmentation(color_img: np.ndarray, device: str = "auto") -> List[ObjectData]:
    """
    Run segmentation on color image and return ObjectData objects.

    Args:
        color_img: RGB color image
        device: Device to use ('auto', 'cuda', or 'cpu')

    Returns:
        List of ObjectData objects with segmentation results; 3D fields
        (depth/position/rotation/size) are placeholders to be populated by
        the pointcloud filtering stage.
    """
    if device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Import here to avoid circular imports
    from dimos.perception.segmentation import Sam2DSegmenter

    segmenter = Sam2DSegmenter(
        model_path="FastSAM-s.pt", device=device, use_tracker=False, use_analyzer=False
    )

    try:
        masks, bboxes, target_ids, probs, names = segmenter.process_image(np.array(color_img))

        # Build one ObjectData per detection, tolerating length mismatches
        # between the parallel result lists.
        objects = []
        for i in range(len(bboxes)):
            obj_data: ObjectData = {
                "object_id": target_ids[i] if i < len(target_ids) else i,
                "bbox": bboxes[i],
                "depth": -1.0,  # Will be populated by pointcloud filtering
                "confidence": probs[i] if i < len(probs) else 1.0,
                "class_id": i,
                "label": names[i] if i < len(names) else "",
                "segmentation_mask": masks[i].cpu().numpy()
                if hasattr(masks[i], "cpu")
                else masks[i],
                "position": Vector(0, 0, 0),  # Will be populated by pointcloud filtering
                "rotation": Vector(0, 0, 0),  # Will be populated by pointcloud filtering
                "size": {
                    "width": 0.0,
                    "height": 0.0,
                    "depth": 0.0,
                },  # Will be populated by pointcloud filtering
            }
            objects.append(obj_data)

        return objects

    finally:
        # Always release model resources, even if segmentation raised.
        segmenter.cleanup()


def visualize_results(objects: List[ObjectData]):
    """
    Visualize point cloud filtering results with 3D bounding boxes.

    Args:
        objects: List of ObjectData with point clouds
    """
    all_pcds = []

    for obj in objects:
        if "point_cloud" in obj and obj["point_cloud"] is not None:
            pcd = obj["point_cloud"]
            all_pcds.append(pcd)

            # Draw a 3D bounding box if position, rotation, and size are available.
            if (
                "position" in obj
                and "rotation" in obj
                and "size" in obj
                and obj["position"] is not None
                and obj["rotation"] is not None
                and obj["size"] is not None
            ):
                try:
                    position = obj["position"]
                    rotation = obj["rotation"]
                    size = obj["size"]

                    # Position may be a Vector or a plain dict.
                    if hasattr(position, "x"):  # Vector object
                        center = np.array([position.x, position.y, position.z])
                    else:  # Dictionary
                        center = np.array([position["x"], position["y"], position["z"]])

                    # Rotation may be a Vector (roll, pitch, yaw) or a dict.
                    if hasattr(rotation, "x"):  # Vector object (roll, pitch, yaw)
                        roll, pitch, yaw = rotation.x, rotation.y, rotation.z
                    else:  # Dictionary
                        roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"]

                    # Rotation matrix from ZYX Euler angles:
                    # R = Rz(yaw) @ Ry(pitch) @ Rx(roll)
                    cos_r, sin_r = np.cos(roll), np.sin(roll)
                    cos_p, sin_p = np.cos(pitch), np.sin(pitch)
                    cos_y, sin_y = np.cos(yaw), np.sin(yaw)

                    R = np.array(
                        [
                            [
                                cos_y * cos_p,
                                cos_y * sin_p * sin_r - sin_y * cos_r,
                                cos_y * sin_p * cos_r + sin_y * sin_r,
                            ],
                            [
                                sin_y * cos_p,
                                sin_y * sin_p * sin_r + cos_y * cos_r,
                                sin_y * sin_p * cos_r - cos_y * sin_r,
                            ],
                            [-sin_p, cos_p * sin_r, cos_p * cos_r],
                        ]
                    )

                    # Box extents (fall back to 10 cm per side if missing).
                    width = size.get("width", 0.1)
                    height = size.get("height", 0.1)
                    depth = size.get("depth", 0.1)
                    extent = np.array([width, height, depth])

                    obb = o3d.geometry.OrientedBoundingBox(center=center, R=R, extent=extent)
                    obb.color = [1, 0, 0]  # Red bounding boxes
                    all_pcds.append(obb)

                except Exception as e:
                    print(
                        f"Warning: Failed to create bounding box for object {obj.get('object_id', 'unknown')}: {e}"
                    )

    # Add coordinate frame
    coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1)
    all_pcds.append(coordinate_frame)

    # Visualize
    if all_pcds:
        o3d.visualization.draw_geometries(
            all_pcds,
            window_name="Filtered Point Clouds with 3D Bounding Boxes",
            width=1280,
            height=720,
        )


def main():
    """Main function to demonstrate the PointcloudFiltering pipeline."""
    parser = argparse.ArgumentParser(description="Point cloud filtering pipeline demonstration")
    parser.add_argument(
        "--save-pickle",
        type=str,
        help="Save generated ObjectData to pickle file (provide filename)",
    )
    parser.add_argument(
        "--data-dir", type=str, help="Directory containing RGBD data (default: auto-detect)"
    )
    args = parser.parse_args()

    try:
        # Setup paths
        if args.data_dir:
            data_dir = args.data_dir
        else:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            dimos_dir = os.path.abspath(os.path.join(script_dir, "../../../"))
            data_dir = os.path.join(dimos_dir, "assets/rgbd_data")

        # Load test data
        print("Loading test images...")
        color_img, depth_img = load_test_images(data_dir)
        print(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}")

        # Run segmentation
        print("Running segmentation...")
        objects = run_segmentation(color_img)
        print(f"Found {len(objects)} objects")

        # Create filtering pipeline
        print("Creating filtering pipeline...")
        filter_pipeline, _, _ = create_test_pipeline(data_dir)

        # Process images
        print("Processing point clouds...")
        updated_objects = filter_pipeline.process_images(color_img, depth_img, objects)

        # Print results (was an f-string with no placeholders)
        print("Processing complete:")
        print(f"  Objects processed: {len(updated_objects)}/{len(objects)}")

        # Print per-object stats
        for i, obj in enumerate(updated_objects):
            if "point_cloud" in obj and obj["point_cloud"] is not None:
                num_points = len(np.asarray(obj["point_cloud"].points))
                position = obj.get("position", Vector(0, 0, 0))
                size = obj.get("size", {})
                print(f"  Object {i + 1} (ID: {obj['object_id']}): {num_points} points")
                print(f"    Position: ({position.x:.2f}, {position.y:.2f}, {position.z:.2f})")
                print(
                    f"    Size: {size.get('width', 0):.3f} x {size.get('height', 0):.3f} x {size.get('depth', 0):.3f}"
                )

        # Save to pickle file if requested
        if args.save_pickle:
            pickle_path = args.save_pickle
            if not pickle_path.endswith(".pkl"):
                pickle_path += ".pkl"

            print(f"Saving ObjectData to {pickle_path}...")

            # Create serializable objects (exclude Open3D point clouds)
            serializable_objects = []
            for obj in updated_objects:
                obj_copy = obj.copy()
                # Remove the Open3D point cloud object (can't be pickled)
                if "point_cloud" in obj_copy:
                    del obj_copy["point_cloud"]
                serializable_objects.append(obj_copy)

            with open(pickle_path, "wb") as f:
                pickle.dump(serializable_objects, f)

            print(f"Successfully saved {len(serializable_objects)} objects to {pickle_path}")
            print("To load: objects = pickle.load(open('filename.pkl', 'rb'))")
            print(
                "Note: Open3D point clouds excluded - use point_cloud_numpy and colors_numpy for processing"
            )

        # Visualize results
        print("Visualizing results...")
        visualize_results(updated_objects)

    except Exception as e:
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# File: dimos/perception/pointcloud/utils.py
# ---------------------------------------------------------------------------
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Point cloud utilities for RGBD data processing. + +This module provides efficient utilities for creating and manipulating point clouds +from RGBD images using Open3D. +""" + +import numpy as np +import yaml +import os +import cv2 +import open3d as o3d +from typing import List, Optional, Tuple, Union, Dict +from scipy.spatial import cKDTree + + +def depth_to_point_cloud(depth_image, camera_intrinsics, subsample_factor=4): + """ + Convert depth image to point cloud using camera intrinsics. + Subsamples points to reduce density. + + Args: + depth_image: HxW depth image in meters + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + subsample_factor: Factor to subsample points (higher = fewer points) + + Returns: + Nx3 array of 3D points + """ + # Filter out inf and nan values from depth image + depth_filtered = depth_image.copy() + + # Create mask for valid depth values (finite, positive, non-zero) + valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) + + # Set invalid values to 0 + depth_filtered[~valid_mask] = 0.0 + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + fx = camera_intrinsics[0, 0] + fy = camera_intrinsics[1, 1] + cx = camera_intrinsics[0, 2] + cy = camera_intrinsics[1, 2] + + # Create pixel coordinate grid + rows, cols = depth_filtered.shape + x_grid, y_grid = np.meshgrid( + np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor) + ) + + # Flatten grid and depth + x = 
x_grid.flatten() + y = y_grid.flatten() + z = depth_filtered[y_grid, x_grid].flatten() + + # Remove points with invalid depth (after filtering, this catches zeros) + valid = z > 0 + x = x[valid] + y = y[valid] + z = z[valid] + + # Convert to 3D points + X = (x - cx) * z / fx + Y = (y - cy) * z / fy + Z = z + + return np.column_stack([X, Y, Z]) + + +def load_camera_matrix_from_yaml( + camera_info: Optional[Union[str, List[float], np.ndarray, dict]], +) -> Optional[np.ndarray]: + """ + Load camera intrinsic matrix from various input formats. + + Args: + camera_info: Can be: + - Path to YAML file containing camera parameters + - List of [fx, fy, cx, cy] + - 3x3 numpy array (returned as-is) + - Dict with camera parameters + - None (returns None) + + Returns: + 3x3 camera intrinsic matrix or None if input is None + + Raises: + ValueError: If camera_info format is invalid or file cannot be read + FileNotFoundError: If YAML file path doesn't exist + """ + if camera_info is None: + return None + + # Handle case where camera_info is already a matrix + if isinstance(camera_info, np.ndarray) and camera_info.shape == (3, 3): + return camera_info.astype(np.float32) + + # Handle case where camera_info is [fx, fy, cx, cy] format + if isinstance(camera_info, list) and len(camera_info) == 4: + fx, fy, cx, cy = camera_info + return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) + + # Handle case where camera_info is a dict + if isinstance(camera_info, dict): + return _extract_matrix_from_dict(camera_info) + + # Handle case where camera_info is a path to a YAML file + if isinstance(camera_info, str): + if not os.path.isfile(camera_info): + raise FileNotFoundError(f"Camera info file not found: {camera_info}") + + try: + with open(camera_info, "r") as f: + data = yaml.safe_load(f) + return _extract_matrix_from_dict(data) + except Exception as e: + raise ValueError(f"Failed to read camera info from {camera_info}: {e}") + + raise ValueError( + f"Invalid camera_info 
format. Expected str, list, dict, or numpy array, got {type(camera_info)}" + ) + + +def _extract_matrix_from_dict(data: dict) -> np.ndarray: + """Extract camera matrix from dictionary with various formats.""" + # ROS format with 'K' field (most common) + if "K" in data: + k_data = data["K"] + if len(k_data) == 9: + return np.array(k_data, dtype=np.float32).reshape(3, 3) + + # Standard format with 'camera_matrix' + if "camera_matrix" in data: + if "data" in data["camera_matrix"]: + matrix_data = data["camera_matrix"]["data"] + if len(matrix_data) == 9: + return np.array(matrix_data, dtype=np.float32).reshape(3, 3) + + # Explicit intrinsics format + if all(k in data for k in ["fx", "fy", "cx", "cy"]): + fx, fy = float(data["fx"]), float(data["fy"]) + cx, cy = float(data["cx"]), float(data["cy"]) + return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) + + # Error case - provide helpful debug info + available_keys = list(data.keys()) + if "K" in data: + k_info = f"K field length: {len(data['K']) if hasattr(data['K'], '__len__') else 'unknown'}" + else: + k_info = "K field not found" + + raise ValueError( + f"Cannot extract camera matrix from data. " + f"Available keys: {available_keys}. {k_info}. " + f"Expected formats: 'K' (9 elements), 'camera_matrix.data' (9 elements), " + f"or individual 'fx', 'fy', 'cx', 'cy' fields." + ) + + +def create_o3d_point_cloud_from_rgbd( + color_img: np.ndarray, + depth_img: np.ndarray, + intrinsic: np.ndarray, + depth_scale: float = 1.0, + depth_trunc: float = 3.0, +) -> o3d.geometry.PointCloud: + """ + Create an Open3D point cloud from RGB and depth images. 
+ + Args: + color_img: RGB image as numpy array (H, W, 3) + depth_img: Depth image as numpy array (H, W) + intrinsic: Camera intrinsic matrix (3x3 numpy array) + depth_scale: Scale factor to convert depth to meters + depth_trunc: Maximum depth in meters + + Returns: + Open3D point cloud object + + Raises: + ValueError: If input dimensions are invalid + """ + # Validate inputs + if len(color_img.shape) != 3 or color_img.shape[2] != 3: + raise ValueError(f"color_img must be (H, W, 3), got {color_img.shape}") + if len(depth_img.shape) != 2: + raise ValueError(f"depth_img must be (H, W), got {depth_img.shape}") + if color_img.shape[:2] != depth_img.shape: + raise ValueError( + f"Color and depth image dimensions don't match: {color_img.shape[:2]} vs {depth_img.shape}" + ) + if intrinsic.shape != (3, 3): + raise ValueError(f"intrinsic must be (3, 3), got {intrinsic.shape}") + + # Convert to Open3D format + color_o3d = o3d.geometry.Image(color_img.astype(np.uint8)) + + # Filter out inf and nan values from depth image + depth_filtered = depth_img.copy() + + # Create mask for valid depth values (finite, positive, non-zero) + valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) + + # Set invalid values to 0 (which Open3D treats as no depth) + depth_filtered[~valid_mask] = 0.0 + + depth_o3d = o3d.geometry.Image(depth_filtered.astype(np.float32)) + + # Create Open3D intrinsic object + height, width = color_img.shape[:2] + fx, fy = intrinsic[0, 0], intrinsic[1, 1] + cx, cy = intrinsic[0, 2], intrinsic[1, 2] + intrinsic_o3d = o3d.camera.PinholeCameraIntrinsic( + width, + height, + fx, + fy, # fx, fy + cx, + cy, # cx, cy + ) + + # Create RGBD image + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, + depth_o3d, + depth_scale=depth_scale, + depth_trunc=depth_trunc, + convert_rgb_to_intensity=False, + ) + + # Create point cloud + pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic_o3d) + + return pcd + + +def 
o3d_point_cloud_to_numpy(pcd: o3d.geometry.PointCloud) -> np.ndarray: + """ + Convert Open3D point cloud to numpy array of XYZRGB points. + + Args: + pcd: Open3D point cloud object + + Returns: + Nx6 array of XYZRGB points (empty array if no points) + """ + points = np.asarray(pcd.points) + if len(points) == 0: + return np.zeros((0, 6), dtype=np.float32) + + # Get colors if available + if pcd.has_colors(): + colors = np.asarray(pcd.colors) * 255.0 # Convert from [0,1] to [0,255] + return np.column_stack([points, colors]).astype(np.float32) + else: + # No colors available, return points with zero colors + zeros = np.zeros((len(points), 3), dtype=np.float32) + return np.column_stack([points, zeros]).astype(np.float32) + + +def numpy_to_o3d_point_cloud(points_rgb: np.ndarray) -> o3d.geometry.PointCloud: + """ + Convert numpy array of XYZRGB points to Open3D point cloud. + + Args: + points_rgb: Nx6 array of XYZRGB points or Nx3 array of XYZ points + + Returns: + Open3D point cloud object + + Raises: + ValueError: If array shape is invalid + """ + if len(points_rgb) == 0: + return o3d.geometry.PointCloud() + + if points_rgb.shape[1] < 3: + raise ValueError( + f"points_rgb must have at least 3 columns (XYZ), got {points_rgb.shape[1]}" + ) + + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points_rgb[:, :3]) + + # Add colors if available + if points_rgb.shape[1] >= 6: + colors = points_rgb[:, 3:6] / 255.0 # Convert from [0,255] to [0,1] + colors = np.clip(colors, 0.0, 1.0) # Ensure valid range + pcd.colors = o3d.utility.Vector3dVector(colors) + + return pcd + + +def create_masked_point_cloud(color_img, depth_img, mask, intrinsic, depth_scale=1.0): + """ + Create a point cloud for a masked region of RGBD data using Open3D. 
+ + Args: + color_img: RGB image (H, W, 3) + depth_img: Depth image (H, W) + mask: Boolean mask of the same size as color_img and depth_img + intrinsic: Camera intrinsic matrix (3x3 numpy array) + depth_scale: Scale factor to convert depth to meters + + Returns: + Open3D point cloud object for the masked region + """ + # Filter out inf and nan values from depth image + depth_filtered = depth_img.copy() + + # Create mask for valid depth values (finite, positive, non-zero) + valid_depth_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) + + # Set invalid values to 0 + depth_filtered[~valid_depth_mask] = 0.0 + + # Create masked color and depth images + masked_color = color_img.copy() + masked_depth = depth_filtered.copy() + + # Apply mask + if not mask.shape[:2] == color_img.shape[:2]: + raise ValueError(f"Mask shape {mask.shape} doesn't match image shape {color_img.shape[:2]}") + + # Create a boolean mask that is properly expanded for the RGB channels + # For RGB image, we need to properly broadcast the mask to all 3 channels + if len(color_img.shape) == 3 and color_img.shape[2] == 3: + # Properly broadcast mask to match the RGB dimensions + mask_rgb = np.broadcast_to(mask[:, :, np.newaxis], color_img.shape) + masked_color[~mask_rgb] = 0 + else: + # For grayscale images + masked_color[~mask] = 0 + + # Apply mask to depth image + masked_depth[~mask] = 0 + + # Create point cloud + pcd = create_o3d_point_cloud_from_rgbd(masked_color, masked_depth, intrinsic, depth_scale) + + # Remove points with coordinates at origin (0,0,0) which are likely from masked out regions + points = np.asarray(pcd.points) + if len(points) > 0: + # Find points that are not at origin + dist_from_origin = np.sum(points**2, axis=1) + valid_indices = dist_from_origin > 1e-6 + + # Filter points and colors + pcd = pcd.select_by_index(np.where(valid_indices)[0]) + + return pcd + + +def create_point_cloud_and_extract_masks( + color_img: np.ndarray, + depth_img: np.ndarray, + masks: 
List[np.ndarray], + intrinsic: np.ndarray, + depth_scale: float = 1.0, + depth_trunc: float = 3.0, +) -> Tuple[o3d.geometry.PointCloud, List[o3d.geometry.PointCloud]]: + """ + Efficiently create a point cloud once and extract multiple masked regions. + + Args: + color_img: RGB image (H, W, 3) + depth_img: Depth image (H, W) + masks: List of boolean masks, each of shape (H, W) + intrinsic: Camera intrinsic matrix (3x3 numpy array) + depth_scale: Scale factor to convert depth to meters + depth_trunc: Maximum depth in meters + + Returns: + Tuple of (full_point_cloud, list_of_masked_point_clouds) + """ + if not masks: + return o3d.geometry.PointCloud(), [] + + # Create the full point cloud + full_pcd = create_o3d_point_cloud_from_rgbd( + color_img, depth_img, intrinsic, depth_scale, depth_trunc + ) + + if len(np.asarray(full_pcd.points)) == 0: + return full_pcd, [o3d.geometry.PointCloud() for _ in masks] + + # Create pixel-to-point mapping + valid_depth_mask = np.isfinite(depth_img) & (depth_img > 0) & (depth_img <= depth_trunc) + + valid_depth = valid_depth_mask.flatten() + if not np.any(valid_depth): + return full_pcd, [o3d.geometry.PointCloud() for _ in masks] + + pixel_to_point = np.full(len(valid_depth), -1, dtype=np.int32) + pixel_to_point[valid_depth] = np.arange(np.sum(valid_depth)) + + # Extract point clouds for each mask + masked_pcds = [] + max_points = len(np.asarray(full_pcd.points)) + + for mask in masks: + if mask.shape != depth_img.shape: + masked_pcds.append(o3d.geometry.PointCloud()) + continue + + mask_flat = mask.flatten() + valid_mask_indices = mask_flat & valid_depth + point_indices = pixel_to_point[valid_mask_indices] + valid_point_indices = point_indices[point_indices >= 0] + + if len(valid_point_indices) > 0: + valid_point_indices = np.clip(valid_point_indices, 0, max_points - 1) + valid_point_indices = np.unique(valid_point_indices) + masked_pcd = full_pcd.select_by_index(valid_point_indices.tolist()) + else: + masked_pcd = 
o3d.geometry.PointCloud() + + masked_pcds.append(masked_pcd) + + return full_pcd, masked_pcds + + +def extract_masked_point_cloud_efficient( + full_pcd: o3d.geometry.PointCloud, depth_img: np.ndarray, mask: np.ndarray +) -> o3d.geometry.PointCloud: + """ + Extract a masked region from an existing point cloud efficiently. + + This assumes the point cloud was created from the given depth image. + Use this when you have a pre-computed full point cloud and want to extract + individual masked regions. + + Args: + full_pcd: Complete Open3D point cloud + depth_img: Depth image used to create the point cloud (H, W) + mask: Boolean mask (H, W) + + Returns: + Open3D point cloud for the masked region + + Raises: + ValueError: If mask shape doesn't match depth image + """ + if mask.shape != depth_img.shape: + raise ValueError( + f"Mask shape {mask.shape} doesn't match depth image shape {depth_img.shape}" + ) + + # Early return if no points in full point cloud + if len(np.asarray(full_pcd.points)) == 0: + return o3d.geometry.PointCloud() + + # Get valid depth mask + valid_depth = depth_img.flatten() > 0 + mask_flat = mask.flatten() + + # Find pixels that are both valid and in the mask + valid_mask_indices = mask_flat & valid_depth + + # Get indices of valid points + point_indices = np.where(valid_mask_indices[valid_depth])[0] + + # Extract the masked point cloud + if len(point_indices) > 0: + return full_pcd.select_by_index(point_indices) + else: + return o3d.geometry.PointCloud() + + +def segment_and_remove_plane(pcd, distance_threshold=0.02, ransac_n=3, num_iterations=1000): + """ + Segment the dominant plane from a point cloud using RANSAC and remove it. + Often used to remove table tops, floors, walls, or other planar surfaces. 
+ + Args: + pcd: Open3D point cloud object + distance_threshold: Maximum distance a point can be from the plane to be considered an inlier (in meters) + ransac_n: Number of points to sample for each RANSAC iteration + num_iterations: Number of RANSAC iterations + + Returns: + Open3D point cloud with the dominant plane removed + """ + # Make a copy of the input point cloud to avoid modifying the original + pcd_filtered = o3d.geometry.PointCloud() + pcd_filtered.points = o3d.utility.Vector3dVector(np.asarray(pcd.points)) + if pcd.has_colors(): + pcd_filtered.colors = o3d.utility.Vector3dVector(np.asarray(pcd.colors)) + if pcd.has_normals(): + pcd_filtered.normals = o3d.utility.Vector3dVector(np.asarray(pcd.normals)) + + # Check if point cloud has enough points + if len(pcd_filtered.points) < ransac_n: + return pcd_filtered + + # Run RANSAC to find the largest plane + _, inliers = pcd_filtered.segment_plane( + distance_threshold=distance_threshold, ransac_n=ransac_n, num_iterations=num_iterations + ) + + # Remove the dominant plane (regardless of orientation) + pcd_without_dominant_plane = pcd_filtered.select_by_index(inliers, invert=True) + return pcd_without_dominant_plane + + +def filter_point_cloud_statistical( + pcd: o3d.geometry.PointCloud, nb_neighbors: int = 20, std_ratio: float = 2.0 +) -> Tuple[o3d.geometry.PointCloud, np.ndarray]: + """ + Apply statistical outlier filtering to point cloud. 
+ + Args: + pcd: Input point cloud + nb_neighbors: Number of neighbors to analyze for each point + std_ratio: Threshold level based on standard deviation + + Returns: + Tuple of (filtered_point_cloud, outlier_indices) + """ + if len(np.asarray(pcd.points)) == 0: + return pcd, np.array([]) + + return pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio) + + +def filter_point_cloud_radius( + pcd: o3d.geometry.PointCloud, nb_points: int = 16, radius: float = 0.05 +) -> Tuple[o3d.geometry.PointCloud, np.ndarray]: + """ + Apply radius-based outlier filtering to point cloud. + + Args: + pcd: Input point cloud + nb_points: Minimum number of points within radius + radius: Search radius in meters + + Returns: + Tuple of (filtered_point_cloud, outlier_indices) + """ + if len(np.asarray(pcd.points)) == 0: + return pcd, np.array([]) + + return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius) + + +def compute_point_cloud_bounds(pcd: o3d.geometry.PointCloud) -> dict: + """ + Compute bounding box information for a point cloud. + + Args: + pcd: Input point cloud + + Returns: + Dictionary with bounds information + """ + points = np.asarray(pcd.points) + if len(points) == 0: + return { + "min": np.array([0, 0, 0]), + "max": np.array([0, 0, 0]), + "center": np.array([0, 0, 0]), + "size": np.array([0, 0, 0]), + "volume": 0.0, + } + + min_bound = points.min(axis=0) + max_bound = points.max(axis=0) + center = (min_bound + max_bound) / 2 + size = max_bound - min_bound + volume = np.prod(size) + + return {"min": min_bound, "max": max_bound, "center": center, "size": size, "volume": volume} + + +def project_3d_points_to_2d( + points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] +) -> np.ndarray: + """ + Project 3D points to 2D image coordinates using camera intrinsics. 
+ + Args: + points_3d: Nx3 array of 3D points (X, Y, Z) + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + + Returns: + Nx2 array of 2D image coordinates (u, v) + """ + if len(points_3d) == 0: + return np.zeros((0, 2), dtype=np.int32) + + # Filter out points with zero or negative depth + valid_mask = points_3d[:, 2] > 0 + if not np.any(valid_mask): + return np.zeros((0, 2), dtype=np.int32) + + valid_points = points_3d[valid_mask] + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + fx = camera_intrinsics[0, 0] + fy = camera_intrinsics[1, 1] + cx = camera_intrinsics[0, 2] + cy = camera_intrinsics[1, 2] + + # Project to image coordinates + u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx + v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy + + # Round to integer pixel coordinates + points_2d = np.column_stack([u, v]).astype(np.int32) + + return points_2d + + +def overlay_point_clouds_on_image( + base_image: np.ndarray, + point_clouds: List[o3d.geometry.PointCloud], + camera_intrinsics: Union[List[float], np.ndarray], + colors: List[Tuple[int, int, int]], + point_size: int = 2, + alpha: float = 0.7, +) -> np.ndarray: + """ + Overlay multiple colored point clouds onto an image. + + Args: + base_image: Base image to overlay onto (H, W, 3) - assumed to be RGB + point_clouds: List of Open3D point cloud objects + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + colors: List of RGB color tuples for each point cloud. If None, generates distinct colors. 
+ point_size: Size of points to draw (in pixels) + alpha: Blending factor for overlay (0.0 = fully transparent, 1.0 = fully opaque) + + Returns: + Image with overlaid point clouds (H, W, 3) + """ + if len(point_clouds) == 0: + return base_image.copy() + + # Create overlay image + overlay = base_image.copy() + height, width = base_image.shape[:2] + + # Process each point cloud + for i, pcd in enumerate(point_clouds): + if pcd is None: + continue + + points_3d = np.asarray(pcd.points) + if len(points_3d) == 0: + continue + + # Project 3D points to 2D + points_2d = project_3d_points_to_2d(points_3d, camera_intrinsics) + + if len(points_2d) == 0: + continue + + # Filter points within image bounds + valid_mask = ( + (points_2d[:, 0] >= 0) + & (points_2d[:, 0] < width) + & (points_2d[:, 1] >= 0) + & (points_2d[:, 1] < height) + ) + valid_points_2d = points_2d[valid_mask] + + if len(valid_points_2d) == 0: + continue + + # Get color for this point cloud + color = colors[i % len(colors)] + + # Ensure color is a tuple of integers for OpenCV + if isinstance(color, (list, tuple, np.ndarray)): + color = tuple(int(c) for c in color[:3]) + else: + color = (255, 255, 255) + + # Draw points on overlay + for point in valid_points_2d: + u, v = point + # Draw a small filled circle for each point + cv2.circle(overlay, (u, v), point_size, color, -1) + + # Blend overlay with base image + result = cv2.addWeighted(base_image, 1 - alpha, overlay, alpha, 0) + + return result + + +def create_point_cloud_overlay_visualization( + base_image: np.ndarray, + objects: List[dict], + intrinsics: np.ndarray, +) -> np.ndarray: + """ + Create a visualization showing object point clouds and bounding boxes overlaid on a base image. 
+ + Args: + base_image: Base image to overlay onto (H, W, 3) + objects: List of object dictionaries containing 'point_cloud', 'color', 'position', 'rotation', 'size' keys + intrinsics: Camera intrinsics as [fx, fy, cx, cy] or 3x3 matrix + + Returns: + Visualization image with overlaid point clouds and bounding boxes (H, W, 3) + """ + # Extract point clouds and colors from objects + point_clouds = [] + colors = [] + for obj in objects: + if "point_cloud" in obj and obj["point_cloud"] is not None: + point_clouds.append(obj["point_cloud"]) + + # Convert color to tuple + color = obj["color"] + if isinstance(color, np.ndarray): + color = tuple(int(c) for c in color) + elif isinstance(color, (list, tuple)): + color = tuple(int(c) for c in color[:3]) + colors.append(color) + + # Create visualization + if point_clouds: + result = overlay_point_clouds_on_image( + base_image=base_image, + point_clouds=point_clouds, + camera_intrinsics=intrinsics, + colors=colors, + point_size=3, + alpha=0.8, + ) + else: + result = base_image.copy() + + # Draw 3D bounding boxes + height_img, width_img = result.shape[:2] + for i, obj in enumerate(objects): + if all(key in obj and obj[key] is not None for key in ["position", "rotation", "size"]): + try: + # Create and project 3D bounding box + corners_3d = create_3d_bounding_box_corners( + obj["position"], obj["rotation"], obj["size"] + ) + corners_2d = project_3d_points_to_2d(corners_3d, intrinsics) + + # Check if any corners are visible + valid_mask = ( + (corners_2d[:, 0] >= 0) + & (corners_2d[:, 0] < width_img) + & (corners_2d[:, 1] >= 0) + & (corners_2d[:, 1] < height_img) + ) + + if np.any(valid_mask): + # Get color + bbox_color = colors[i] if i < len(colors) else (255, 255, 255) + draw_3d_bounding_box_on_image(result, corners_2d, bbox_color, thickness=2) + except: + continue + + return result + + +def create_3d_bounding_box_corners(position, rotation, size): + """ + Create 8 corners of a 3D bounding box from position, rotation, and size. 
+ + Args: + position: Vector or dict with x, y, z coordinates + rotation: Vector or dict with roll, pitch, yaw angles + size: Dict with width, height, depth + + Returns: + 8x3 numpy array of corner coordinates + """ + # Convert position to numpy array + if hasattr(position, "x"): # Vector object + center = np.array([position.x, position.y, position.z]) + else: # Dictionary + center = np.array([position["x"], position["y"], position["z"]]) + + # Convert rotation (euler angles) to rotation matrix + if hasattr(rotation, "x"): # Vector object (roll, pitch, yaw) + roll, pitch, yaw = rotation.x, rotation.y, rotation.z + else: # Dictionary + roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"] + + # Create rotation matrix from euler angles (ZYX order) + cos_r, sin_r = np.cos(roll), np.sin(roll) + cos_p, sin_p = np.cos(pitch), np.sin(pitch) + cos_y, sin_y = np.cos(yaw), np.sin(yaw) + + # Rotation matrix for ZYX euler angles + R = np.array( + [ + [ + cos_y * cos_p, + cos_y * sin_p * sin_r - sin_y * cos_r, + cos_y * sin_p * cos_r + sin_y * sin_r, + ], + [ + sin_y * cos_p, + sin_y * sin_p * sin_r + cos_y * cos_r, + sin_y * sin_p * cos_r - cos_y * sin_r, + ], + [-sin_p, cos_p * sin_r, cos_p * cos_r], + ] + ) + + # Get dimensions + width = size.get("width", 0.1) + height = size.get("height", 0.1) + depth = size.get("depth", 0.1) + + # Create 8 corners of the bounding box (before rotation) + corners = np.array( + [ + [-width / 2, -height / 2, -depth / 2], # 0 + [width / 2, -height / 2, -depth / 2], # 1 + [width / 2, height / 2, -depth / 2], # 2 + [-width / 2, height / 2, -depth / 2], # 3 + [-width / 2, -height / 2, depth / 2], # 4 + [width / 2, -height / 2, depth / 2], # 5 + [width / 2, height / 2, depth / 2], # 6 + [-width / 2, height / 2, depth / 2], # 7 + ] + ) + + # Apply rotation and translation + rotated_corners = corners @ R.T + center + + return rotated_corners + + +def draw_3d_bounding_box_on_image(image, corners_2d, color, thickness=2): + """ + Draw 
a 3D bounding box on an image using projected 2D corners. + + Args: + image: Image to draw on + corners_2d: 8x2 array of 2D corner coordinates + color: RGB color tuple + thickness: Line thickness + """ + # Define the 12 edges of a cube (connecting corner indices) + edges = [ + (0, 1), + (1, 2), + (2, 3), + (3, 0), # Bottom face + (4, 5), + (5, 6), + (6, 7), + (7, 4), # Top face + (0, 4), + (1, 5), + (2, 6), + (3, 7), # Vertical edges + ] + + # Draw each edge + for start_idx, end_idx in edges: + start_point = tuple(corners_2d[start_idx].astype(int)) + end_point = tuple(corners_2d[end_idx].astype(int)) + cv2.line(image, start_point, end_point, color, thickness) + + +def extract_and_cluster_misc_points( + full_pcd: o3d.geometry.PointCloud, + all_objects: List[dict], + eps: float = 0.03, + min_points: int = 100, + enable_filtering: bool = True, + voxel_size: float = 0.02, +) -> Tuple[List[o3d.geometry.PointCloud], o3d.geometry.VoxelGrid]: + """ + Extract miscellaneous/background points and cluster them using DBSCAN. 
+ + Args: + full_pcd: Complete scene point cloud + all_objects: List of objects with point clouds to subtract + eps: DBSCAN epsilon parameter (max distance between points in cluster) + min_points: DBSCAN min_samples parameter (min points to form cluster) + enable_filtering: Whether to apply statistical and radius filtering + voxel_size: Size of voxels for voxel grid generation + + Returns: + Tuple of (clustered_point_clouds, voxel_grid) + """ + if full_pcd is None or len(np.asarray(full_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + if not all_objects: + # If no objects detected, cluster the full point cloud + clusters = _cluster_point_cloud_dbscan(full_pcd, eps, min_points) + voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) + return clusters, voxel_grid + + try: + # Start with a copy of the full point cloud + misc_pcd = o3d.geometry.PointCloud(full_pcd) + + # Remove object points by combining all object point clouds + all_object_points = [] + for obj in all_objects: + if "point_cloud" in obj and obj["point_cloud"] is not None: + obj_points = np.asarray(obj["point_cloud"].points) + if len(obj_points) > 0: + all_object_points.append(obj_points) + + if not all_object_points: + # No object points to remove, cluster full point cloud + clusters = _cluster_point_cloud_dbscan(misc_pcd, eps, min_points) + voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) + return clusters, voxel_grid + + # Combine all object points + combined_obj_points = np.vstack(all_object_points) + + # For efficiency, downsample both point clouds + misc_downsampled = misc_pcd.voxel_down_sample(voxel_size=0.005) + + # Create object point cloud for efficient operations + obj_pcd = o3d.geometry.PointCloud() + obj_pcd.points = o3d.utility.Vector3dVector(combined_obj_points) + obj_downsampled = obj_pcd.voxel_down_sample(voxel_size=0.005) + + misc_points = np.asarray(misc_downsampled.points) + obj_points_down = np.asarray(obj_downsampled.points) + + if 
len(misc_points) == 0 or len(obj_points_down) == 0: + clusters = _cluster_point_cloud_dbscan(misc_downsampled, eps, min_points) + voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) + return clusters, voxel_grid + + # Build tree for object points + obj_tree = cKDTree(obj_points_down) + + # Find distances from misc points to nearest object points + distances, _ = obj_tree.query(misc_points, k=1) + + # Keep points that are far enough from any object point + threshold = 0.015 # 1.5cm threshold + keep_mask = distances > threshold + + if not np.any(keep_mask): + return [], o3d.geometry.VoxelGrid() + + # Filter misc points + misc_indices = np.where(keep_mask)[0] + final_misc_pcd = misc_downsampled.select_by_index(misc_indices) + + if len(np.asarray(final_misc_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + # Apply additional filtering if enabled + if enable_filtering: + # Apply statistical outlier filtering + filtered_misc_pcd, _ = filter_point_cloud_statistical( + final_misc_pcd, nb_neighbors=30, std_ratio=2.0 + ) + + if len(np.asarray(filtered_misc_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + # Apply radius outlier filtering + final_filtered_misc_pcd, _ = filter_point_cloud_radius( + filtered_misc_pcd, + nb_points=20, + radius=0.03, # 3cm radius + ) + + if len(np.asarray(final_filtered_misc_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + final_misc_pcd = final_filtered_misc_pcd + + # Cluster the misc points using DBSCAN + clusters = _cluster_point_cloud_dbscan(final_misc_pcd, eps, min_points) + + # Create voxel grid from all misc points (before clustering) + voxel_grid = _create_voxel_grid_from_point_cloud(final_misc_pcd, voxel_size) + + return clusters, voxel_grid + + except Exception as e: + print(f"Error in misc point extraction and clustering: {e}") + # Fallback: return downsampled full point cloud as single cluster + try: + downsampled = full_pcd.voxel_down_sample(voxel_size=0.02) + if 
len(np.asarray(downsampled.points)) > 0: + voxel_grid = _create_voxel_grid_from_point_cloud(downsampled, voxel_size) + return [downsampled], voxel_grid + else: + return [], o3d.geometry.VoxelGrid() + except: + return [], o3d.geometry.VoxelGrid() + + +def _create_voxel_grid_from_point_cloud( + pcd: o3d.geometry.PointCloud, voxel_size: float = 0.02 +) -> o3d.geometry.VoxelGrid: + """ + Create a voxel grid from a point cloud. + + Args: + pcd: Input point cloud + voxel_size: Size of each voxel + + Returns: + Open3D VoxelGrid object + """ + if len(np.asarray(pcd.points)) == 0: + return o3d.geometry.VoxelGrid() + + try: + # Create voxel grid from point cloud + voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size) + + # Color the voxels with a semi-transparent gray + for voxel in voxel_grid.get_voxels(): + voxel.color = [0.5, 0.5, 0.5] # Gray color + + print( + f"Created voxel grid with {len(voxel_grid.get_voxels())} voxels (voxel_size={voxel_size})" + ) + return voxel_grid + + except Exception as e: + print(f"Error creating voxel grid: {e}") + return o3d.geometry.VoxelGrid() + + +def _create_voxel_grid_from_clusters( + clusters: List[o3d.geometry.PointCloud], voxel_size: float = 0.02 +) -> o3d.geometry.VoxelGrid: + """ + Create a voxel grid from multiple clustered point clouds. 
+ + Args: + clusters: List of clustered point clouds + voxel_size: Size of each voxel + + Returns: + Open3D VoxelGrid object + """ + if not clusters: + return o3d.geometry.VoxelGrid() + + # Combine all clusters into one point cloud + combined_points = [] + for cluster in clusters: + points = np.asarray(cluster.points) + if len(points) > 0: + combined_points.append(points) + + if not combined_points: + return o3d.geometry.VoxelGrid() + + # Create combined point cloud + all_points = np.vstack(combined_points) + combined_pcd = o3d.geometry.PointCloud() + combined_pcd.points = o3d.utility.Vector3dVector(all_points) + + return _create_voxel_grid_from_point_cloud(combined_pcd, voxel_size) + + +def _cluster_point_cloud_dbscan( + pcd: o3d.geometry.PointCloud, eps: float = 0.05, min_points: int = 50 +) -> List[o3d.geometry.PointCloud]: + """ + Cluster a point cloud using DBSCAN and return list of clustered point clouds. + + Args: + pcd: Point cloud to cluster + eps: DBSCAN epsilon parameter + min_points: DBSCAN min_samples parameter + + Returns: + List of point clouds, one for each cluster + """ + if len(np.asarray(pcd.points)) == 0: + return [] + + try: + # Apply DBSCAN clustering + labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_points)) + + # Get unique cluster labels (excluding noise points labeled as -1) + unique_labels = np.unique(labels) + cluster_pcds = [] + + for label in unique_labels: + if label == -1: # Skip noise points + continue + + # Get indices for this cluster + cluster_indices = np.where(labels == label)[0] + + if len(cluster_indices) > 0: + # Create point cloud for this cluster + cluster_pcd = pcd.select_by_index(cluster_indices) + + # Assign a random color to this cluster + cluster_color = np.random.rand(3) # Random RGB color + cluster_pcd.paint_uniform_color(cluster_color) + + cluster_pcds.append(cluster_pcd) + + print( + f"DBSCAN clustering found {len(cluster_pcds)} clusters from {len(np.asarray(pcd.points))} points" + ) + return 
cluster_pcds + + except Exception as e: + print(f"Error in DBSCAN clustering: {e}") + return [pcd] # Return original point cloud as fallback + + +def get_standard_coordinate_transform(): + """ + Get a standard coordinate transformation matrix for consistent visualization. + + This transformation ensures that: + - X (red) axis points right + - Y (green) axis points up + - Z (blue) axis points toward viewer + + Returns: + 4x4 transformation matrix + """ + # Standard transformation matrix to ensure consistent coordinate frame orientation + transform = np.array( + [ + [1, 0, 0, 0], # X points right + [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) + [0, 0, -1, 0], # Z points toward viewer (flip depth) + [0, 0, 0, 1], + ] + ) + return transform + + +def visualize_clustered_point_clouds( + clustered_pcds: List[o3d.geometry.PointCloud], + window_name: str = "Clustered Point Clouds", + point_size: float = 2.0, + show_coordinate_frame: bool = True, + coordinate_frame_size: float = 0.1, +) -> None: + """ + Visualize multiple clustered point clouds with different colors. 
+ + Args: + clustered_pcds: List of point clouds (already colored) + window_name: Name of the visualization window + point_size: Size of points in the visualization + show_coordinate_frame: Whether to show coordinate frame + coordinate_frame_size: Size of the coordinate frame + """ + if not clustered_pcds: + print("Warning: No clustered point clouds to visualize") + return + + # Apply standard coordinate transformation + transform = get_standard_coordinate_transform() + geometries = [] + for pcd in clustered_pcds: + pcd_copy = o3d.geometry.PointCloud(pcd) + pcd_copy.transform(transform) + geometries.append(pcd_copy) + + # Add coordinate frame + if show_coordinate_frame: + coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=coordinate_frame_size + ) + coordinate_frame.transform(transform) + geometries.append(coordinate_frame) + + total_points = sum(len(np.asarray(pcd.points)) for pcd in clustered_pcds) + print(f"Visualizing {len(clustered_pcds)} clusters with {total_points} total points") + + try: + vis = o3d.visualization.Visualizer() + vis.create_window(window_name=window_name, width=1280, height=720) + for geom in geometries: + vis.add_geometry(geom) + render_option = vis.get_render_option() + render_option.point_size = point_size + vis.run() + vis.destroy_window() + except Exception as e: + print(f"Failed to create interactive visualization: {e}") + o3d.visualization.draw_geometries( + geometries, window_name=window_name, width=1280, height=720 + ) + + +def visualize_pcd( + pcd: o3d.geometry.PointCloud, + window_name: str = "Point Cloud Visualization", + point_size: float = 1.0, + show_coordinate_frame: bool = True, + coordinate_frame_size: float = 0.1, +) -> None: + """ + Visualize an Open3D point cloud using Open3D's visualization window. 
+ + Args: + pcd: Open3D point cloud to visualize + window_name: Name of the visualization window + point_size: Size of points in the visualization + show_coordinate_frame: Whether to show coordinate frame + coordinate_frame_size: Size of the coordinate frame + """ + if pcd is None: + print("Warning: Point cloud is None, nothing to visualize") + return + + if len(np.asarray(pcd.points)) == 0: + print("Warning: Point cloud is empty, nothing to visualize") + return + + # Apply standard coordinate transformation + transform = get_standard_coordinate_transform() + pcd_copy = o3d.geometry.PointCloud(pcd) + pcd_copy.transform(transform) + geometries = [pcd_copy] + + # Add coordinate frame + if show_coordinate_frame: + coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=coordinate_frame_size + ) + coordinate_frame.transform(transform) + geometries.append(coordinate_frame) + + print(f"Visualizing point cloud with {len(np.asarray(pcd.points))} points") + + try: + vis = o3d.visualization.Visualizer() + vis.create_window(window_name=window_name, width=1280, height=720) + for geom in geometries: + vis.add_geometry(geom) + render_option = vis.get_render_option() + render_option.point_size = point_size + vis.run() + vis.destroy_window() + except Exception as e: + print(f"Failed to create interactive visualization: {e}") + o3d.visualization.draw_geometries( + geometries, window_name=window_name, width=1280, height=720 + ) + + +def visualize_voxel_grid( + voxel_grid: o3d.geometry.VoxelGrid, + window_name: str = "Voxel Grid Visualization", + show_coordinate_frame: bool = True, + coordinate_frame_size: float = 0.1, +) -> None: + """ + Visualize an Open3D voxel grid using Open3D's visualization window. 
+ + Args: + voxel_grid: Open3D voxel grid to visualize + window_name: Name of the visualization window + show_coordinate_frame: Whether to show coordinate frame + coordinate_frame_size: Size of the coordinate frame + """ + if voxel_grid is None: + print("Warning: Voxel grid is None, nothing to visualize") + return + + if len(voxel_grid.get_voxels()) == 0: + print("Warning: Voxel grid is empty, nothing to visualize") + return + + # VoxelGrid doesn't support transform, so we need to transform the source points instead + # For now, just visualize as-is with transformed coordinate frame + geometries = [voxel_grid] + + # Add coordinate frame + if show_coordinate_frame: + coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=coordinate_frame_size + ) + coordinate_frame.transform(get_standard_coordinate_transform()) + geometries.append(coordinate_frame) + + print(f"Visualizing voxel grid with {len(voxel_grid.get_voxels())} voxels") + + try: + vis = o3d.visualization.Visualizer() + vis.create_window(window_name=window_name, width=1280, height=720) + for geom in geometries: + vis.add_geometry(geom) + vis.run() + vis.destroy_window() + except Exception as e: + print(f"Failed to create interactive visualization: {e}") + o3d.visualization.draw_geometries( + geometries, window_name=window_name, width=1280, height=720 + ) + + +def combine_object_pointclouds( + point_clouds: Union[List[np.ndarray], List[o3d.geometry.PointCloud]], + colors: Optional[List[np.ndarray]] = None, +) -> o3d.geometry.PointCloud: + """ + Combine multiple point clouds into a single Open3D point cloud. 
+
+    Args:
+        point_clouds: List of point clouds as numpy arrays or Open3D point clouds
+        colors: List of colors as numpy arrays
+    Returns:
+        Combined Open3D point cloud
+    """
+    all_points = []
+    all_colors = []
+
+    for i, pcd in enumerate(point_clouds):
+        if isinstance(pcd, np.ndarray):
+            points = pcd[:, :3]
+            all_points.append(points)
+            if colors:
+                all_colors.append(colors[i])
+
+        elif isinstance(pcd, o3d.geometry.PointCloud):
+            points = np.asarray(pcd.points)
+            all_points.append(points)
+            if pcd.has_colors():
+                cloud_colors = np.asarray(pcd.colors)
+                all_colors.append(cloud_colors)
+
+    if not all_points:
+        return o3d.geometry.PointCloud()
+
+    combined_pcd = o3d.geometry.PointCloud()
+    combined_pcd.points = o3d.utility.Vector3dVector(np.vstack(all_points))
+
+    if all_colors:
+        combined_pcd.colors = o3d.utility.Vector3dVector(np.vstack(all_colors))
+
+    return combined_pcd
diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py
index d33c7faa0d..fcf27584e6 100644
--- a/dimos/perception/segmentation/sam_2d_seg.py
+++ b/dimos/perception/segmentation/sam_2d_seg.py
@@ -181,7 +181,17 @@ def process_image(self, image):
                 tracked_names,
             )
         else:
-            # Return filtered results directly if tracker is disabled
+            # When tracker disabled, just use the filtered results directly
+            if self.use_analyzer:
+                # Add unanalyzed IDs to the analysis queue
+                for track_id in filtered_track_ids:
+                    if (
+                        track_id not in self.object_names
+                        and track_id not in self.to_be_analyzed
+                    ):
+                        self.to_be_analyzed.append(track_id)
+
+            # Simply return filtered results
             return (
                 filtered_masks,
                 filtered_bboxes,
@@ -299,6 +309,9 @@ def main():
     # Example 4: Basic segmentation only (both tracker and analyzer disabled)
     # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=False)
 
+    # Example 5: Analyzer without tracker (new capability)
+    # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=True)
+
     try:
         while cap.isOpened():
             ret, frame = cap.read()
@@ -311,7 +324,7
@@ def main(): masks, bboxes, target_ids, probs, names = segmenter.process_image(frame) # Run analysis if enabled - if segmenter.use_tracker and segmenter.use_analyzer: + if segmenter.use_analyzer: segmenter.run_analysis(frame, bboxes, target_ids) names = segmenter.get_object_names(target_ids, names) diff --git a/dimos/perception/segmentation/utils.py b/dimos/perception/segmentation/utils.py index c96a7d4a64..4101edfa40 100644 --- a/dimos/perception/segmentation/utils.py +++ b/dimos/perception/segmentation/utils.py @@ -241,6 +241,12 @@ def plot_results(image, masks, bboxes, track_ids, probs, names, alpha=0.5): if isinstance(mask, torch.Tensor): mask = mask.cpu().numpy() + # Ensure mask is in proper format for OpenCV resize + if mask.dtype == bool: + mask = mask.astype(np.uint8) + elif mask.dtype != np.uint8 and mask.dtype != np.float32: + mask = mask.astype(np.float32) + mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_LINEAR) # Generate consistent color based on track_id diff --git a/dimos/stream/stereo_camera_streams/zed.py b/dimos/stream/stereo_camera_streams/zed.py new file mode 100644 index 0000000000..ceb7eac58b --- /dev/null +++ b/dimos/stream/stereo_camera_streams/zed.py @@ -0,0 +1,164 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import time +from typing import Optional, Dict + +from reactivex import Observable, create +from reactivex import operators as ops + +from dimos.hardware.zed_camera import ZEDCamera + +try: + import pyzed.sl as sl +except ImportError: + sl = None + logging.warning("ZED SDK not found. Please install pyzed to use ZED camera functionality.") + +logger = logging.getLogger(__name__) + + +class ZEDCameraStream: + """ZED Camera stream that provides RGB and depth data as observables.""" + + def __init__( + self, + camera_id: int = 0, + resolution: Optional["sl.RESOLUTION"] = None, + depth_mode: Optional["sl.DEPTH_MODE"] = None, + fps: int = 30, + ): + """ + Initialize ZED camera stream. + + Args: + camera_id: Camera ID (0 for first ZED) + resolution: ZED camera resolution (defaults to HD720) + depth_mode: Depth computation mode (defaults to NEURAL) + fps: Camera frame rate (default: 30) + """ + if sl is None: + raise ImportError("ZED SDK not installed. Please install pyzed package.") + + self.camera_id = camera_id + self.fps = fps + + # Set default values if not provided + if resolution is None: + resolution = sl.RESOLUTION.HD720 + if depth_mode is None: + depth_mode = sl.DEPTH_MODE.NEURAL + + self.resolution = resolution + self.depth_mode = depth_mode + + # Initialize ZED camera + self.zed_camera = ZEDCamera( + camera_id=camera_id, resolution=resolution, depth_mode=depth_mode, fps=fps + ) + + self.is_opened = False + + def _initialize_camera(self) -> None: + """Initialize the ZED camera if not already initialized.""" + if not self.is_opened: + if not self.zed_camera.open(): + raise RuntimeError(f"Failed to open ZED camera {self.camera_id}") + self.is_opened = True + logger.info(f"ZED camera {self.camera_id} opened successfully") + + def create_stream(self) -> Observable: + """ + Create an observable stream of RGB and depth frames. + + Returns: + Observable: An observable emitting dictionaries with 'rgb' and 'depth' keys. 
+ """ + + def emit_frames(observer, scheduler): + try: + # Initialize camera + if not self.is_opened: + self._initialize_camera() + + while True: + # Capture frame directly + left_img, right_img, depth_img = self.zed_camera.capture_frame() + + if left_img is not None and depth_img is not None: + frame_data = { + "rgb": left_img, + "depth": depth_img, + "right": right_img, + "timestamp": time.time(), + } + + observer.on_next(frame_data) + + except Exception as e: + logger.error(f"Error during ZED frame emission: {e}") + observer.on_error(e) + finally: + # Clean up resources + self._cleanup_camera() + observer.on_completed() + + return create(emit_frames).pipe( + ops.share(), # Share the stream among multiple subscribers + ) + + def get_camera_info(self) -> Dict[str, float]: + """ + Get ZED camera intrinsics (fx, fy, cx, cy). + + Returns: + Dictionary containing camera intrinsics: fx, fy, cx, cy + """ + if not self.is_opened: + self._initialize_camera() + + try: + camera_info = self.zed_camera.get_camera_info() + left_cam = camera_info.get("left_cam", {}) + + return { + "fx": left_cam.get("fx", 0.0), + "fy": left_cam.get("fy", 0.0), + "cx": left_cam.get("cx", 0.0), + "cy": left_cam.get("cy", 0.0), + } + except Exception as e: + logger.error(f"Error getting camera info: {e}") + return {} + + def _cleanup_camera(self) -> None: + """Clean up camera resources.""" + if self.is_opened: + self.zed_camera.close() + self.is_opened = False + logger.info("ZED camera resources cleaned up") + + def cleanup(self) -> None: + """Clean up all resources.""" + self._cleanup_camera() + + def __enter__(self): + """Context manager entry.""" + self._initialize_camera() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self._cleanup_camera() diff --git a/dimos/types/manipulation.py b/dimos/types/manipulation.py index d61d73a7ed..fee4e69ebb 100644 --- a/dimos/types/manipulation.py +++ b/dimos/types/manipulation.py @@ -13,7 +13,7 @@ # limitations 
under the License. from enum import Enum -from typing import Dict, List, Optional, Any, Union, TypedDict, Tuple, Literal +from typing import Dict, List, Optional, Any, Union, TypedDict, Tuple, Literal, TYPE_CHECKING from dataclasses import dataclass, field, fields from abc import ABC, abstractmethod import uuid @@ -21,6 +21,9 @@ import time from dimos.types.vector import Vector +if TYPE_CHECKING: + import open3d as o3d + class ConstraintType(Enum): """Types of manipulation constraints.""" @@ -72,6 +75,7 @@ class ForceConstraint(AbstractConstraint): class ObjectData(TypedDict, total=False): """Data about an object in the manipulation scene.""" + # Basic detection information object_id: int # Unique ID for the object bbox: List[float] # Bounding box [x1, y1, x2, y2] depth: float # Depth in meters from Metric3d @@ -80,9 +84,16 @@ class ObjectData(TypedDict, total=False): label: str # Semantic label (e.g., 'cup', 'table') movement_tolerance: float # (0.0 = immovable, 1.0 = freely movable) segmentation_mask: np.ndarray # Binary mask of the object's pixels - position: Dict[str, float] # 3D position {x, y, z} - rotation: Dict[str, float] # 3D rotation {roll, pitch, yaw} - size: Dict[str, float] # Object dimensions {width, height} + + # 3D pose and dimensions + position: Union[Dict[str, float], Vector] # 3D position {x, y, z} or Vector + rotation: Union[Dict[str, float], Vector] # 3D rotation {roll, pitch, yaw} or Vector + size: Dict[str, float] # Object dimensions {width, height, depth} + + # Point cloud data + point_cloud: "o3d.geometry.PointCloud" # Open3D point cloud object + point_cloud_numpy: np.ndarray # Nx6 array of XYZRGB points + color: np.ndarray # RGB color for visualization [R, G, B] class ManipulationMetadata(TypedDict, total=False): diff --git a/tests/test_manipulation_perception_pipeline.py.py b/tests/test_manipulation_perception_pipeline.py.py new file mode 100644 index 0000000000..8b333ec310 --- /dev/null +++ 
b/tests/test_manipulation_perception_pipeline.py.py @@ -0,0 +1,167 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import time +import threading +from reactivex import operators as ops + +import tests.test_header + +from pyzed import sl +from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream +from dimos.web.robot_web_interface import RobotWebInterface +from dimos.utils.logging_config import logger +from dimos.perception.manip_aio_pipeline import ManipulationPipeline + + +def monitor_grasps(pipeline): + """Monitor and print grasp updates in a separate thread.""" + print(" Grasp monitor started...") + + last_grasp_count = 0 + last_update_time = time.time() + + while True: + try: + # Get latest grasps using the getter function + grasps = pipeline.get_latest_grasps(timeout=0.5) + current_time = time.time() + + if grasps is not None: + current_count = len(grasps) + if current_count != last_grasp_count: + print(f" Grasps received: {current_count} (at {time.strftime('%H:%M:%S')})") + if current_count > 0: + best_score = max(grasp.get("score", 0.0) for grasp in grasps) + print(f" Best grasp score: {best_score:.3f}") + last_grasp_count = current_count + last_update_time = current_time + else: + # Show periodic "still waiting" message + if current_time - last_update_time > 10.0: + print(f" Still waiting for grasps... 
({time.strftime('%H:%M:%S')})") + last_update_time = current_time + + time.sleep(1.0) # Check every second + + except Exception as e: + print(f" Error in grasp monitor: {e}") + time.sleep(2.0) + + +def main(): + """Test point cloud filtering with grasp generation using ManipulationPipeline.""" + print(" Testing point cloud filtering + grasp generation with ManipulationPipeline...") + + # Configuration + min_confidence = 0.6 + web_port = 5555 + grasp_server_url = "ws://18.224.39.74:8000/ws/grasp" + + try: + # Initialize ZED camera stream + zed_stream = ZEDCameraStream(resolution=sl.RESOLUTION.HD1080, fps=10) + + # Get camera intrinsics + camera_intrinsics_dict = zed_stream.get_camera_info() + camera_intrinsics = [ + camera_intrinsics_dict["fx"], + camera_intrinsics_dict["fy"], + camera_intrinsics_dict["cx"], + camera_intrinsics_dict["cy"], + ] + + # Create the concurrent manipulation pipeline WITH grasp generation + pipeline = ManipulationPipeline( + camera_intrinsics=camera_intrinsics, + min_confidence=min_confidence, + max_objects=10, + grasp_server_url=grasp_server_url, + enable_grasp_generation=True, # Enable grasp generation + ) + + # Create ZED stream + zed_frame_stream = zed_stream.create_stream().pipe(ops.share()) + + # Create concurrent processing streams + streams = pipeline.create_streams(zed_frame_stream) + detection_viz_stream = streams["detection_viz"] + pointcloud_viz_stream = streams["pointcloud_viz"] + grasps_stream = streams.get("grasps") # Get grasp stream if available + grasp_overlay_stream = streams.get("grasp_overlay") # Get grasp overlay stream if available + + except ImportError: + print("Error: ZED SDK not installed. 
Please install pyzed package.") + sys.exit(1) + except RuntimeError as e: + print(f"Error: Failed to open ZED camera: {e}") + sys.exit(1) + + try: + # Set up web interface with concurrent visualization streams + print("Initializing web interface...") + web_interface = RobotWebInterface( + port=web_port, + object_detection=detection_viz_stream, + pointcloud_stream=pointcloud_viz_stream, + grasp_overlay_stream=grasp_overlay_stream, + ) + + # Start grasp monitoring in background thread + grasp_monitor_thread = threading.Thread( + target=monitor_grasps, args=(pipeline,), daemon=True + ) + grasp_monitor_thread.start() + + print(f"\n Point Cloud + Grasp Generation Test Running:") + print(f" Web Interface: http://localhost:{web_port}") + print(f" Object Detection View: RGB with bounding boxes") + print(f" Point Cloud View: Depth with colored point clouds and 3D bounding boxes") + print(f" Confidence threshold: {min_confidence}") + print(f" Grasp server: {grasp_server_url}") + print(f" Available streams: {list(streams.keys())}") + print("\nPress Ctrl+C to stop the test\n") + + # Start web server (blocking call) + web_interface.run() + + except KeyboardInterrupt: + print("\nTest interrupted by user") + except Exception as e: + print(f"Error during test: {e}") + finally: + print("Cleaning up resources...") + if "zed_stream" in locals(): + zed_stream.cleanup() + if "pipeline" in locals(): + pipeline.cleanup() + print("Test completed") + + +if __name__ == "__main__": + main() diff --git a/tests/test_manipulation_pipeline_single_frame.py b/tests/test_manipulation_pipeline_single_frame.py new file mode 100644 index 0000000000..1fc42d0810 --- /dev/null +++ b/tests/test_manipulation_pipeline_single_frame.py @@ -0,0 +1,248 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test manipulation processor with direct visualization and grasp data output.""" + +import os +import sys +import cv2 +import numpy as np +import time +import argparse +import matplotlib + +# Try to use TkAgg backend for live display, fallback to Agg if not available +try: + matplotlib.use("TkAgg") +except: + try: + matplotlib.use("Qt5Agg") + except: + matplotlib.use("Agg") # Fallback to non-interactive +import matplotlib.pyplot as plt +import open3d as o3d +from typing import Dict, List + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid +from dimos.perception.manip_aio_processer import ManipulationProcessor +from dimos.perception.pointcloud.utils import ( + load_camera_matrix_from_yaml, + visualize_pcd, + combine_object_pointclouds, +) +from dimos.utils.logging_config import setup_logger + +from dimos.perception.grasp_generation.utils import visualize_grasps_3d, create_grasp_overlay + +logger = setup_logger("test_pipeline_viz") + + +def load_first_frame(data_dir: str): + """Load first RGB-D frame and camera intrinsics.""" + # Load images + color_img = cv2.imread(os.path.join(data_dir, "color", "00000.png")) + color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) + + depth_img = cv2.imread(os.path.join(data_dir, "depth", "00000.png"), cv2.IMREAD_ANYDEPTH) + if depth_img.dtype == np.uint16: + depth_img = depth_img.astype(np.float32) / 1000.0 + # Load intrinsics + camera_matrix = 
load_camera_matrix_from_yaml(os.path.join(data_dir, "color_camera_info.yaml")) + intrinsics = [ + camera_matrix[0, 0], + camera_matrix[1, 1], + camera_matrix[0, 2], + camera_matrix[1, 2], + ] + + return color_img, depth_img, intrinsics + + +def create_point_cloud(color_img, depth_img, intrinsics): + """Create Open3D point cloud.""" + fx, fy, cx, cy = intrinsics + height, width = depth_img.shape + + o3d_intrinsics = o3d.camera.PinholeCameraIntrinsic(width, height, fx, fy, cx, cy) + color_o3d = o3d.geometry.Image(color_img) + depth_o3d = o3d.geometry.Image((depth_img * 1000).astype(np.uint16)) + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1000.0, convert_rgb_to_intensity=False + ) + + return o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, o3d_intrinsics) + + +def run_processor(color_img, depth_img, intrinsics, grasp_server_url=None): + """Run processor and collect results.""" + processor_kwargs = { + "camera_intrinsics": intrinsics, + "enable_grasp_generation": True, + "enable_segmentation": True, + } + + if grasp_server_url: + processor_kwargs["grasp_server_url"] = grasp_server_url + + processor = ManipulationProcessor(**processor_kwargs) + + # Process frame without grasp generation + results = processor.process_frame(color_img, depth_img, generate_grasps=False) + + # Run grasp generation separately + grasps = processor.run_grasp_generation(results["all_objects"], results["full_pointcloud"]) + results["grasps"] = grasps + results["grasp_overlay"] = create_grasp_overlay(color_img, grasps, intrinsics) + + processor.cleanup() + return results + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data-dir", default="assets/rgbd_data") + parser.add_argument("--wait-time", type=float, default=5.0) + parser.add_argument( + "--grasp-server-url", + default="ws://18.224.39.74:8000/ws/grasp", + help="WebSocket URL for Dimensional Grasp server", + ) + args = parser.parse_args() + + # Load data + 
color_img, depth_img, intrinsics = load_first_frame(args.data_dir) + logger.info(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}") + + # Run processor + results = run_processor(color_img, depth_img, intrinsics, args.grasp_server_url) + + # Print results summary + print(f"Processing time: {results.get('processing_time', 0):.3f}s") + print(f"Detection objects: {len(results.get('detected_objects', []))}") + print(f"All objects processed: {len(results.get('all_objects', []))}") + + # Print grasp summary + grasp_data = results["grasps"] + total_grasps = len(grasp_data) if isinstance(grasp_data, list) else 0 + best_score = max(grasp["score"] for grasp in grasp_data) if grasp_data else 0 + + print(f"Grasps: {total_grasps} total (best score: {best_score:.3f})") + + # Create visualizations + plot_configs = [] + if results["detection_viz"] is not None: + plot_configs.append(("detection_viz", "Object Detection")) + if results["segmentation_viz"] is not None: + plot_configs.append(("segmentation_viz", "Semantic Segmentation")) + if results["pointcloud_viz"] is not None: + plot_configs.append(("pointcloud_viz", "All Objects Point Cloud")) + if results["detected_pointcloud_viz"] is not None: + plot_configs.append(("detected_pointcloud_viz", "Detection Objects Point Cloud")) + if results["misc_pointcloud_viz"] is not None: + plot_configs.append(("misc_pointcloud_viz", "Misc/Background Points")) + if results["grasp_overlay"] is not None: + plot_configs.append(("grasp_overlay", "Grasp Overlay")) + + # Create subplot layout + num_plots = len(plot_configs) + if num_plots <= 3: + fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 5)) + else: + rows = 2 + cols = (num_plots + 1) // 2 + fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 5 * rows)) + + if num_plots == 1: + axes = [axes] + elif num_plots > 2: + axes = axes.flatten() + + # Plot each result + for i, (key, title) in enumerate(plot_configs): + axes[i].imshow(results[key]) + 
axes[i].set_title(title) + axes[i].axis("off") + + # Hide unused subplots + if num_plots > 3: + for i in range(num_plots, len(axes)): + axes[i].axis("off") + + plt.tight_layout() + plt.savefig("manipulation_results.png", dpi=150, bbox_inches="tight") + plt.show(block=True) + plt.close() + + point_clouds = [obj["point_cloud"] for obj in results["all_objects"]] + colors = [obj["color"] for obj in results["all_objects"]] + combined_pcd = combine_object_pointclouds(point_clouds, colors) + + # 3D Grasp visualization + if grasp_data: + # Convert grasp format to visualization format for 3D display + viz_grasps = [] + for grasp in grasp_data: + translation = grasp.get("translation", [0, 0, 0]) + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3).tolist())) + score = grasp.get("score", 0.0) + width = grasp.get("width", 0.08) + + viz_grasp = { + "translation": translation, + "rotation_matrix": rotation_matrix, + "width": width, + "score": score, + } + viz_grasps.append(viz_grasp) + + # Use unified 3D visualization + visualize_grasps_3d(combined_pcd, viz_grasps) + + # Visualize full point cloud + visualize_pcd( + results["full_pointcloud"], + window_name="Full Scene Point Cloud", + point_size=2.0, + show_coordinate_frame=True, + ) + + # Visualize all objects point cloud + visualize_pcd( + combined_pcd, + window_name="All Objects Point Cloud", + point_size=3.0, + show_coordinate_frame=True, + ) + + # Visualize misc clusters + visualize_clustered_point_clouds( + results["misc_clusters"], + window_name="Misc/Background Clusters (DBSCAN)", + point_size=3.0, + show_coordinate_frame=True, + ) + + # Visualize voxel grid + visualize_voxel_grid( + results["misc_voxel_grid"], + window_name="Misc/Background Voxel Grid", + show_coordinate_frame=True, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/test_manipulation_pipeline_single_frame_lcm.py b/tests/test_manipulation_pipeline_single_frame_lcm.py new file mode 100644 index 0000000000..d8090575eb --- 
/dev/null +++ b/tests/test_manipulation_pipeline_single_frame_lcm.py @@ -0,0 +1,431 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test manipulation processor with LCM topic subscription.""" + +import os +import sys +import cv2 +import numpy as np +import time +import argparse +import threading +import pickle +import matplotlib +import json +import copy + +# Try to use TkAgg backend for live display, fallback to Agg if not available +try: + matplotlib.use("TkAgg") +except: + try: + matplotlib.use("Qt5Agg") + except: + matplotlib.use("Agg") # Fallback to non-interactive +import matplotlib.pyplot as plt +import open3d as o3d +from typing import Dict, List, Optional + +# LCM imports +import lcm +from lcm_msgs.sensor_msgs import Image as LCMImage +from lcm_msgs.sensor_msgs import CameraInfo as LCMCameraInfo + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid +from dimos.perception.manip_aio_processer import ManipulationProcessor +from dimos.perception.grasp_generation.utils import visualize_grasps_3d +from dimos.perception.pointcloud.utils import visualize_pcd +from dimos.utils.logging_config import setup_logger + +logger = setup_logger("test_pipeline_lcm") + + +class LCMDataCollector: + """Collects one message from each required LCM topic.""" + + def __init__(self, lcm_url: str = 
"udpm://239.255.76.67:7667?ttl=1"): + self.lcm = lcm.LCM(lcm_url) + + # Data storage + self.rgb_data: Optional[np.ndarray] = None + self.depth_data: Optional[np.ndarray] = None + self.camera_intrinsics: Optional[List[float]] = None + + # Synchronization + self.data_lock = threading.Lock() + self.data_ready_event = threading.Event() + + # Flags to track received messages + self.rgb_received = False + self.depth_received = False + self.camera_info_received = False + + # Subscribe to topics + self.lcm.subscribe("head_cam_rgb#sensor_msgs.Image", self._handle_rgb_message) + self.lcm.subscribe("head_cam_depth#sensor_msgs.Image", self._handle_depth_message) + self.lcm.subscribe("head_cam_info#sensor_msgs.CameraInfo", self._handle_camera_info_message) + + logger.info("LCM Data Collector initialized") + logger.info("Subscribed to topics:") + logger.info(" - head_cam_rgb#sensor_msgs.Image") + logger.info(" - head_cam_depth#sensor_msgs.Image") + logger.info(" - head_cam_info#sensor_msgs.CameraInfo") + + def _handle_rgb_message(self, channel: str, data: bytes): + """Handle RGB image message.""" + if self.rgb_received: + return # Already got one, ignore subsequent messages + + try: + msg = LCMImage.decode(data) + + # Convert message data to numpy array + if msg.encoding == "rgb8": + # RGB8 format: 3 bytes per pixel + rgb_array = np.frombuffer(msg.data[: msg.data_length], dtype=np.uint8) + rgb_image = rgb_array.reshape((msg.height, msg.width, 3)) + + with self.data_lock: + self.rgb_data = rgb_image + self.rgb_received = True + logger.info( + f"RGB message received: {msg.width}x{msg.height}, encoding: {msg.encoding}" + ) + self._check_all_data_received() + + else: + logger.warning(f"Unsupported RGB encoding: {msg.encoding}") + + except Exception as e: + logger.error(f"Error processing RGB message: {e}") + + def _handle_depth_message(self, channel: str, data: bytes): + """Handle depth image message.""" + if self.depth_received: + return # Already got one, ignore subsequent 
messages + + try: + msg = LCMImage.decode(data) + + # Convert message data to numpy array + if msg.encoding == "32FC1": + # 32FC1 format: 4 bytes (float32) per pixel + depth_array = np.frombuffer(msg.data[: msg.data_length], dtype=np.float32) + depth_image = depth_array.reshape((msg.height, msg.width)) + + with self.data_lock: + self.depth_data = depth_image + self.depth_received = True + logger.info( + f"Depth message received: {msg.width}x{msg.height}, encoding: {msg.encoding}" + ) + logger.info( + f"Depth range: {depth_image.min():.3f} - {depth_image.max():.3f} meters" + ) + self._check_all_data_received() + + else: + logger.warning(f"Unsupported depth encoding: {msg.encoding}") + + except Exception as e: + logger.error(f"Error processing depth message: {e}") + + def _handle_camera_info_message(self, channel: str, data: bytes): + """Handle camera info message.""" + if self.camera_info_received: + return # Already got one, ignore subsequent messages + + try: + msg = LCMCameraInfo.decode(data) + + # Extract intrinsics from K matrix: [fx, 0, cx, 0, fy, cy, 0, 0, 1] + K = msg.K + fx = K[0] # K[0,0] + fy = K[4] # K[1,1] + cx = K[2] # K[0,2] + cy = K[5] # K[1,2] + + intrinsics = [fx, fy, cx, cy] + + with self.data_lock: + self.camera_intrinsics = intrinsics + self.camera_info_received = True + logger.info(f"Camera info received: {msg.width}x{msg.height}") + logger.info(f"Intrinsics: fx={fx:.1f}, fy={fy:.1f}, cx={cx:.1f}, cy={cy:.1f}") + self._check_all_data_received() + + except Exception as e: + logger.error(f"Error processing camera info message: {e}") + + def _check_all_data_received(self): + """Check if all required data has been received.""" + if self.rgb_received and self.depth_received and self.camera_info_received: + logger.info("✅ All required data received!") + self.data_ready_event.set() + + def wait_for_data(self, timeout: float = 30.0) -> bool: + """Wait for all data to be received.""" + logger.info("Waiting for RGB, depth, and camera info messages...") + 
+ # Start LCM handling in a separate thread + lcm_thread = threading.Thread(target=self._lcm_handle_loop, daemon=True) + lcm_thread.start() + + # Wait for data with timeout + return self.data_ready_event.wait(timeout) + + def _lcm_handle_loop(self): + """LCM message handling loop.""" + try: + while not self.data_ready_event.is_set(): + self.lcm.handle_timeout(100) # 100ms timeout + except Exception as e: + logger.error(f"Error in LCM handling loop: {e}") + + def get_data(self): + """Get the collected data.""" + with self.data_lock: + return self.rgb_data, self.depth_data, self.camera_intrinsics + + +def create_point_cloud(color_img, depth_img, intrinsics): + """Create Open3D point cloud.""" + fx, fy, cx, cy = intrinsics + height, width = depth_img.shape + + o3d_intrinsics = o3d.camera.PinholeCameraIntrinsic(width, height, fx, fy, cx, cy) + color_o3d = o3d.geometry.Image(color_img) + depth_o3d = o3d.geometry.Image((depth_img * 1000).astype(np.uint16)) + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1000.0, convert_rgb_to_intensity=False + ) + + return o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, o3d_intrinsics) + + +def run_processor(color_img, depth_img, intrinsics): + """Run processor and collect results.""" + # Create processor + processor = ManipulationProcessor( + camera_intrinsics=intrinsics, + grasp_server_url="ws://18.224.39.74:8000/ws/grasp", + enable_grasp_generation=False, + enable_segmentation=True, + ) + + # Process single frame directly + results = processor.process_frame(color_img, depth_img) + + # Debug: print available results + print(f"Available results: {list(results.keys())}") + + processor.cleanup() + + return results + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lcm-url", default="udpm://239.255.76.67:7667?ttl=1", help="LCM URL for subscription" + ) + parser.add_argument( + "--timeout", type=float, default=30.0, help="Timeout in seconds to wait for 
messages" + ) + parser.add_argument( + "--save-images", action="store_true", help="Save received RGB and depth images to files" + ) + args = parser.parse_args() + + # Create data collector + collector = LCMDataCollector(args.lcm_url) + + # Wait for data + if not collector.wait_for_data(args.timeout): + logger.error(f"Timeout waiting for data after {args.timeout} seconds") + logger.error("Make sure Unity is running and publishing to the LCM topics") + return + + # Get the collected data + color_img, depth_img, intrinsics = collector.get_data() + + logger.info(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}") + logger.info(f"Intrinsics: {intrinsics}") + + # Save images if requested + if args.save_images: + try: + cv2.imwrite("received_rgb.png", cv2.cvtColor(color_img, cv2.COLOR_RGB2BGR)) + # Save depth as 16-bit for visualization + depth_viz = (np.clip(depth_img * 1000, 0, 65535)).astype(np.uint16) + cv2.imwrite("received_depth.png", depth_viz) + logger.info("Saved received_rgb.png and received_depth.png") + except Exception as e: + logger.warning(f"Failed to save images: {e}") + + # Run processor + results = run_processor(color_img, depth_img, intrinsics) + + # Debug: Print what we received + print(f"\n✅ Processor Results:") + print(f" Available results: {list(results.keys())}") + print(f" Processing time: {results.get('processing_time', 0):.3f}s") + + # Show timing breakdown if available + if "timing_breakdown" in results: + breakdown = results["timing_breakdown"] + print(f" Timing breakdown:") + print(f" - Detection: {breakdown.get('detection', 0):.3f}s") + print(f" - Segmentation: {breakdown.get('segmentation', 0):.3f}s") + print(f" - Point cloud: {breakdown.get('pointcloud', 0):.3f}s") + print(f" - Misc extraction: {breakdown.get('misc_extraction', 0):.3f}s") + + # Print object information + detected_count = len(results.get("detected_objects", [])) + all_count = len(results.get("all_objects", [])) + + print(f" Detection objects: 
{detected_count}") + print(f" All objects processed: {all_count}") + + # Print misc clusters information + if "misc_clusters" in results and results["misc_clusters"]: + cluster_count = len(results["misc_clusters"]) + total_misc_points = sum( + len(np.asarray(cluster.points)) for cluster in results["misc_clusters"] + ) + print(f" Misc clusters: {cluster_count} clusters with {total_misc_points} total points") + else: + print(f" Misc clusters: None") + + # Print grasp summary + if "grasps" in results and results["grasps"]: + total_grasps = 0 + best_score = 0 + for grasp in results["grasps"]: + score = grasp.get("score", 0) + if score > best_score: + best_score = score + total_grasps += 1 + print(f" Grasps generated: {total_grasps} (best score: {best_score:.3f})") + else: + print(" Grasps: None generated") + + # Save results to pickle file + pickle_path = "manipulation_results.pkl" + print(f"\nSaving results to pickle file: {pickle_path}") + + def serialize_point_cloud(pcd): + """Convert Open3D PointCloud to serializable format.""" + if pcd is None: + return None + data = { + "points": np.asarray(pcd.points).tolist() if hasattr(pcd, "points") else [], + "colors": np.asarray(pcd.colors).tolist() + if hasattr(pcd, "colors") and pcd.colors + else [], + } + return data + + def serialize_voxel_grid(voxel_grid): + """Convert Open3D VoxelGrid to serializable format.""" + if voxel_grid is None: + return None + + # Extract voxel data + voxels = voxel_grid.get_voxels() + data = { + "voxel_size": voxel_grid.voxel_size, + "origin": np.asarray(voxel_grid.origin).tolist(), + "voxels": [ + ( + v.grid_index[0], + v.grid_index[1], + v.grid_index[2], + v.color[0], + v.color[1], + v.color[2], + ) + for v in voxels + ], + } + return data + + # Create a copy of results with non-picklable objects converted + pickle_data = { + "color_img": color_img, + "depth_img": depth_img, + "intrinsics": intrinsics, + "results": {}, + } + + # Convert and store all results, properly handling Open3D 
objects + for key, value in results.items(): + if key.endswith("_viz") or key in [ + "processing_time", + "timing_breakdown", + "detection2d_objects", + "segmentation2d_objects", + ]: + # These are already serializable + pickle_data["results"][key] = value + elif key == "full_pointcloud": + # Serialize PointCloud object + pickle_data["results"][key] = serialize_point_cloud(value) + print(f"Serialized {key}") + elif key == "misc_voxel_grid": + # Serialize VoxelGrid object + pickle_data["results"][key] = serialize_voxel_grid(value) + print(f"Serialized {key}") + elif key == "misc_clusters": + # List of PointCloud objects + if value: + serialized_clusters = [serialize_point_cloud(cluster) for cluster in value] + pickle_data["results"][key] = serialized_clusters + print(f"Serialized {key} ({len(serialized_clusters)} clusters)") + elif key == "detected_objects" or key == "all_objects": + # Objects with PointCloud attributes + serialized_objects = [] + for obj in value: + obj_dict = {k: v for k, v in obj.items() if k != "point_cloud"} + if "point_cloud" in obj: + obj_dict["point_cloud"] = serialize_point_cloud(obj.get("point_cloud")) + serialized_objects.append(obj_dict) + pickle_data["results"][key] = serialized_objects + print(f"Serialized {key} ({len(serialized_objects)} objects)") + else: + try: + # Try to pickle as is + pickle_data["results"][key] = value + print(f"Preserved {key} as is") + except (TypeError, ValueError): + print(f"Warning: Could not serialize {key}, skipping") + + with open(pickle_path, "wb") as f: + pickle.dump(pickle_data, f) + + print(f"Results saved successfully with all 3D data serialized!") + print(f"Pickled data keys: {list(pickle_data['results'].keys())}") + + # Visualization code has been moved to visualization_script.py + # The results have been pickled and can be loaded from there + print("\nVisualization code has been moved to visualization_script.py") + print("Run 'python visualization_script.py' to visualize the results") + + +if 
__name__ == "__main__": + main() diff --git a/tests/test_pointcloud_filtering.py b/tests/test_pointcloud_filtering.py new file mode 100644 index 0000000000..308b4fc6ac --- /dev/null +++ b/tests/test_pointcloud_filtering.py @@ -0,0 +1,105 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time +import threading +from reactivex import operators as ops + +import tests.test_header + +from pyzed import sl +from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream +from dimos.web.robot_web_interface import RobotWebInterface +from dimos.utils.logging_config import logger +from dimos.perception.manip_aio_pipeline import ManipulationPipeline + + +def main(): + """Test point cloud filtering using the concurrent stream-based ManipulationPipeline.""" + print("Testing point cloud filtering with ManipulationPipeline...") + + # Configuration + min_confidence = 0.6 + web_port = 5555 + + try: + # Initialize ZED camera stream + zed_stream = ZEDCameraStream(resolution=sl.RESOLUTION.HD1080, fps=10) + + # Get camera intrinsics + camera_intrinsics_dict = zed_stream.get_camera_info() + camera_intrinsics = [ + camera_intrinsics_dict["fx"], + camera_intrinsics_dict["fy"], + camera_intrinsics_dict["cx"], + camera_intrinsics_dict["cy"], + ] + + # Create the concurrent manipulation pipeline + pipeline = ManipulationPipeline( + camera_intrinsics=camera_intrinsics, + min_confidence=min_confidence, + max_objects=10, + ) + + # Create 
ZED stream + zed_frame_stream = zed_stream.create_stream().pipe(ops.share()) + + # Create concurrent processing streams + streams = pipeline.create_streams(zed_frame_stream) + detection_viz_stream = streams["detection_viz"] + pointcloud_viz_stream = streams["pointcloud_viz"] + + except ImportError: + print("Error: ZED SDK not installed. Please install pyzed package.") + sys.exit(1) + except RuntimeError as e: + print(f"Error: Failed to open ZED camera: {e}") + sys.exit(1) + + try: + # Set up web interface with concurrent visualization streams + print("Initializing web interface...") + web_interface = RobotWebInterface( + port=web_port, + object_detection=detection_viz_stream, + pointcloud_stream=pointcloud_viz_stream, + ) + + print(f"\nPoint Cloud Filtering Test Running:") + print(f"Web Interface: http://localhost:{web_port}") + print(f"Object Detection View: RGB with bounding boxes") + print(f"Point Cloud View: Depth with colored point clouds and 3D bounding boxes") + print(f"Confidence threshold: {min_confidence}") + print("\nPress Ctrl+C to stop the test\n") + + # Start web server (blocking call) + web_interface.run() + + except KeyboardInterrupt: + print("\nTest interrupted by user") + except Exception as e: + print(f"Error during test: {e}") + finally: + print("Cleaning up resources...") + if "zed_stream" in locals(): + zed_stream.cleanup() + if "pipeline" in locals(): + pipeline.cleanup() + print("Test completed") + + +if __name__ == "__main__": + main() diff --git a/tests/test_zed_setup.py b/tests/test_zed_setup.py new file mode 100755 index 0000000000..ca50bb63fb --- /dev/null +++ b/tests/test_zed_setup.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Simple test script to verify ZED camera setup and basic functionality. +""" + +import sys +from pathlib import Path + + +def test_imports(): + """Test that all required modules can be imported.""" + print("Testing imports...") + + try: + import numpy as np + + print("✓ NumPy imported successfully") + except ImportError as e: + print(f"✗ NumPy import failed: {e}") + return False + + try: + import cv2 + + print("✓ OpenCV imported successfully") + except ImportError as e: + print(f"✗ OpenCV import failed: {e}") + return False + + try: + from PIL import Image, ImageDraw, ImageFont + + print("✓ PIL imported successfully") + except ImportError as e: + print(f"✗ PIL import failed: {e}") + return False + + try: + import pyzed.sl as sl + + print("✓ ZED SDK (pyzed) imported successfully") + # Note: SDK version method varies between versions + except ImportError as e: + print(f"✗ ZED SDK import failed: {e}") + print(" Please install ZED SDK and pyzed package") + return False + + try: + from dimos.hardware.zed_camera import ZEDCamera + + print("✓ ZEDCamera class imported successfully") + except ImportError as e: + print(f"✗ ZEDCamera import failed: {e}") + return False + + try: + from dimos.perception.zed_visualizer import ZEDVisualizer + + print("✓ ZEDVisualizer class imported successfully") + except ImportError as e: + print(f"✗ ZEDVisualizer import failed: {e}") + return False + + return True + + +def test_camera_detection(): + """Test if ZED cameras are detected.""" + print("\nTesting camera detection...") + + try: + import pyzed.sl as sl + + # List 
available cameras + cameras = sl.Camera.get_device_list() + print(f"Found {len(cameras)} ZED camera(s):") + + for i, camera_info in enumerate(cameras): + print(f" Camera {i}:") + print(f" Model: {camera_info.camera_model}") + print(f" Serial: {camera_info.serial_number}") + print(f" State: {camera_info.camera_state}") + + return len(cameras) > 0 + + except Exception as e: + print(f"Error detecting cameras: {e}") + return False + + +def test_basic_functionality(): + """Test basic ZED camera functionality without actually opening the camera.""" + print("\nTesting basic functionality...") + + try: + import pyzed.sl as sl + from dimos.hardware.zed_camera import ZEDCamera + from dimos.perception.zed_visualizer import ZEDVisualizer + + # Test camera initialization (without opening) + camera = ZEDCamera( + camera_id=0, + resolution=sl.RESOLUTION.HD720, + depth_mode=sl.DEPTH_MODE.NEURAL, + ) + print("✓ ZEDCamera instance created successfully") + + # Test visualizer initialization + visualizer = ZEDVisualizer(max_depth=10.0) + print("✓ ZEDVisualizer instance created successfully") + + # Test creating a dummy visualization + dummy_rgb = np.zeros((480, 640, 3), dtype=np.uint8) + dummy_depth = np.ones((480, 640), dtype=np.float32) * 2.0 + + vis = visualizer.create_side_by_side_image(dummy_rgb, dummy_depth) + print("✓ Dummy visualization created successfully") + + return True + + except Exception as e: + print(f"✗ Basic functionality test failed: {e}") + return False + + +def main(): + """Run all tests.""" + print("ZED Camera Setup Test") + print("=" * 50) + + # Test imports + if not test_imports(): + print("\n❌ Import tests failed. Please install missing dependencies.") + return False + + # Test camera detection + cameras_found = test_camera_detection() + if not cameras_found: + print( + "\n⚠️ No ZED cameras detected. Please connect a ZED camera to test capture functionality." 
+ ) + + # Test basic functionality + if not test_basic_functionality(): + print("\n❌ Basic functionality tests failed.") + return False + + print("\n" + "=" * 50) + if cameras_found: + print("✅ All tests passed! You can now run the ZED demo:") + print(" python examples/zed_neural_depth_demo.py --display-time 10") + else: + print("✅ Setup is ready, but no camera detected.") + print(" Connect a ZED camera and run:") + print(" python examples/zed_neural_depth_demo.py --display-time 10") + + return True + + +if __name__ == "__main__": + # Add the project root to Python path + sys.path.append(str(Path(__file__).parent)) + + # Import numpy after path setup + import numpy as np + + success = main() + sys.exit(0 if success else 1) diff --git a/tests/visualization_script.py b/tests/visualization_script.py new file mode 100644 index 0000000000..d0c4c6af84 --- /dev/null +++ b/tests/visualization_script.py @@ -0,0 +1,1041 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Visualize pickled manipulation pipeline results.""" + +import os +import sys +import pickle +import numpy as np +import json +import matplotlib + +# Try to use TkAgg backend for live display, fallback to Agg if not available +try: + matplotlib.use("TkAgg") +except: + try: + matplotlib.use("Qt5Agg") + except: + matplotlib.use("Agg") # Fallback to non-interactive +import matplotlib.pyplot as plt +import open3d as o3d + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid +from dimos.perception.grasp_generation.utils import visualize_grasps_3d +from dimos.perception.pointcloud.utils import visualize_pcd +from dimos.utils.logging_config import setup_logger +import trimesh + +import tf_lcm_py +import cv2 +from contextlib import contextmanager +import lcm_msgs +from lcm_msgs.sensor_msgs import JointState, PointCloud2, CameraInfo, PointCloud2, PointField +from lcm_msgs.std_msgs import Header +from typing import List, Tuple, Optional +import atexit +from datetime import datetime +import time + +from pydrake.all import ( + AddMultibodyPlantSceneGraph, + CoulombFriction, + Diagram, + DiagramBuilder, + InverseKinematics, + MeshcatVisualizer, + MeshcatVisualizerParams, + MultibodyPlant, + Parser, + RigidTransform, + RollPitchYaw, + RotationMatrix, + JointIndex, + Solve, + StartMeshcat, +) +from pydrake.geometry import ( + CollisionFilterDeclaration, + Mesh, + ProximityProperties, + InMemoryMesh, + Box, + Cylinder, +) +from pydrake.math import RigidTransform as DrakeRigidTransform +from pydrake.common import MemoryFile + +from pydrake.all import ( + MinimumDistanceLowerBoundConstraint, + MultibodyPlant, + Parser, + DiagramBuilder, + AddMultibodyPlantSceneGraph, + MeshcatVisualizer, + StartMeshcat, + RigidTransform, + Role, + RollPitchYaw, + RotationMatrix, + Solve, + InverseKinematics, + MeshcatVisualizerParams, + 
MinimumDistanceLowerBoundConstraint, + DoDifferentialInverseKinematics, + DifferentialInverseKinematicsStatus, + DifferentialInverseKinematicsParameters, + DepthImageToPointCloud, +) +from manipulation.scenarios import AddMultibodyTriad +from manipulation.meshcat_utils import ( # TODO(russt): switch to pydrake version + _MeshcatPoseSliders, +) +from manipulation.scenarios import AddIiwa, AddShape, AddWsg + +logger = setup_logger("visualization_script") + + +def create_point_cloud(color_img, depth_img, intrinsics): + """Create Open3D point cloud from RGB and depth images.""" + fx, fy, cx, cy = intrinsics + height, width = depth_img.shape + + o3d_intrinsics = o3d.camera.PinholeCameraIntrinsic(width, height, fx, fy, cx, cy) + color_o3d = o3d.geometry.Image(color_img) + depth_o3d = o3d.geometry.Image((depth_img * 1000).astype(np.uint16)) + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1000.0, convert_rgb_to_intensity=False + ) + + return o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, o3d_intrinsics) + + +def deserialize_point_cloud(data): + """Reconstruct Open3D PointCloud from serialized data.""" + if data is None: + return None + + pcd = o3d.geometry.PointCloud() + if "points" in data and data["points"]: + pcd.points = o3d.utility.Vector3dVector(np.array(data["points"])) + if "colors" in data and data["colors"]: + pcd.colors = o3d.utility.Vector3dVector(np.array(data["colors"])) + return pcd + + +def deserialize_voxel_grid(data): + """Reconstruct Open3D VoxelGrid from serialized data.""" + if data is None: + return None + + # Create a point cloud to convert to voxel grid + pcd = o3d.geometry.PointCloud() + voxel_size = data["voxel_size"] + origin = np.array(data["origin"]) + + # Create points from voxel indices + points = [] + colors = [] + for voxel in data["voxels"]: + # Each voxel is (i, j, k, r, g, b) + i, j, k, r, g, b = voxel + # Convert voxel grid index to 3D point + point = origin + np.array([i, j, k]) 
* voxel_size + points.append(point) + colors.append([r, g, b]) + + if points: + pcd.points = o3d.utility.Vector3dVector(np.array(points)) + pcd.colors = o3d.utility.Vector3dVector(np.array(colors)) + + # Convert to voxel grid + voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size) + return voxel_grid + + +def visualize_results(pickle_path="manipulation_results.pkl"): + """Load pickled results and visualize them.""" + print(f"Loading results from {pickle_path}...") + try: + with open(pickle_path, "rb") as f: + data = pickle.load(f) + + results = data["results"] + color_img = data["color_img"] + depth_img = data["depth_img"] + intrinsics = data["intrinsics"] + + print(f"Loaded results with keys: {list(results.keys())}") + + except FileNotFoundError: + print(f"Error: Pickle file {pickle_path} not found.") + print("Make sure to run test_manipulation_pipeline_single_frame_lcm.py first.") + return + except Exception as e: + print(f"Error loading pickle file: {e}") + return + + # Determine number of subplots based on what results we have + num_plots = 0 + plot_configs = [] + + if "detection_viz" in results and results["detection_viz"] is not None: + plot_configs.append(("detection_viz", "Object Detection")) + num_plots += 1 + + if "segmentation_viz" in results and results["segmentation_viz"] is not None: + plot_configs.append(("segmentation_viz", "Semantic Segmentation")) + num_plots += 1 + + if "pointcloud_viz" in results and results["pointcloud_viz"] is not None: + plot_configs.append(("pointcloud_viz", "All Objects Point Cloud")) + num_plots += 1 + + if "detected_pointcloud_viz" in results and results["detected_pointcloud_viz"] is not None: + plot_configs.append(("detected_pointcloud_viz", "Detection Objects Point Cloud")) + num_plots += 1 + + if "misc_pointcloud_viz" in results and results["misc_pointcloud_viz"] is not None: + plot_configs.append(("misc_pointcloud_viz", "Misc/Background Points")) + num_plots += 1 + + if "grasp_overlay" in 
results and results["grasp_overlay"] is not None: + plot_configs.append(("grasp_overlay", "Grasp Overlay")) + num_plots += 1 + + if num_plots == 0: + print("No visualization results to display") + return + + # Create subplot layout + if num_plots <= 3: + fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 5)) + else: + rows = 2 + cols = (num_plots + 1) // 2 + fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 5 * rows)) + + # Ensure axes is always a list for consistent indexing + if num_plots == 1: + axes = [axes] + elif num_plots > 2: + axes = axes.flatten() + + # Plot each result + for i, (key, title) in enumerate(plot_configs): + axes[i].imshow(results[key]) + axes[i].set_title(title) + axes[i].axis("off") + + # Hide unused subplots if any + if num_plots > 3: + for i in range(num_plots, len(axes)): + axes[i].axis("off") + + plt.tight_layout() + + # Save and show the plot + output_path = "visualization_results.png" + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Results visualization saved to: {output_path}") + + # Show plot live as well + plt.show(block=True) + plt.close() + + # Deserialize and reconstruct 3D objects from the pickle file + print("\nReconstructing 3D visualization objects from serialized data...") + + # Reconstruct full point cloud if available + full_pcd = None + if "full_pointcloud" in results and results["full_pointcloud"] is not None: + full_pcd = deserialize_point_cloud(results["full_pointcloud"]) + print(f"Reconstructed full point cloud with {len(np.asarray(full_pcd.points))} points") + + # Visualize reconstructed full point cloud + try: + visualize_pcd( + full_pcd, + window_name="Reconstructed Full Scene Point Cloud", + point_size=2.0, + show_coordinate_frame=True, + ) + except (KeyboardInterrupt, EOFError): + print("\nSkipping full point cloud visualization") + except Exception as e: + print(f"Error in point cloud visualization: {e}") + else: + print("No full point cloud available for visualization") + + 
# Reconstruct misc clusters if available + if "misc_clusters" in results and results["misc_clusters"]: + misc_clusters = [deserialize_point_cloud(cluster) for cluster in results["misc_clusters"]] + cluster_count = len(misc_clusters) + total_misc_points = sum(len(np.asarray(cluster.points)) for cluster in misc_clusters) + print(f"Reconstructed {cluster_count} misc clusters with {total_misc_points} total points") + + # Visualize reconstructed misc clusters + try: + visualize_clustered_point_clouds( + misc_clusters, + window_name="Reconstructed Misc/Background Clusters (DBSCAN)", + point_size=3.0, + show_coordinate_frame=True, + ) + except (KeyboardInterrupt, EOFError): + print("\nSkipping misc clusters visualization") + except Exception as e: + print(f"Error in misc clusters visualization: {e}") + else: + print("No misc clusters available for visualization") + + # Reconstruct voxel grid if available + if "misc_voxel_grid" in results and results["misc_voxel_grid"] is not None: + misc_voxel_grid = deserialize_voxel_grid(results["misc_voxel_grid"]) + if misc_voxel_grid: + voxel_count = len(misc_voxel_grid.get_voxels()) + print(f"Reconstructed voxel grid with {voxel_count} voxels") + + # Visualize reconstructed voxel grid + try: + visualize_voxel_grid( + misc_voxel_grid, + window_name="Reconstructed Misc/Background Voxel Grid", + show_coordinate_frame=True, + ) + except (KeyboardInterrupt, EOFError): + print("\nSkipping voxel grid visualization") + except Exception as e: + print(f"Error in voxel grid visualization: {e}") + else: + print("Failed to reconstruct voxel grid") + else: + print("No voxel grid available for visualization") + + +class DrakeKinematicsEnv: + def __init__( + self, + urdf_path: str, + kinematic_chain_joints: List[str], + links_to_ignore: Optional[List[str]] = None, + ): + self._resources_to_cleanup = [] + + # Register cleanup at exit + atexit.register(self.cleanup_resources) + + # Initialize tf resources once and reuse them + self.buffer = 
tf_lcm_py.Buffer(30.0) + self._resources_to_cleanup.append(self.buffer) + with self.safe_lcm_instance() as lcm_instance: + self.tf_lcm_instance = lcm_instance + self._resources_to_cleanup.append(self.tf_lcm_instance) + # Create TransformListener with our LCM instance and buffer + self.listener = tf_lcm_py.TransformListener(self.tf_lcm_instance, self.buffer) + self._resources_to_cleanup.append(self.listener) + + # Check if URDF file exists + if not os.path.exists(urdf_path): + raise FileNotFoundError(f"URDF file not found: {urdf_path}") + + # Drake utils initialization + self.meshcat = StartMeshcat() + print(f"Meshcat started at: {self.meshcat.web_url()}") + + self.urdf_path = urdf_path + self.builder = DiagramBuilder() + + self.plant, self.scene_graph = AddMultibodyPlantSceneGraph(self.builder, time_step=0.01) + self.parser = Parser(self.plant) + + # Load the robot URDF + print(f"Loading URDF from: {self.urdf_path}") + self.model_instances = self.parser.AddModelsFromUrl(f"file://{self.urdf_path}") + self.kinematic_chain_joints = kinematic_chain_joints + self.model_instance = self.model_instances[0] if self.model_instances else None + + if not self.model_instances: + raise RuntimeError("Failed to load any model instances from URDF") + + print(f"Loaded {len(self.model_instances)} model instances") + + # Set up collision filtering + if links_to_ignore: + bodies = [] + for link_name in links_to_ignore: + try: + body = self.plant.GetBodyByName(link_name) + if body is not None: + bodies.extend(self.plant.GetBodiesWeldedTo(body)) + except RuntimeError: + print(f"Warning: Link '{link_name}' not found in URDF") + + if bodies: + arm_geoms = self.plant.CollectRegisteredGeometries(bodies) + decl = CollisionFilterDeclaration().ExcludeWithin(arm_geoms) + manager = self.scene_graph.collision_filter_manager() + manager.Apply(decl) + + # Load and process point cloud data + self._load_and_process_point_clouds() + + # Finalize the plant before adding visualizer + 
self.plant.Finalize() + + # Print some debug info about the plant + print(f"Plant has {self.plant.num_bodies()} bodies") + print(f"Plant has {self.plant.num_joints()} joints") + for i in range(self.plant.num_joints()): + joint = self.plant.get_joint(JointIndex(i)) + print(f" Joint {i}: {joint.name()} (type: {joint.type_name()})") + + # Add visualizer + self.visualizer = MeshcatVisualizer.AddToBuilder( + self.builder, self.scene_graph, self.meshcat, params=MeshcatVisualizerParams() + ) + + # Build the diagram + self.diagram = self.builder.Build() + self.diagram_context = self.diagram.CreateDefaultContext() + self.plant_context = self.plant.GetMyContextFromRoot(self.diagram_context) + + # Set up joint indices + self.joint_indices = [] + for joint_name in self.kinematic_chain_joints: + try: + joint = self.plant.GetJointByName(joint_name) + if joint.num_positions() > 0: + start_index = joint.position_start() + for i in range(joint.num_positions()): + self.joint_indices.append(start_index + i) + print( + f"Added joint '{joint_name}' at indices {start_index} to {start_index + joint.num_positions() - 1}" + ) + except RuntimeError: + print(f"Warning: Joint '{joint_name}' not found in URDF.") + + # Get important frames/bodies + try: + self.end_effector_link = self.plant.GetBodyByName("link6") + self.end_effector_frame = self.plant.GetFrameByName("link6") + print("Found end effector link6") + except RuntimeError: + print("Warning: link6 not found") + self.end_effector_link = None + self.end_effector_frame = None + + try: + self.camera_link = self.plant.GetBodyByName("camera_center_link") + print("Found camera_center_link") + except RuntimeError: + print("Warning: camera_center_link not found") + self.camera_link = None + + # Set robot to a reasonable initial configuration + self._set_initial_configuration() + + # Force initial visualization update + self._update_visualization() + + print("Drake environment initialization complete!") + print(f"Visit {self.meshcat.web_url()} 
to see the visualization") + + def _load_and_process_point_clouds(self): + """Load point cloud data from pickle file and add to scene""" + pickle_path = "manipulation_results.pkl" + try: + with open(pickle_path, "rb") as f: + data = pickle.load(f) + + results = data["results"] + print(f"Loaded results with keys: {list(results.keys())}") + + except FileNotFoundError: + print(f"Warning: Pickle file {pickle_path} not found.") + print("Skipping point cloud loading.") + return + except Exception as e: + print(f"Warning: Error loading pickle file: {e}") + return + + full_detected_pcd = o3d.geometry.PointCloud() + for obj in results["detected_objects"]: + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(obj["point_cloud_numpy"]) + full_detected_pcd += pcd + + self.process_and_add_object_class("all_objects", results) + self.process_and_add_object_class("misc_clusters", results) + misc_clusters = results["misc_clusters"] + print(type(misc_clusters[0]["points"])) + print(np.asarray(misc_clusters[0]["points"]).shape) + + def process_and_add_object_class(self, object_key: str, results: dict): + # Process detected objects + if object_key in results: + detected_objects = results[object_key] + if detected_objects: + print(f"Processing {len(detected_objects)} {object_key}") + all_decomposed_meshes = [] + + transform = self.get_transform("world", "camera_center_link") + for i in range(len(detected_objects)): + try: + if object_key == "misc_clusters": + points = np.asarray(detected_objects[i]["points"]) + elif "point_cloud_numpy" in detected_objects[i]: + points = detected_objects[i]["point_cloud_numpy"] + elif ( + "point_cloud" in detected_objects[i] + and detected_objects[i]["point_cloud"] + ): + # Handle serialized point cloud + points = np.array(detected_objects[i]["point_cloud"]["points"]) + else: + print(f"Warning: No point cloud data found for object {i}") + continue + + if len(points) < 10: # Need more points for mesh reconstruction + print( + 
f"Warning: Object {i} has too few points ({len(points)}) for mesh reconstruction" + ) + continue + + # Swap y-z axes since this is a common problem + points = np.column_stack((points[:, 0], points[:, 2], -points[:, 1])) + # Transform points to world frame + points = self.transform_point_cloud_with_open3d(points, transform) + + # Use fast DBSCAN clustering + convex hulls approach + clustered_hulls = self._create_clustered_convex_hulls(points, i) + all_decomposed_meshes.extend(clustered_hulls) + + print( + f"Created {len(clustered_hulls)} clustered convex hulls for object {i}" + ) + + except Exception as e: + print(f"Warning: Failed to process object {i}: {e}") + + if all_decomposed_meshes: + self.register_convex_hulls_as_collision(all_decomposed_meshes, object_key) + print(f"Registered {len(all_decomposed_meshes)} total clustered convex hulls") + else: + print("Warning: No valid clustered convex hulls created from detected objects") + else: + print("No detected objects found") + + def _create_clustered_convex_hulls( + self, points: np.ndarray, object_id: int + ) -> List[o3d.geometry.TriangleMesh]: + """ + Create convex hulls from DBSCAN clusters of point cloud data. + Fast approach: cluster points, then convex hull each cluster. 
+
+        Args:
+            points: Nx3 numpy array of 3D points
+            object_id: ID for debugging/logging
+
+        Returns:
+            List of Open3D triangle meshes (convex hulls of clusters)
+        """
+        try:
+            # Create Open3D point cloud
+            pcd = o3d.geometry.PointCloud()
+            pcd.points = o3d.utility.Vector3dVector(points)
+
+            # Quick outlier removal (optional, can skip for speed)
+            if len(points) > 50:  # Only for larger point clouds
+                pcd, _ = pcd.remove_statistical_outlier(nb_neighbors=10, std_ratio=2.0)
+                points = np.asarray(pcd.points)
+
+            if len(points) < 4:
+                print(f"Warning: Too few points after filtering for object {object_id}")
+                return []
+
+            # Try multiple DBSCAN parameter combinations to find clusters
+            clusters = []
+            labels = None
+
+            # Calculate some basic statistics for parameter estimation
+            if len(points) > 10:
+                # Compute nearest neighbor distances for better eps estimation
+                distances = pcd.compute_nearest_neighbor_distance()
+                avg_nn_distance = np.mean(distances)
+                std_nn_distance = np.std(distances)
+
+                print(
+                    f"Object {object_id}: {len(points)} points, avg_nn_dist={avg_nn_distance:.4f}"
+                )
+
+            for i in range(20):
+                try:
+                    eps = avg_nn_distance * (2.0 + (i * 0.1))
+                    min_samples = 20
+                    labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_samples))
+                    unique_labels = np.unique(labels)
+                    clusters = unique_labels[unique_labels >= 0]  # Remove noise label (-1)
+
+                    noise_points = np.sum(labels == -1)
+                    clustered_points = len(points) - noise_points
+
+                    print(
+                        f"  Try {i + 1}: eps={eps:.4f}, min_samples={min_samples} → {len(clusters)} clusters, {clustered_points}/{len(points)} points clustered"
+                    )
+
+                    # Accept if we found clusters and most points are clustered
+                    if (
+                        len(clusters) > 0 and clustered_points >= len(points) * 0.95
+                    ):  # At least 95% of points clustered
+                        print(f"  ✓ Accepted parameter set {i + 1}")
+                        break
+
+                except Exception as e:
+                    print(
+                        f"  Try {i + 1}: Failed with eps={eps:.4f}, min_samples={min_samples}: {e}"
+                    )
+                    continue
+
+            if len(clusters) == 0 or
labels is None: + print( + f"No clusters found for object {object_id} after all attempts, using entire point cloud" + ) + # Fallback: use entire point cloud as single convex hull + hull_mesh, _ = pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + return [hull_mesh] + + print( + f"Found {len(clusters)} clusters for object {object_id} (eps={eps:.3f}, min_samples={min_samples})" + ) + + # Create convex hull for each cluster + convex_hulls = [] + for cluster_id in clusters: + try: + # Get points for this cluster + cluster_mask = labels == cluster_id + cluster_points = points[cluster_mask] + + if len(cluster_points) < 4: + print( + f"Skipping cluster {cluster_id} with only {len(cluster_points)} points" + ) + continue + + # Create point cloud for this cluster + cluster_pcd = o3d.geometry.PointCloud() + cluster_pcd.points = o3d.utility.Vector3dVector(cluster_points) + + # Compute convex hull + hull_mesh, _ = cluster_pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + + # Validate hull + if ( + len(np.asarray(hull_mesh.vertices)) >= 4 + and len(np.asarray(hull_mesh.triangles)) >= 4 + ): + convex_hulls.append(hull_mesh) + print( + f" Cluster {cluster_id}: {len(cluster_points)} points → convex hull with {len(np.asarray(hull_mesh.vertices))} vertices" + ) + else: + print(f" Skipping degenerate hull for cluster {cluster_id}") + + except Exception as e: + print(f"Error processing cluster {cluster_id} for object {object_id}: {e}") + + if not convex_hulls: + print( + f"No valid convex hulls created for object {object_id}, using entire point cloud" + ) + # Fallback: use entire point cloud as single convex hull + hull_mesh, _ = pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + return [hull_mesh] + + return convex_hulls + + except Exception as e: + print(f"Error in DBSCAN clustering for object {object_id}: {e}") + # Final fallback: single convex hull + try: + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points) + 
hull_mesh, _ = pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + return [hull_mesh] + except: + return [] + + def _set_initial_configuration(self): + """Set the robot to a reasonable initial joint configuration""" + # Set all joints to zero initially + if self.joint_indices: + q = np.zeros(len(self.joint_indices)) + + # You can customize these values for a better initial pose + # For example, if you know good default joint angles: + if len(q) >= 6: # Assuming at least 6 DOF arm + q[1] = 0.0 # joint1 + q[2] = 0.0 # joint2 + q[3] = 0.0 # joint3 + q[4] = 0.0 # joint4 + q[5] = 0.0 # joint5 + q[6] = 0.0 # joint6 + + # Set the joint positions in the plant context + positions = self.plant.GetPositions(self.plant_context) + for i, joint_idx in enumerate(self.joint_indices): + if joint_idx < len(positions): + positions[joint_idx] = q[i] + + self.plant.SetPositions(self.plant_context, positions) + print(f"Set initial joint configuration: {q}") + else: + print("Warning: No joint indices found, using default configuration") + + def _update_visualization(self): + """Force update the visualization""" + try: + # Get the visualizer's context from the diagram context + visualizer_context = self.visualizer.GetMyContextFromRoot(self.diagram_context) + self.visualizer.ForcedPublish(visualizer_context) + print("Visualization updated successfully") + except Exception as e: + print(f"Error updating visualization: {e}") + + def set_joint_positions(self, joint_positions): + """Set specific joint positions and update visualization""" + if len(joint_positions) != len(self.joint_indices): + raise ValueError( + f"Expected {len(self.joint_indices)} joint positions, got {len(joint_positions)}" + ) + + positions = self.plant.GetPositions(self.plant_context) + for i, joint_idx in enumerate(self.joint_indices): + if joint_idx < len(positions): + positions[joint_idx] = joint_positions[i] + + self.plant.SetPositions(self.plant_context, positions) + self._update_visualization() + 
print(f"Updated joint positions: {joint_positions}") + + def register_convex_hulls_as_collision( + self, meshes: List[o3d.geometry.TriangleMesh], hull_type: str + ): + """Register convex hulls as collision and visual geometry""" + if not meshes: + print("No meshes to register") + return + + world = self.plant.world_body() + proximity = ProximityProperties() + + for i, mesh in enumerate(meshes): + try: + # Convert Open3D → numpy arrays → trimesh.Trimesh + vertices = np.asarray(mesh.vertices) + faces = np.asarray(mesh.triangles) + + if len(vertices) == 0 or len(faces) == 0: + print(f"Warning: Mesh {i} is empty, skipping") + continue + + tmesh = trimesh.Trimesh(vertices=vertices, faces=faces) + + # Export to OBJ in memory + tmesh_obj_blob = tmesh.export(file_type="obj") + mem_file = MemoryFile( + contents=tmesh_obj_blob, extension=".obj", filename_hint=f"convex_hull_{i}.obj" + ) + in_memory_mesh = InMemoryMesh() + in_memory_mesh.mesh_file = mem_file + drake_mesh = Mesh(in_memory_mesh, scale=1.0) + + pos = np.array([0.0, 0.0, 0.0]) + rpy = RollPitchYaw(0.0, 0.0, 0.0) + X_WG = DrakeRigidTransform(RotationMatrix(rpy), pos) + + # Register collision and visual geometry + self.plant.RegisterCollisionGeometry( + body=world, + X_BG=X_WG, + shape=drake_mesh, + name=f"convex_hull_collision_{i}_{hull_type}", + properties=proximity, + ) + self.plant.RegisterVisualGeometry( + body=world, + X_BG=X_WG, + shape=drake_mesh, + name=f"convex_hull_visual_{i}_{hull_type}", + diffuse_color=np.array([0.7, 0.5, 0.3, 0.8]), # Orange-ish color + ) + + print( + f"Registered convex hull {i} with {len(vertices)} vertices and {len(faces)} faces" + ) + + except Exception as e: + print(f"Warning: Failed to register mesh {i}: {e}") + + # Add a simple table for reference + try: + table_shape = Box(1.0, 1.0, 0.1) # Thinner table + table_pose = RigidTransform(p=[0.5, 0.0, -0.05]) # In front of robot + self.plant.RegisterCollisionGeometry( + world, table_pose, table_shape, "table_collision", proximity + 
) + self.plant.RegisterVisualGeometry( + world, table_pose, table_shape, "table_visual", [0.8, 0.6, 0.4, 1.0] + ) + print("Added reference table") + except Exception as e: + print(f"Warning: Failed to add table: {e}") + + def get_seeded_random_rgba(self, id: int): + np.random.seed(id) + return np.random.rand(4) + + @contextmanager + def safe_lcm_instance(self): + """Context manager for safely managing LCM instance lifecycle""" + lcm_instance = tf_lcm_py.LCM() + try: + yield lcm_instance + finally: + pass + + def cleanup_resources(self): + """Clean up resources before exiting""" + # Only clean up once when exiting + print("Cleaning up resources...") + # Force cleanup of resources in reverse order (last created first) + for resource in reversed(self._resources_to_cleanup): + try: + # For objects like TransformListener that might have a close or shutdown method + if hasattr(resource, "close"): + resource.close() + elif hasattr(resource, "shutdown"): + resource.shutdown() + + # Explicitly delete the resource + del resource + except Exception as e: + print(f"Error during cleanup: {e}") + + # Clear the resources list + self._resources_to_cleanup = [] + + def get_transform(self, target_frame, source_frame): + print("Getting transform from", source_frame, "to", target_frame) + attempts = 0 + max_attempts = 20 # Reduced from 120 to avoid long blocking + + while attempts < max_attempts: + try: + # Process LCM messages with error handling + if not self.tf_lcm_instance.handle_timeout(100): # 100ms timeout + # If handle_timeout returns false, we might need to re-check if LCM is still good + if not self.tf_lcm_instance.good(): + print("WARNING: LCM instance is no longer in a good state") + + # Get the most recent timestamp from the buffer instead of using current time + try: + timestamp = self.buffer.get_most_recent_timestamp() + if attempts % 10 == 0: + print(f"Using timestamp from buffer: {timestamp}") + except Exception as e: + # Fall back to current time if 
get_most_recent_timestamp fails + timestamp = datetime.now() + if not hasattr(timestamp, "timestamp"): + timestamp.timestamp = ( + lambda: time.mktime(timestamp.timetuple()) + timestamp.microsecond / 1e6 + ) + if attempts % 10 == 0: + print(f"Falling back to current time: {timestamp}") + + # Check if we can find the transform + if self.buffer.can_transform(target_frame, source_frame, timestamp): + # print(f"Found transform between '{target_frame}' and '{source_frame}'!") + + # Look up the transform with the timestamp from the buffer + transform = self.buffer.lookup_transform( + target_frame, + source_frame, + timestamp, + timeout=10.0, + time_tolerance=0.1, + lcm_module=lcm_msgs, + ) + + return transform + + # Increment counter and report status every 10 attempts + attempts += 1 + if attempts % 10 == 0: + print(f"Still waiting... (attempt {attempts}/{max_attempts})") + frames = self.buffer.get_all_frame_names() + if frames: + print(f"Frames received so far ({len(frames)} total):") + for frame in sorted(frames): + print(f" {frame}") + else: + print("No frames received yet") + + # Brief pause + time.sleep(0.5) + + except Exception as e: + print(f"Error during transform lookup: {e}") + attempts += 1 + time.sleep(1) # Longer pause after an error + + print(f"\nERROR: No transform found after {max_attempts} attempts") + return None + + def transform_point_cloud_with_open3d(self, points_np: np.ndarray, transform) -> np.ndarray: + """ + Transforms a point cloud using Open3D given a transform. + + Args: + points_np (np.ndarray): Nx3 array of 3D points. + transform: Transform from tf_lcm_py. + + Returns: + np.ndarray: Nx3 array of transformed 3D points. 
+ """ + if points_np.shape[1] != 3: + print("Input point cloud must have shape Nx3.") + return points_np + + # Convert transform to 4x4 numpy matrix + tf_matrix = np.eye(4) + + # Extract rotation quaternion components + qw = transform.transform.rotation.w + qx = transform.transform.rotation.x + qy = transform.transform.rotation.y + qz = transform.transform.rotation.z + + # Convert quaternion to rotation matrix + # Formula from: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation#Quaternion-derived_rotation_matrix + tf_matrix[0, 0] = 1 - 2 * qy * qy - 2 * qz * qz + tf_matrix[0, 1] = 2 * qx * qy - 2 * qz * qw + tf_matrix[0, 2] = 2 * qx * qz + 2 * qy * qw + + tf_matrix[1, 0] = 2 * qx * qy + 2 * qz * qw + tf_matrix[1, 1] = 1 - 2 * qx * qx - 2 * qz * qz + tf_matrix[1, 2] = 2 * qy * qz - 2 * qx * qw + + tf_matrix[2, 0] = 2 * qx * qz - 2 * qy * qw + tf_matrix[2, 1] = 2 * qy * qz + 2 * qx * qw + tf_matrix[2, 2] = 1 - 2 * qx * qx - 2 * qy * qy + + # Set translation + tf_matrix[0, 3] = transform.transform.translation.x + tf_matrix[1, 3] = transform.transform.translation.y + tf_matrix[2, 3] = transform.transform.translation.z + + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points_np) + + # Apply transformation + pcd.transform(tf_matrix) + + # Return as NumPy array + return np.asarray(pcd.points) + + +# Updated main function +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Visualize manipulation results") + parser.add_argument("--visualize-only", action="store_true", help="Only visualize results") + args = parser.parse_args() + + if args.visualize_only: + visualize_results() + exit(0) + + try: + # Then set up Drake environment + kinematic_chain_joints = [ + "pillar_platform_joint", + "joint1", + "joint2", + "joint3", + "joint4", + "joint5", + "joint6", + ] + + links_to_ignore = [ + "devkit_base_link", + "pillar_platform", + "piper_angled_mount", + 
"pan_tilt_base", + "pan_tilt_head", + "pan_tilt_pan", + "base_link", + "link1", + "link2", + "link3", + "link4", + "link5", + "link6", + ] + + urdf_path = "./assets/devkit_base_descr.urdf" + urdf_path = os.path.abspath(urdf_path) + + print(f"Attempting to load URDF from: {urdf_path}") + + env = DrakeKinematicsEnv(urdf_path, kinematic_chain_joints, links_to_ignore) + env.set_joint_positions([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + transform = env.get_transform("world", "camera_center_link") + print( + transform.transform.translation.x, + transform.transform.translation.y, + transform.transform.translation.z, + ) + print( + transform.transform.rotation.w, + transform.transform.rotation.x, + transform.transform.rotation.y, + transform.transform.rotation.z, + ) + + # Keep the visualization alive + print("\nVisualization is running. Press Ctrl+C to exit.") + while True: + time.sleep(1) + + except KeyboardInterrupt: + print("\nExiting...") + except Exception as e: + print(f"Error: {e}") + import traceback + + traceback.print_exc() diff --git a/tests/zed_neural_depth_demo.py b/tests/zed_neural_depth_demo.py new file mode 100755 index 0000000000..5edce9633f --- /dev/null +++ b/tests/zed_neural_depth_demo.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +ZED Camera Neural Depth Demo - OpenCV Live Visualization with Data Saving + +This script demonstrates live visualization of ZED camera RGB and depth data using OpenCV. +Press SPACE to save RGB and depth images to rgbd_data2 folder. +Press ESC or 'q' to quit. +""" + +import os +import sys +import time +import argparse +import logging +from pathlib import Path +import numpy as np +import cv2 +import yaml +from datetime import datetime +import open3d as o3d + +# Add the project root to Python path +sys.path.append(str(Path(__file__).parent.parent)) + +try: + import pyzed.sl as sl +except ImportError: + print("ERROR: ZED SDK not found. Please install the ZED SDK and pyzed Python package.") + print("Download from: https://www.stereolabs.com/developers/release/") + sys.exit(1) + +from dimos.hardware.zed_camera import ZEDCamera +from dimos.perception.pointcloud.utils import visualize_pcd, visualize_clustered_point_clouds + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +class ZEDLiveVisualizer: + """Live OpenCV visualization for ZED camera data with saving functionality.""" + + def __init__(self, camera, max_depth=10.0, output_dir="assets/rgbd_data2"): + self.camera = camera + self.max_depth = max_depth + self.output_dir = Path(output_dir) + self.save_counter = 0 + + # Store captured pointclouds for later visualization + self.captured_pointclouds = [] + + # Display settings for 480p + self.display_width = 640 + self.display_height = 480 + + # Create output directory structure + self.setup_output_directory() + + # Get camera info for saving + self.camera_info = camera.get_camera_info() + + # Save camera info files once + self.save_camera_info() + + # OpenCV window name (single window) + self.window_name = "ZED Camera - RGB + Depth" + + # Create window + cv2.namedWindow(self.window_name, cv2.WINDOW_AUTOSIZE) + + def 
setup_output_directory(self): + """Create the output directory structure.""" + self.output_dir.mkdir(exist_ok=True) + (self.output_dir / "color").mkdir(exist_ok=True) + (self.output_dir / "depth").mkdir(exist_ok=True) + (self.output_dir / "pointclouds").mkdir(exist_ok=True) + logger.info(f"Created output directory: {self.output_dir}") + + def save_camera_info(self): + """Save camera info YAML files with ZED camera parameters.""" + # Get current timestamp + now = datetime.now() + timestamp_sec = int(now.timestamp()) + timestamp_nanosec = int((now.timestamp() % 1) * 1e9) + + # Get camera resolution + resolution = self.camera_info.get("resolution", {}) + width = int(resolution.get("width", 1280)) + height = int(resolution.get("height", 720)) + + # Extract left camera parameters (for RGB) from already available camera_info + left_cam = self.camera_info.get("left_cam", {}) + # Convert numpy values to Python floats + fx = float(left_cam.get("fx", 749.341552734375)) + fy = float(left_cam.get("fy", 748.5587768554688)) + cx = float(left_cam.get("cx", 639.4312744140625)) + cy = float(left_cam.get("cy", 357.2478942871094)) + + # Build distortion coefficients from ZED format + # ZED provides k1, k2, p1, p2, k3 - convert to rational_polynomial format + k1 = float(left_cam.get("k1", 0.0)) + k2 = float(left_cam.get("k2", 0.0)) + p1 = float(left_cam.get("p1", 0.0)) + p2 = float(left_cam.get("p2", 0.0)) + k3 = float(left_cam.get("k3", 0.0)) + distortion = [k1, k2, p1, p2, k3, 0.0, 0.0, 0.0] + + # Create camera info structure with plain Python types + camera_info = { + "D": distortion, + "K": [fx, 0.0, cx, 0.0, fy, cy, 0.0, 0.0, 1.0], + "P": [fx, 0.0, cx, 0.0, 0.0, fy, cy, 0.0, 0.0, 0.0, 1.0, 0.0], + "R": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0], + "binning_x": 0, + "binning_y": 0, + "distortion_model": "rational_polynomial", + "header": { + "frame_id": "camera_color_optical_frame", + "stamp": {"nanosec": timestamp_nanosec, "sec": timestamp_sec}, + }, + "height": height, + 
"roi": {"do_rectify": False, "height": 0, "width": 0, "x_offset": 0, "y_offset": 0}, + "width": width, + } + + # Save color camera info + color_info_path = self.output_dir / "color_camera_info.yaml" + with open(color_info_path, "w") as f: + yaml.dump(camera_info, f, default_flow_style=False) + + # Save depth camera info (same as color for ZED) + depth_info_path = self.output_dir / "depth_camera_info.yaml" + with open(depth_info_path, "w") as f: + yaml.dump(camera_info, f, default_flow_style=False) + + logger.info(f"Saved camera info files to {self.output_dir}") + + def normalize_depth_for_display(self, depth_map): + """Normalize depth map for OpenCV visualization.""" + # Handle invalid values + valid_mask = (depth_map > 0) & np.isfinite(depth_map) + + if not np.any(valid_mask): + return np.zeros_like(depth_map, dtype=np.uint8) + + # Normalize to 0-255 for display + depth_norm = np.zeros_like(depth_map, dtype=np.float32) + depth_clipped = np.clip(depth_map[valid_mask], 0, self.max_depth) + depth_norm[valid_mask] = depth_clipped / self.max_depth + + # Convert to 8-bit and apply colormap + depth_8bit = (depth_norm * 255).astype(np.uint8) + depth_colored = cv2.applyColorMap(depth_8bit, cv2.COLORMAP_JET) + + return depth_colored + + def save_frame(self, rgb_img, depth_map): + """Save RGB, depth images, and pointcloud with proper naming convention.""" + # Generate filename with 5-digit zero-padding + filename = f"{self.save_counter:05d}.png" + pcd_filename = f"{self.save_counter:05d}.ply" + + # Save RGB image + rgb_path = self.output_dir / "color" / filename + cv2.imwrite(str(rgb_path), rgb_img) + + # Save depth image (convert to 16-bit for proper depth storage) + depth_path = self.output_dir / "depth" / filename + # Convert meters to millimeters and save as 16-bit + depth_mm = (depth_map * 1000).astype(np.uint16) + cv2.imwrite(str(depth_path), depth_mm) + + # Capture and save pointcloud + pcd = self.camera.capture_pointcloud() + if pcd is not None and 
len(np.asarray(pcd.points)) > 0: + pcd_path = self.output_dir / "pointclouds" / pcd_filename + o3d.io.write_point_cloud(str(pcd_path), pcd) + + # Store pointcloud for later visualization + self.captured_pointclouds.append(pcd) + + logger.info( + f"Saved frame {self.save_counter}: {rgb_path}, {depth_path}, and {pcd_path}" + ) + else: + logger.warning(f"Failed to capture pointcloud for frame {self.save_counter}") + logger.info(f"Saved frame {self.save_counter}: {rgb_path} and {depth_path}") + + self.save_counter += 1 + + def visualize_captured_pointclouds(self): + """Visualize all captured pointclouds using Open3D, one by one.""" + if not self.captured_pointclouds: + logger.info("No pointclouds captured to visualize") + return + + logger.info( + f"Visualizing {len(self.captured_pointclouds)} captured pointclouds one by one..." + ) + logger.info("Close each pointcloud window to proceed to the next one") + + for i, pcd in enumerate(self.captured_pointclouds): + if len(np.asarray(pcd.points)) > 0: + logger.info(f"Displaying pointcloud {i + 1}/{len(self.captured_pointclouds)}") + visualize_pcd(pcd, window_name=f"ZED Pointcloud {i + 1:05d}", point_size=2.0) + else: + logger.warning(f"Pointcloud {i + 1} is empty, skipping...") + + logger.info("Finished displaying all pointclouds") + + def update_display(self): + """Update the live display with new frames.""" + # Capture frame + left_img, right_img, depth_map = self.camera.capture_frame() + + if left_img is None or depth_map is None: + return False, None, None + + # Resize RGB to 480p + rgb_resized = cv2.resize(left_img, (self.display_width, self.display_height)) + + # Create depth visualization + depth_colored = self.normalize_depth_for_display(depth_map) + + # Resize depth to 480p + depth_resized = cv2.resize(depth_colored, (self.display_width, self.display_height)) + + # Add text overlays + text_color = (255, 255, 255) + font = cv2.FONT_HERSHEY_SIMPLEX + font_scale = 0.6 + thickness = 2 + + # Add title and instructions 
to RGB
+        cv2.putText(
+            rgb_resized, "RGB Camera Feed", (10, 25), font, font_scale, text_color, thickness
+        )
+        cv2.putText(
+            rgb_resized,
+            "SPACE: Save | ESC/Q: Quit",
+            (10, 50),
+            font,
+            font_scale - 0.1,
+            text_color,
+            thickness,
+        )
+
+        # Add title and stats to depth
+        cv2.putText(
+            depth_resized,
+            f"Depth Map (0-{self.max_depth}m)",
+            (10, 25),
+            font,
+            font_scale,
+            text_color,
+            thickness,
+        )
+        cv2.putText(
+            depth_resized,
+            f"Saved: {self.save_counter} frames",
+            (10, 50),
+            font,
+            font_scale - 0.1,
+            text_color,
+            thickness,
+        )
+
+        # Stack images horizontally
+        combined_display = np.hstack((rgb_resized, depth_resized))
+
+        # Display combined image
+        cv2.imshow(self.window_name, combined_display)
+
+        return True, left_img, depth_map
+
+    def handle_key_events(self, rgb_img, depth_map):
+        """Handle keyboard input."""
+        key = cv2.waitKey(1) & 0xFF
+
+        if key == ord(" "):  # Space key - save frame
+            if rgb_img is not None and depth_map is not None:
+                self.save_frame(rgb_img, depth_map)
+            return "save"
+        elif key == 27 or key == ord("q"):  # ESC or 'q' - quit
+            return "quit"
+
+        return "continue"
+
+    def cleanup(self):
+        """Clean up OpenCV windows."""
+        cv2.destroyAllWindows()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="ZED Camera Neural Depth Demo - OpenCV with Data Saving"
+    )
+    parser.add_argument("--camera-id", type=int, default=0, help="ZED camera ID (default: 0)")
+    parser.add_argument(
+        "--resolution",
+        type=str,
+        default="HD1080",
+        choices=["HD2K", "HD1080", "HD720", "VGA"],
+        help="Camera resolution (default: HD1080)",
+    )
+    parser.add_argument(
+        "--max-depth",
+        type=float,
+        default=10.0,
+        help="Maximum depth for visualization in meters (default: 10.0)",
+    )
+    parser.add_argument(
+        "--camera-fps", type=int, default=15, help="Camera capture FPS (default: 15)"
+    )
+    parser.add_argument(
+        "--depth-mode",
+        type=str,
+        default="NEURAL",
+        choices=["NEURAL", "NEURAL_PLUS"],
+        help="Depth mode (NEURAL=faster, 
NEURAL_PLUS=more accurate)", + ) + parser.add_argument( + "--output-dir", + type=str, + default="assets/rgbd_data2", + help="Output directory for saved data (default: rgbd_data2)", + ) + + args = parser.parse_args() + + # Map resolution string to ZED enum + resolution_map = { + "HD2K": sl.RESOLUTION.HD2K, + "HD1080": sl.RESOLUTION.HD1080, + "HD720": sl.RESOLUTION.HD720, + "VGA": sl.RESOLUTION.VGA, + } + + depth_mode_map = {"NEURAL": sl.DEPTH_MODE.NEURAL, "NEURAL_PLUS": sl.DEPTH_MODE.NEURAL_PLUS} + + try: + # Initialize ZED camera with neural depth + logger.info( + f"Initializing ZED camera with {args.depth_mode} depth processing at {args.camera_fps} FPS..." + ) + camera = ZEDCamera( + camera_id=args.camera_id, + resolution=resolution_map[args.resolution], + depth_mode=depth_mode_map[args.depth_mode], + fps=args.camera_fps, + ) + + # Open camera + with camera: + # Get camera information + info = camera.get_camera_info() + logger.info(f"Camera Model: {info.get('model', 'Unknown')}") + logger.info(f"Serial Number: {info.get('serial_number', 'Unknown')}") + logger.info(f"Firmware: {info.get('firmware', 'Unknown')}") + logger.info(f"Resolution: {info.get('resolution', {})}") + logger.info(f"Baseline: {info.get('baseline', 0):.3f}m") + + # Initialize visualizer + visualizer = ZEDLiveVisualizer( + camera, max_depth=args.max_depth, output_dir=args.output_dir + ) + + logger.info("Starting live visualization...") + logger.info("Controls:") + logger.info(" SPACE - Save current RGB and depth frame") + logger.info(" ESC/Q - Quit") + + frame_count = 0 + start_time = time.time() + + try: + while True: + loop_start = time.time() + + # Update display + success, rgb_img, depth_map = visualizer.update_display() + + if success: + frame_count += 1 + + # Handle keyboard events + action = visualizer.handle_key_events(rgb_img, depth_map) + + if action == "quit": + break + elif action == "save": + # Frame was saved, no additional action needed + pass + + # Print performance stats every 60 
frames + if frame_count % 60 == 0: + elapsed = time.time() - start_time + fps = frame_count / elapsed + logger.info( + f"Frame {frame_count} | FPS: {fps:.1f} | Saved: {visualizer.save_counter}" + ) + + # Small delay to prevent CPU overload + elapsed = time.time() - loop_start + min_frame_time = 1.0 / 60.0 # Cap at 60 FPS + if elapsed < min_frame_time: + time.sleep(min_frame_time - elapsed) + + except KeyboardInterrupt: + logger.info("Stopped by user") + + # Final stats + total_time = time.time() - start_time + if total_time > 0: + avg_fps = frame_count / total_time + logger.info( + f"Final stats: {frame_count} frames in {total_time:.1f}s (avg {avg_fps:.1f} FPS)" + ) + logger.info(f"Total saved frames: {visualizer.save_counter}") + + # Visualize captured pointclouds + visualizer.visualize_captured_pointclouds() + + except Exception as e: + logger.error(f"Error during execution: {e}") + raise + finally: + if "visualizer" in locals(): + visualizer.cleanup() + logger.info("Demo completed") + + +if __name__ == "__main__": + main() From 6c2ec82fb508d1d1093430bf885fdffb035dee2f Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Wed, 16 Jul 2025 12:46:41 -0700 Subject: [PATCH 2/7] added pytest to pointcloud filtering --- .../pointcloud/pointcloud_filtering.py | 314 ------------------ .../pointcloud/test_pointcloud_filtering.py | 259 +++++++++++++++ ...test_manipulation_pipeline_single_frame.py | 9 +- ..._manipulation_pipeline_single_frame_lcm.py | 6 +- 4 files changed, 263 insertions(+), 325 deletions(-) create mode 100644 dimos/perception/pointcloud/test_pointcloud_filtering.py diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py index 64fd8953a6..ea50cd03d2 100644 --- a/dimos/perception/pointcloud/pointcloud_filtering.py +++ b/dimos/perception/pointcloud/pointcloud_filtering.py @@ -358,317 +358,3 @@ def cleanup(self): """Clean up resources.""" if torch.cuda.is_available(): torch.cuda.empty_cache() - - -def 
create_test_pipeline(data_dir: str) -> tuple: - """ - Create a test pipeline with default settings. - - Args: - data_dir: Directory containing camera info files - - Returns: - Tuple of (filter_pipeline, color_info_path, depth_info_path) - """ - color_info_path = os.path.join(data_dir, "color_camera_info.yaml") - depth_info_path = os.path.join(data_dir, "depth_camera_info.yaml") - - # Default pipeline with subsampling disabled by default - filter_pipeline = PointcloudFiltering( - color_intrinsics=color_info_path, - depth_intrinsics=depth_info_path, - ) - - return filter_pipeline, color_info_path, depth_info_path - - -def load_test_images(data_dir: str) -> tuple: - """ - Load the first available test images from data directory. - - Args: - data_dir: Directory containing color and depth subdirectories - - Returns: - Tuple of (color_img, depth_img) or raises FileNotFoundError - """ - - def find_first_image(directory): - """Find the first image file in the given directory.""" - if not os.path.exists(directory): - return None - - image_extensions = [".jpg", ".jpeg", ".png", ".bmp"] - for filename in sorted(os.listdir(directory)): - if any(filename.lower().endswith(ext) for ext in image_extensions): - return os.path.join(directory, filename) - return None - - color_dir = os.path.join(data_dir, "color") - depth_dir = os.path.join(data_dir, "depth") - - color_img_path = find_first_image(color_dir) - depth_img_path = find_first_image(depth_dir) - - if not color_img_path or not depth_img_path: - raise FileNotFoundError(f"Could not find color or depth images in {data_dir}") - - # Load color image - color_img = cv2.imread(color_img_path) - if color_img is None: - raise FileNotFoundError(f"Could not load color image from {color_img_path}") - color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) - - # Load depth image - depth_img = cv2.imread(depth_img_path, cv2.IMREAD_UNCHANGED) - if depth_img is None: - raise FileNotFoundError(f"Could not load depth image from 
{depth_img_path}") - - # Convert depth to meters if needed - if depth_img.dtype == np.uint16: - depth_img = depth_img.astype(np.float32) / 1000.0 - - return color_img, depth_img - - -def run_segmentation(color_img: np.ndarray, device: str = "auto") -> List[ObjectData]: - """ - Run segmentation on color image and return ObjectData objects. - - Args: - color_img: RGB color image - device: Device to use ('auto', 'cuda', or 'cpu') - - Returns: - List of ObjectData objects with segmentation results - """ - if device == "auto": - device = "cuda" if torch.cuda.is_available() else "cpu" - - # Import here to avoid circular imports - from dimos.perception.segmentation import Sam2DSegmenter - - segmenter = Sam2DSegmenter( - model_path="FastSAM-s.pt", device=device, use_tracker=False, use_analyzer=False - ) - - try: - masks, bboxes, target_ids, probs, names = segmenter.process_image(np.array(color_img)) - - # Create ObjectData objects - objects = [] - for i in range(len(bboxes)): - obj_data: ObjectData = { - "object_id": target_ids[i] if i < len(target_ids) else i, - "bbox": bboxes[i], - "depth": -1.0, # Will be populated by pointcloud filtering - "confidence": probs[i] if i < len(probs) else 1.0, - "class_id": i, - "label": names[i] if i < len(names) else "", - "segmentation_mask": masks[i].cpu().numpy() - if hasattr(masks[i], "cpu") - else masks[i], - "position": Vector(0, 0, 0), # Will be populated by pointcloud filtering - "rotation": Vector(0, 0, 0), # Will be populated by pointcloud filtering - "size": { - "width": 0.0, - "height": 0.0, - "depth": 0.0, - }, # Will be populated by pointcloud filtering - } - objects.append(obj_data) - - return objects - - finally: - segmenter.cleanup() - - -def visualize_results(objects: List[ObjectData]): - """ - Visualize point cloud filtering results with 3D bounding boxes. 
- - Args: - objects: List of ObjectData with point clouds - """ - all_pcds = [] - - for obj in objects: - if "point_cloud" in obj and obj["point_cloud"] is not None: - pcd = obj["point_cloud"] - all_pcds.append(pcd) - - # Draw 3D bounding box if position, rotation, and size are available - if ( - "position" in obj - and "rotation" in obj - and "size" in obj - and obj["position"] is not None - and obj["rotation"] is not None - and obj["size"] is not None - ): - try: - position = obj["position"] - rotation = obj["rotation"] - size = obj["size"] - - # Convert position to numpy array - if hasattr(position, "x"): # Vector object - center = np.array([position.x, position.y, position.z]) - else: # Dictionary - center = np.array([position["x"], position["y"], position["z"]]) - - # Convert rotation (euler angles) to rotation matrix - if hasattr(rotation, "x"): # Vector object (roll, pitch, yaw) - roll, pitch, yaw = rotation.x, rotation.y, rotation.z - else: # Dictionary - roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"] - - # Create rotation matrix from euler angles (ZYX order) - # Roll (X), Pitch (Y), Yaw (Z) - cos_r, sin_r = np.cos(roll), np.sin(roll) - cos_p, sin_p = np.cos(pitch), np.sin(pitch) - cos_y, sin_y = np.cos(yaw), np.sin(yaw) - - # Rotation matrix for ZYX euler angles - R = np.array( - [ - [ - cos_y * cos_p, - cos_y * sin_p * sin_r - sin_y * cos_r, - cos_y * sin_p * cos_r + sin_y * sin_r, - ], - [ - sin_y * cos_p, - sin_y * sin_p * sin_r + cos_y * cos_r, - sin_y * sin_p * cos_r - cos_y * sin_r, - ], - [-sin_p, cos_p * sin_r, cos_p * cos_r], - ] - ) - - # Get dimensions - width = size.get("width", 0.1) - height = size.get("height", 0.1) - depth = size.get("depth", 0.1) - extent = np.array([width, height, depth]) - - # Create oriented bounding box - obb = o3d.geometry.OrientedBoundingBox(center=center, R=R, extent=extent) - obb.color = [1, 0, 0] # Red bounding boxes - all_pcds.append(obb) - - except Exception as e: - print( - f"Warning: 
Failed to create bounding box for object {obj.get('object_id', 'unknown')}: {e}" - ) - - # Add coordinate frame - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - all_pcds.append(coordinate_frame) - - # Visualize - if all_pcds: - o3d.visualization.draw_geometries( - all_pcds, - window_name="Filtered Point Clouds with 3D Bounding Boxes", - width=1280, - height=720, - ) - - -def main(): - """Main function to demonstrate the PointcloudFiltering pipeline.""" - parser = argparse.ArgumentParser(description="Point cloud filtering pipeline demonstration") - parser.add_argument( - "--save-pickle", - type=str, - help="Save generated ObjectData to pickle file (provide filename)", - ) - parser.add_argument( - "--data-dir", type=str, help="Directory containing RGBD data (default: auto-detect)" - ) - args = parser.parse_args() - - try: - # Setup paths - if args.data_dir: - data_dir = args.data_dir - else: - script_dir = os.path.dirname(os.path.abspath(__file__)) - dimos_dir = os.path.abspath(os.path.join(script_dir, "../../../")) - data_dir = os.path.join(dimos_dir, "assets/rgbd_data") - - # Load test data - print("Loading test images...") - color_img, depth_img = load_test_images(data_dir) - print(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}") - - # Run segmentation - print("Running segmentation...") - objects = run_segmentation(color_img) - print(f"Found {len(objects)} objects") - - # Create filtering pipeline - print("Creating filtering pipeline...") - filter_pipeline, _, _ = create_test_pipeline(data_dir) - - # Process images - print("Processing point clouds...") - updated_objects = filter_pipeline.process_images(color_img, depth_img, objects) - - # Print results - print(f"Processing complete:") - print(f" Objects processed: {len(updated_objects)}/{len(objects)}") - - # Print per-object stats - for i, obj in enumerate(updated_objects): - if "point_cloud" in obj and obj["point_cloud"] is not None: - num_points = 
len(np.asarray(obj["point_cloud"].points)) - position = obj.get("position", Vector(0, 0, 0)) - size = obj.get("size", {}) - print(f" Object {i + 1} (ID: {obj['object_id']}): {num_points} points") - print(f" Position: ({position.x:.2f}, {position.y:.2f}, {position.z:.2f})") - print( - f" Size: {size.get('width', 0):.3f} x {size.get('height', 0):.3f} x {size.get('depth', 0):.3f}" - ) - - # Save to pickle file if requested - if args.save_pickle: - pickle_path = args.save_pickle - if not pickle_path.endswith(".pkl"): - pickle_path += ".pkl" - - print(f"Saving ObjectData to {pickle_path}...") - - # Create serializable objects (exclude Open3D point clouds) - serializable_objects = [] - for obj in updated_objects: - obj_copy = obj.copy() - # Remove the Open3D point cloud object (can't be pickled) - if "point_cloud" in obj_copy: - del obj_copy["point_cloud"] - serializable_objects.append(obj_copy) - - with open(pickle_path, "wb") as f: - pickle.dump(serializable_objects, f) - - print(f"Successfully saved {len(serializable_objects)} objects to {pickle_path}") - print("To load: objects = pickle.load(open('filename.pkl', 'rb'))") - print( - "Note: Open3D point clouds excluded - use point_cloud_numpy and colors_numpy for processing" - ) - - # Visualize results - print("Visualizing results...") - visualize_results(updated_objects) - - except Exception as e: - print(f"Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/pointcloud/test_pointcloud_filtering.py b/dimos/perception/pointcloud/test_pointcloud_filtering.py new file mode 100644 index 0000000000..4b4e5c7c4f --- /dev/null +++ b/dimos/perception/pointcloud/test_pointcloud_filtering.py @@ -0,0 +1,259 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import cv2 +import numpy as np +import pytest +import open3d as o3d + +from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering +from dimos.perception.pointcloud.utils import load_camera_matrix_from_yaml +from dimos.types.manipulation import ObjectData + + +class TestPointcloudFiltering: + def test_pointcloud_filtering_initialization(self): + """Test PointcloudFiltering initializes correctly with default parameters.""" + try: + filtering = PointcloudFiltering() + assert filtering is not None + assert filtering.color_weight == 0.3 + assert filtering.enable_statistical_filtering == True + assert filtering.enable_radius_filtering == True + assert filtering.enable_subsampling == True + except Exception as e: + pytest.skip(f"Skipping test due to initialization error: {e}") + + def test_pointcloud_filtering_with_custom_params(self): + """Test PointcloudFiltering with custom parameters.""" + try: + filtering = PointcloudFiltering( + color_weight=0.5, + enable_statistical_filtering=False, + enable_radius_filtering=False, + voxel_size=0.01, + max_num_objects=5, + ) + assert filtering.color_weight == 0.5 + assert filtering.enable_statistical_filtering == False + assert filtering.enable_radius_filtering == False + assert filtering.voxel_size == 0.01 + assert filtering.max_num_objects == 5 + except Exception as e: + pytest.skip(f"Skipping test due to initialization error: {e}") + + def test_pointcloud_filtering_process_images(self): + """Test PointcloudFiltering can process RGB-D images and return filtered point clouds.""" + try: + 
# Import data inside method to avoid pytest fixture confusion + from dimos.utils.data import get_data + + # Load test RGB-D data + data_dir = get_data("rgbd_frames") + + # Load first frame + color_path = os.path.join(data_dir, "color", "00000.png") + depth_path = os.path.join(data_dir, "depth", "00000.png") + intrinsics_path = os.path.join(data_dir, "color_camera_info.yaml") + + assert os.path.exists(color_path), f"Color image not found: {color_path}" + assert os.path.exists(depth_path), f"Depth image not found: {depth_path}" + assert os.path.exists(intrinsics_path), f"Intrinsics file not found: {intrinsics_path}" + + # Load images + color_img = cv2.imread(color_path) + color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) + + depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + if depth_img.dtype == np.uint16: + depth_img = depth_img.astype(np.float32) / 1000.0 + + # Load camera intrinsics + camera_matrix = load_camera_matrix_from_yaml(intrinsics_path) + if camera_matrix is None: + pytest.skip("Failed to load camera intrinsics") + + # Create mock objects with segmentation masks + height, width = color_img.shape[:2] + + # Create simple rectangular masks for testing + mock_objects = [] + + # Object 1: Top-left quadrant + mask1 = np.zeros((height, width), dtype=bool) + mask1[height // 4 : height // 2, width // 4 : width // 2] = True + + obj1: ObjectData = { + "object_id": 1, + "confidence": 0.9, + "bbox": [width // 4, height // 4, width // 2, height // 2], + "segmentation_mask": mask1, + "name": "test_object_1", + } + mock_objects.append(obj1) + + # Object 2: Bottom-right quadrant + mask2 = np.zeros((height, width), dtype=bool) + mask2[height // 2 : 3 * height // 4, width // 2 : 3 * width // 4] = True + + obj2: ObjectData = { + "object_id": 2, + "confidence": 0.8, + "bbox": [width // 2, height // 2, 3 * width // 4, 3 * height // 4], + "segmentation_mask": mask2, + "name": "test_object_2", + } + mock_objects.append(obj2) + + # Initialize filtering with 
intrinsics + filtering = PointcloudFiltering( + color_intrinsics=camera_matrix, + depth_intrinsics=camera_matrix, + enable_statistical_filtering=False, # Disable for faster testing + enable_radius_filtering=False, # Disable for faster testing + voxel_size=0.01, # Larger voxel for faster processing + ) + + # Process images + results = filtering.process_images(color_img, depth_img, mock_objects) + + print( + f"Processing results - Input objects: {len(mock_objects)}, Output objects: {len(results)}" + ) + + # Verify results + assert isinstance(results, list), "Results should be a list" + assert len(results) <= len(mock_objects), "Should not return more objects than input" + + # Check each result object + for i, result in enumerate(results): + print(f"Object {i}: {result.get('name', 'unknown')}") + + # Verify required fields exist + assert "point_cloud" in result, "Result should contain point_cloud" + assert "color" in result, "Result should contain color" + assert "point_cloud_numpy" in result, "Result should contain point_cloud_numpy" + + # Verify point cloud is valid Open3D object + pcd = result["point_cloud"] + assert isinstance(pcd, o3d.geometry.PointCloud), ( + "point_cloud should be Open3D PointCloud" + ) + + # Verify numpy arrays + points_array = result["point_cloud_numpy"] + assert isinstance(points_array, np.ndarray), ( + "point_cloud_numpy should be numpy array" + ) + assert points_array.shape[1] == 3, "Point array should have 3 columns (x,y,z)" + assert points_array.dtype == np.float32, "Point array should be float32" + + # Verify color + color = result["color"] + assert isinstance(color, np.ndarray), "Color should be numpy array" + assert color.shape == (3,), "Color should be RGB triplet" + assert color.dtype == np.uint8, "Color should be uint8" + + # Check if 3D information was added (when enough points for cuboid fitting) + points = np.asarray(pcd.points) + if len(points) >= filtering.min_points_for_cuboid: + if "position" in result: + assert "rotation" 
in result, "Should have rotation if position exists" + assert "size" in result, "Should have size if position exists" + + # Verify position format + from dimos.types.vector import Vector + + position = result["position"] + assert isinstance(position, Vector), "Position should be Vector" + + # Verify size format + size = result["size"] + assert isinstance(size, dict), "Size should be dict" + assert "width" in size and "height" in size and "depth" in size + + print(f" - Points: {len(points)}") + print(f" - Color: {color}") + if "position" in result: + print(f" - Position: {result['position']}") + print(f" - Size: {result['size']}") + + # Test full point cloud access + full_pcd = filtering.get_full_point_cloud() + if full_pcd is not None: + assert isinstance(full_pcd, o3d.geometry.PointCloud), ( + "Full point cloud should be Open3D PointCloud" + ) + full_points = np.asarray(full_pcd.points) + print(f"Full point cloud points: {len(full_points)}") + + print("All pointcloud filtering tests passed!") + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + def test_pointcloud_filtering_empty_objects(self): + """Test PointcloudFiltering with empty object list.""" + try: + from dimos.utils.data import get_data + + # Load test data + data_dir = get_data("rgbd_frames") + color_path = os.path.join(data_dir, "color", "00000.png") + depth_path = os.path.join(data_dir, "depth", "00000.png") + + if not (os.path.exists(color_path) and os.path.exists(depth_path)): + pytest.skip("Test images not found") + + color_img = cv2.imread(color_path) + color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) + depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + if depth_img.dtype == np.uint16: + depth_img = depth_img.astype(np.float32) / 1000.0 + + filtering = PointcloudFiltering() + + # Test with empty object list + results = filtering.process_images(color_img, depth_img, []) + + assert isinstance(results, list), "Results should be a list" + assert len(results) == 
0, "Should return empty list for empty input" + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + def test_color_generation_consistency(self): + """Test that color generation is consistent for the same object ID.""" + try: + filtering = PointcloudFiltering() + + # Test color generation consistency + color1 = filtering.generate_color_from_id(42) + color2 = filtering.generate_color_from_id(42) + color3 = filtering.generate_color_from_id(43) + + assert np.array_equal(color1, color2), "Same ID should generate same color" + assert not np.array_equal(color1, color3), ( + "Different IDs should generate different colors" + ) + assert color1.shape == (3,), "Color should be RGB triplet" + assert color1.dtype == np.uint8, "Color should be uint8" + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/test_manipulation_pipeline_single_frame.py b/tests/test_manipulation_pipeline_single_frame.py index 1fc42d0810..061eb9035e 100644 --- a/tests/test_manipulation_pipeline_single_frame.py +++ b/tests/test_manipulation_pipeline_single_frame.py @@ -15,12 +15,12 @@ """Test manipulation processor with direct visualization and grasp data output.""" import os -import sys import cv2 import numpy as np -import time import argparse import matplotlib +import tests.test_header +from dimos.utils.data import get_data # Try to use TkAgg backend for live display, fallback to Agg if not available try: @@ -32,9 +32,6 @@ matplotlib.use("Agg") # Fallback to non-interactive import matplotlib.pyplot as plt import open3d as o3d -from typing import Dict, List - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid from dimos.perception.manip_aio_processer import ManipulationProcessor @@ -114,7 +111,7 @@ def run_processor(color_img, depth_img, 
intrinsics, grasp_server_url=None): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--data-dir", default="assets/rgbd_data") + parser.add_argument("--data-dir", default=get_data("rgbd_frames")) parser.add_argument("--wait-time", type=float, default=5.0) parser.add_argument( "--grasp-server-url", diff --git a/tests/test_manipulation_pipeline_single_frame_lcm.py b/tests/test_manipulation_pipeline_single_frame_lcm.py index d8090575eb..635f82c9c9 100644 --- a/tests/test_manipulation_pipeline_single_frame_lcm.py +++ b/tests/test_manipulation_pipeline_single_frame_lcm.py @@ -18,13 +18,11 @@ import sys import cv2 import numpy as np -import time import argparse import threading import pickle import matplotlib -import json -import copy +import tests.test_header # Try to use TkAgg backend for live display, fallback to Agg if not available try: @@ -43,8 +41,6 @@ from lcm_msgs.sensor_msgs import Image as LCMImage from lcm_msgs.sensor_msgs import CameraInfo as LCMCameraInfo -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid from dimos.perception.manip_aio_processer import ManipulationProcessor from dimos.perception.grasp_generation.utils import visualize_grasps_3d From 05c1b4ff08daad2911ab7e854688089d336c75ed Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Wed, 16 Jul 2025 14:24:39 -0700 Subject: [PATCH 3/7] Added RGB-D to LFS --- data/.lfs/rgbd_frames.tar.gz | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 data/.lfs/rgbd_frames.tar.gz diff --git a/data/.lfs/rgbd_frames.tar.gz b/data/.lfs/rgbd_frames.tar.gz new file mode 100644 index 0000000000..8081c76961 --- /dev/null +++ b/data/.lfs/rgbd_frames.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381b9fd296a885f5211a668df16c68581d2aee458c8734c3256a7461f0decccd +size 948391033 From 37a5621db0510f45981ba86c1811393784d8e06b Mon Sep 17 00:00:00 
2001 From: alexlin2 Date: Wed, 16 Jul 2025 14:25:46 -0700 Subject: [PATCH 4/7] Don't gitignore /data/.lfs .tar.gz files --- .gitignore | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 6378d36468..f5c93cf65f 100644 --- a/.gitignore +++ b/.gitignore @@ -37,7 +37,8 @@ package-lock.json # Ignore build artifacts dist/ -# Ignore data and modelfiles -data/ +# Ignore data directory but keep .lfs subdirectory +data/* +!data/.lfs/ FastSAM-x.pt yolo11n.pt From d20e5f9695e08f79b0c85b46d8003ed6aa921d0c Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 17 Jul 2025 23:08:11 -0700 Subject: [PATCH 5/7] Removed unused pointcloud utilities --- .../pointcloud/pointcloud_filtering.py | 1 - dimos/perception/pointcloud/utils.py | 285 ------------------ 2 files changed, 286 deletions(-) diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py index ea50cd03d2..3de2f3ae6a 100644 --- a/dimos/perception/pointcloud/pointcloud_filtering.py +++ b/dimos/perception/pointcloud/pointcloud_filtering.py @@ -26,7 +26,6 @@ from dimos.perception.pointcloud.utils import ( load_camera_matrix_from_yaml, create_point_cloud_and_extract_masks, - o3d_point_cloud_to_numpy, ) from dimos.perception.pointcloud.cuboid_fit import fit_cuboid diff --git a/dimos/perception/pointcloud/utils.py b/dimos/perception/pointcloud/utils.py index 16cb0e0800..0813c2ca0e 100644 --- a/dimos/perception/pointcloud/utils.py +++ b/dimos/perception/pointcloud/utils.py @@ -28,62 +28,6 @@ from scipy.spatial import cKDTree -def depth_to_point_cloud(depth_image, camera_intrinsics, subsample_factor=4): - """ - Convert depth image to point cloud using camera intrinsics. - Subsamples points to reduce density. 
- - Args: - depth_image: HxW depth image in meters - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - subsample_factor: Factor to subsample points (higher = fewer points) - - Returns: - Nx3 array of 3D points - """ - # Filter out inf and nan values from depth image - depth_filtered = depth_image.copy() - - # Create mask for valid depth values (finite, positive, non-zero) - valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) - - # Set invalid values to 0 - depth_filtered[~valid_mask] = 0.0 - - # Extract camera parameters - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - else: - fx = camera_intrinsics[0, 0] - fy = camera_intrinsics[1, 1] - cx = camera_intrinsics[0, 2] - cy = camera_intrinsics[1, 2] - - # Create pixel coordinate grid - rows, cols = depth_filtered.shape - x_grid, y_grid = np.meshgrid( - np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor) - ) - - # Flatten grid and depth - x = x_grid.flatten() - y = y_grid.flatten() - z = depth_filtered[y_grid, x_grid].flatten() - - # Remove points with invalid depth (after filtering, this catches zeros) - valid = z > 0 - x = x[valid] - y = y[valid] - z = z[valid] - - # Convert to 3D points - X = (x - cx) * z / fx - Y = (y - cy) * z / fy - Z = z - - return np.column_stack([X, Y, Z]) - - def load_camera_matrix_from_yaml( camera_info: Optional[Union[str, List[float], np.ndarray, dict]], ) -> Optional[np.ndarray]: @@ -251,123 +195,6 @@ def create_o3d_point_cloud_from_rgbd( return pcd -def o3d_point_cloud_to_numpy(pcd: o3d.geometry.PointCloud) -> np.ndarray: - """ - Convert Open3D point cloud to numpy array of XYZRGB points. 
- - Args: - pcd: Open3D point cloud object - - Returns: - Nx6 array of XYZRGB points (empty array if no points) - """ - points = np.asarray(pcd.points) - if len(points) == 0: - return np.zeros((0, 6), dtype=np.float32) - - # Get colors if available - if pcd.has_colors(): - colors = np.asarray(pcd.colors) * 255.0 # Convert from [0,1] to [0,255] - return np.column_stack([points, colors]).astype(np.float32) - else: - # No colors available, return points with zero colors - zeros = np.zeros((len(points), 3), dtype=np.float32) - return np.column_stack([points, zeros]).astype(np.float32) - - -def numpy_to_o3d_point_cloud(points_rgb: np.ndarray) -> o3d.geometry.PointCloud: - """ - Convert numpy array of XYZRGB points to Open3D point cloud. - - Args: - points_rgb: Nx6 array of XYZRGB points or Nx3 array of XYZ points - - Returns: - Open3D point cloud object - - Raises: - ValueError: If array shape is invalid - """ - if len(points_rgb) == 0: - return o3d.geometry.PointCloud() - - if points_rgb.shape[1] < 3: - raise ValueError( - f"points_rgb must have at least 3 columns (XYZ), got {points_rgb.shape[1]}" - ) - - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points_rgb[:, :3]) - - # Add colors if available - if points_rgb.shape[1] >= 6: - colors = points_rgb[:, 3:6] / 255.0 # Convert from [0,255] to [0,1] - colors = np.clip(colors, 0.0, 1.0) # Ensure valid range - pcd.colors = o3d.utility.Vector3dVector(colors) - - return pcd - - -def create_masked_point_cloud(color_img, depth_img, mask, intrinsic, depth_scale=1.0): - """ - Create a point cloud for a masked region of RGBD data using Open3D. 
- - Args: - color_img: RGB image (H, W, 3) - depth_img: Depth image (H, W) - mask: Boolean mask of the same size as color_img and depth_img - intrinsic: Camera intrinsic matrix (3x3 numpy array) - depth_scale: Scale factor to convert depth to meters - - Returns: - Open3D point cloud object for the masked region - """ - # Filter out inf and nan values from depth image - depth_filtered = depth_img.copy() - - # Create mask for valid depth values (finite, positive, non-zero) - valid_depth_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) - - # Set invalid values to 0 - depth_filtered[~valid_depth_mask] = 0.0 - - # Create masked color and depth images - masked_color = color_img.copy() - masked_depth = depth_filtered.copy() - - # Apply mask - if not mask.shape[:2] == color_img.shape[:2]: - raise ValueError(f"Mask shape {mask.shape} doesn't match image shape {color_img.shape[:2]}") - - # Create a boolean mask that is properly expanded for the RGB channels - # For RGB image, we need to properly broadcast the mask to all 3 channels - if len(color_img.shape) == 3 and color_img.shape[2] == 3: - # Properly broadcast mask to match the RGB dimensions - mask_rgb = np.broadcast_to(mask[:, :, np.newaxis], color_img.shape) - masked_color[~mask_rgb] = 0 - else: - # For grayscale images - masked_color[~mask] = 0 - - # Apply mask to depth image - masked_depth[~mask] = 0 - - # Create point cloud - pcd = create_o3d_point_cloud_from_rgbd(masked_color, masked_depth, intrinsic, depth_scale) - - # Remove points with coordinates at origin (0,0,0) which are likely from masked out regions - points = np.asarray(pcd.points) - if len(points) > 0: - # Find points that are not at origin - dist_from_origin = np.sum(points**2, axis=1) - valid_indices = dist_from_origin > 1e-6 - - # Filter points and colors - pcd = pcd.select_by_index(np.where(valid_indices)[0]) - - return pcd - - def create_point_cloud_and_extract_masks( color_img: np.ndarray, depth_img: np.ndarray, @@ -437,89 +264,6 @@ def 
create_point_cloud_and_extract_masks( return full_pcd, masked_pcds -def extract_masked_point_cloud_efficient( - full_pcd: o3d.geometry.PointCloud, depth_img: np.ndarray, mask: np.ndarray -) -> o3d.geometry.PointCloud: - """ - Extract a masked region from an existing point cloud efficiently. - - This assumes the point cloud was created from the given depth image. - Use this when you have a pre-computed full point cloud and want to extract - individual masked regions. - - Args: - full_pcd: Complete Open3D point cloud - depth_img: Depth image used to create the point cloud (H, W) - mask: Boolean mask (H, W) - - Returns: - Open3D point cloud for the masked region - - Raises: - ValueError: If mask shape doesn't match depth image - """ - if mask.shape != depth_img.shape: - raise ValueError( - f"Mask shape {mask.shape} doesn't match depth image shape {depth_img.shape}" - ) - - # Early return if no points in full point cloud - if len(np.asarray(full_pcd.points)) == 0: - return o3d.geometry.PointCloud() - - # Get valid depth mask - valid_depth = depth_img.flatten() > 0 - mask_flat = mask.flatten() - - # Find pixels that are both valid and in the mask - valid_mask_indices = mask_flat & valid_depth - - # Get indices of valid points - point_indices = np.where(valid_mask_indices[valid_depth])[0] - - # Extract the masked point cloud - if len(point_indices) > 0: - return full_pcd.select_by_index(point_indices) - else: - return o3d.geometry.PointCloud() - - -def segment_and_remove_plane(pcd, distance_threshold=0.02, ransac_n=3, num_iterations=1000): - """ - Segment the dominant plane from a point cloud using RANSAC and remove it. - Often used to remove table tops, floors, walls, or other planar surfaces. 
- - Args: - pcd: Open3D point cloud object - distance_threshold: Maximum distance a point can be from the plane to be considered an inlier (in meters) - ransac_n: Number of points to sample for each RANSAC iteration - num_iterations: Number of RANSAC iterations - - Returns: - Open3D point cloud with the dominant plane removed - """ - # Make a copy of the input point cloud to avoid modifying the original - pcd_filtered = o3d.geometry.PointCloud() - pcd_filtered.points = o3d.utility.Vector3dVector(np.asarray(pcd.points)) - if pcd.has_colors(): - pcd_filtered.colors = o3d.utility.Vector3dVector(np.asarray(pcd.colors)) - if pcd.has_normals(): - pcd_filtered.normals = o3d.utility.Vector3dVector(np.asarray(pcd.normals)) - - # Check if point cloud has enough points - if len(pcd_filtered.points) < ransac_n: - return pcd_filtered - - # Run RANSAC to find the largest plane - _, inliers = pcd_filtered.segment_plane( - distance_threshold=distance_threshold, ransac_n=ransac_n, num_iterations=num_iterations - ) - - # Remove the dominant plane (regardless of orientation) - pcd_without_dominant_plane = pcd_filtered.select_by_index(inliers, invert=True) - return pcd_without_dominant_plane - - def filter_point_cloud_statistical( pcd: o3d.geometry.PointCloud, nb_neighbors: int = 20, std_ratio: float = 2.0 ) -> Tuple[o3d.geometry.PointCloud, np.ndarray]: @@ -560,35 +304,6 @@ def filter_point_cloud_radius( return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius) -def compute_point_cloud_bounds(pcd: o3d.geometry.PointCloud) -> dict: - """ - Compute bounding box information for a point cloud. 
- - Args: - pcd: Input point cloud - - Returns: - Dictionary with bounds information - """ - points = np.asarray(pcd.points) - if len(points) == 0: - return { - "min": np.array([0, 0, 0]), - "max": np.array([0, 0, 0]), - "center": np.array([0, 0, 0]), - "size": np.array([0, 0, 0]), - "volume": 0.0, - } - - min_bound = points.min(axis=0) - max_bound = points.max(axis=0) - center = (min_bound + max_bound) / 2 - size = max_bound - min_bound - volume = np.prod(size) - - return {"min": min_bound, "max": max_bound, "center": center, "size": size, "volume": volume} - - def project_3d_points_to_2d( points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] ) -> np.ndarray: From ffa68d33f075879bcb21e6f96bb80d4402ff2597 Mon Sep 17 00:00:00 2001 From: stash Date: Fri, 18 Jul 2025 00:34:59 -0700 Subject: [PATCH 6/7] Removed unused grasp utils --- dimos/perception/grasp_generation/utils.py | 51 ---------------------- 1 file changed, 51 deletions(-) diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py index 5a79a7d3bc..94377363f2 100644 --- a/dimos/perception/grasp_generation/utils.py +++ b/dimos/perception/grasp_generation/utils.py @@ -63,32 +63,6 @@ def project_3d_points_to_2d( return points_2d -def euler_to_rotation_matrix(roll: float, pitch: float, yaw: float) -> np.ndarray: - """ - Convert Euler angles to rotation matrix. 
- - Args: - roll: Roll angle in radians - pitch: Pitch angle in radians - yaw: Yaw angle in radians - - Returns: - 3x3 rotation matrix - """ - Rx = np.array([[1, 0, 0], [0, np.cos(roll), -np.sin(roll)], [0, np.sin(roll), np.cos(roll)]]) - - Ry = np.array( - [[np.cos(pitch), 0, np.sin(pitch)], [0, 1, 0], [-np.sin(pitch), 0, np.cos(pitch)]] - ) - - Rz = np.array([[np.cos(yaw), -np.sin(yaw), 0], [np.sin(yaw), np.cos(yaw), 0], [0, 0, 1]]) - - # Combined rotation matrix - R = Rz @ Ry @ Rx - - return R - - def create_gripper_geometry( grasp_data: dict, finger_length: float = 0.08, @@ -527,31 +501,6 @@ def visualize_grasps_3d( o3d.visualization.draw_geometries(geometries, window_name="3D Grasp Visualization") -def rotation_matrix_to_euler(rotation_matrix: np.ndarray) -> Tuple[float, float, float]: - """ - Convert 3x3 rotation matrix to Euler angles (roll, pitch, yaw). - - Args: - rotation_matrix: 3x3 rotation matrix - - Returns: - Tuple of (roll, pitch, yaw) in radians - """ - sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) - singular = sy < 1e-6 - - if not singular: - x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) # roll - y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch - z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) # yaw - else: - x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) # roll - y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch - z = 0 # yaw - - return x, y, z - - def parse_grasp_results(grasps: List[Dict]) -> List[Dict]: """ Parse grasp results into visualization format. 
From 256d9f9697bd50e2d906db3751e0419d0ccce079 Mon Sep 17 00:00:00 2001 From: stash Date: Fri, 18 Jul 2025 00:40:32 -0700 Subject: [PATCH 7/7] Fixed file rename --- ...on_pipeline.py.py => test_manipulation_perception_pipeline.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_manipulation_perception_pipeline.py.py => test_manipulation_perception_pipeline.py} (100%) diff --git a/tests/test_manipulation_perception_pipeline.py.py b/tests/test_manipulation_perception_pipeline.py similarity index 100% rename from tests/test_manipulation_perception_pipeline.py.py rename to tests/test_manipulation_perception_pipeline.py