From 4383737bfe2a5622d8d10fdc460737c7f8727849 Mon Sep 17 00:00:00 2001 From: Alex Lin Date: Tue, 15 Jul 2025 19:16:20 -0700 Subject: [PATCH 1/7] Point Cloud Filtering and Segmentation, Full 6DOF Object pose estimation, Grasp generation, ZED driver support, Hosted grasp integration --- dimos/hardware/zed_camera.py | 313 ++++ dimos/perception/common/__init__.py | 2 +- dimos/perception/common/cuboid_fit.py | 331 ---- dimos/perception/common/utils.py | 331 ++++ dimos/perception/grasp_generation/__init__.py | 1 + .../grasp_generation/grasp_generation.py | 228 +++ dimos/perception/grasp_generation/utils.py | 621 ++++++++ dimos/perception/manip_aio_pipeline.py | 590 +++++++ dimos/perception/manip_aio_processer.py | 411 +++++ dimos/perception/object_detection_stream.py | 67 +- dimos/perception/pointcloud/__init__.py | 3 + dimos/perception/pointcloud/cuboid_fit.py | 414 +++++ .../pointcloud/pointcloud_filtering.py | 674 ++++++++ dimos/perception/pointcloud/utils.py | 1367 +++++++++++++++++ dimos/perception/segmentation/sam_2d_seg.py | 17 +- dimos/perception/segmentation/utils.py | 6 + dimos/stream/stereo_camera_streams/zed.py | 164 ++ dimos/types/manipulation.py | 19 +- ...est_manipulation_perception_pipeline.py.py | 167 ++ ...test_manipulation_pipeline_single_frame.py | 248 +++ ..._manipulation_pipeline_single_frame_lcm.py | 431 ++++++ tests/test_pointcloud_filtering.py | 105 ++ tests/test_zed_setup.py | 182 +++ tests/visualization_script.py | 1041 +++++++++++++ tests/zed_neural_depth_demo.py | 450 ++++++ 25 files changed, 7783 insertions(+), 400 deletions(-) create mode 100644 dimos/hardware/zed_camera.py delete mode 100644 dimos/perception/common/cuboid_fit.py create mode 100644 dimos/perception/common/utils.py create mode 100644 dimos/perception/grasp_generation/__init__.py create mode 100644 dimos/perception/grasp_generation/grasp_generation.py create mode 100644 dimos/perception/grasp_generation/utils.py create mode 100644 dimos/perception/manip_aio_pipeline.py create 
mode 100644 dimos/perception/manip_aio_processer.py create mode 100644 dimos/perception/pointcloud/__init__.py create mode 100644 dimos/perception/pointcloud/cuboid_fit.py create mode 100644 dimos/perception/pointcloud/pointcloud_filtering.py create mode 100644 dimos/perception/pointcloud/utils.py create mode 100644 dimos/stream/stereo_camera_streams/zed.py create mode 100644 tests/test_manipulation_perception_pipeline.py.py create mode 100644 tests/test_manipulation_pipeline_single_frame.py create mode 100644 tests/test_manipulation_pipeline_single_frame_lcm.py create mode 100644 tests/test_pointcloud_filtering.py create mode 100755 tests/test_zed_setup.py create mode 100644 tests/visualization_script.py create mode 100755 tests/zed_neural_depth_demo.py diff --git a/dimos/hardware/zed_camera.py b/dimos/hardware/zed_camera.py new file mode 100644 index 0000000000..b93d2577e6 --- /dev/null +++ b/dimos/hardware/zed_camera.py @@ -0,0 +1,313 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import cv2 +import open3d as o3d +from typing import Optional, Tuple, Dict, Any +import logging + +try: + import pyzed.sl as sl +except ImportError: + sl = None + logging.warning("ZED SDK not found. 
Please install pyzed to use ZED camera functionality.") + +from dimos.hardware.stereo_camera import StereoCamera + +logger = logging.getLogger(__name__) + + +class ZEDCamera(StereoCamera): + """ZED Camera capture node with neural depth processing.""" + + def __init__( + self, + camera_id: int = 0, + resolution: sl.RESOLUTION = sl.RESOLUTION.HD720, + depth_mode: sl.DEPTH_MODE = sl.DEPTH_MODE.NEURAL, + fps: int = 30, + **kwargs, + ): + """ + Initialize ZED Camera. + + Args: + camera_id: Camera ID (0 for first ZED) + resolution: ZED camera resolution + depth_mode: Depth computation mode + fps: Camera frame rate (default: 30) + """ + if sl is None: + raise ImportError("ZED SDK not installed. Please install pyzed package.") + + super().__init__(**kwargs) + + self.camera_id = camera_id + self.resolution = resolution + self.depth_mode = depth_mode + self.fps = fps + + # Initialize ZED camera + self.zed = sl.Camera() + self.init_params = sl.InitParameters() + self.init_params.camera_resolution = resolution + self.init_params.depth_mode = depth_mode + self.init_params.coordinate_units = sl.UNIT.METER + self.init_params.camera_fps = fps + + # Set camera ID using the correct parameter name + if hasattr(self.init_params, "set_from_camera_id"): + self.init_params.set_from_camera_id(camera_id) + elif hasattr(self.init_params, "input"): + self.init_params.input.set_from_camera_id(camera_id) + + # Use enable_fill_mode instead of SENSING_MODE.STANDARD + self.runtime_params = sl.RuntimeParameters() + self.runtime_params.enable_fill_mode = True # False = STANDARD mode, True = FILL mode + + # Image containers + self.image_left = sl.Mat() + self.image_right = sl.Mat() + self.depth_map = sl.Mat() + self.point_cloud = sl.Mat() + self.confidence_map = sl.Mat() + + self.is_opened = False + + def open(self) -> bool: + """Open the ZED camera.""" + try: + err = self.zed.open(self.init_params) + if err != sl.ERROR_CODE.SUCCESS: + logger.error(f"Failed to open ZED camera: {err}") + return False 
+ + self.is_opened = True + logger.info("ZED camera opened successfully") + + # Get camera information + info = self.zed.get_camera_information() + logger.info(f"ZED Camera Model: {info.camera_model}") + logger.info(f"Serial Number: {info.serial_number}") + logger.info(f"Firmware: {info.camera_configuration.firmware_version}") + + return True + + except Exception as e: + logger.error(f"Error opening ZED camera: {e}") + return False + + def close(self): + """Close the ZED camera.""" + if self.is_opened: + self.zed.close() + self.is_opened = False + logger.info("ZED camera closed") + + def capture_frame( + self, + ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: + """ + Capture a frame from ZED camera. + + Returns: + Tuple of (left_image, right_image, depth_map) as numpy arrays + """ + if not self.is_opened: + logger.error("ZED camera not opened") + return None, None, None + + try: + # Grab frame + if self.zed.grab(self.runtime_params) == sl.ERROR_CODE.SUCCESS: + # Retrieve left image + self.zed.retrieve_image(self.image_left, sl.VIEW.LEFT) + left_img = self.image_left.get_data()[:, :, :3] # Remove alpha channel + + # Retrieve right image + self.zed.retrieve_image(self.image_right, sl.VIEW.RIGHT) + right_img = self.image_right.get_data()[:, :, :3] # Remove alpha channel + + # Retrieve depth map + self.zed.retrieve_measure(self.depth_map, sl.MEASURE.DEPTH) + depth = self.depth_map.get_data() + + return left_img, right_img, depth + else: + logger.warning("Failed to grab frame from ZED camera") + return None, None, None + + except Exception as e: + logger.error(f"Error capturing frame: {e}") + return None, None, None + + def capture_pointcloud(self) -> Optional[o3d.geometry.PointCloud]: + """ + Capture point cloud from ZED camera. 
+ + Returns: + Open3D point cloud with XYZ coordinates and RGB colors + """ + if not self.is_opened: + logger.error("ZED camera not opened") + return None + + try: + if self.zed.grab(self.runtime_params) == sl.ERROR_CODE.SUCCESS: + # Retrieve point cloud with RGBA data + self.zed.retrieve_measure(self.point_cloud, sl.MEASURE.XYZRGBA) + point_cloud_data = self.point_cloud.get_data() + + # Convert to numpy array format + height, width = point_cloud_data.shape[:2] + points = point_cloud_data.reshape(-1, 4) + + # Extract XYZ coordinates + xyz = points[:, :3] + + # Extract and unpack RGBA color data from 4th channel + rgba_packed = points[:, 3].view(np.uint32) + + # Unpack RGBA: each 32-bit value contains 4 bytes (R, G, B, A) + colors_rgba = np.zeros((len(rgba_packed), 4), dtype=np.uint8) + colors_rgba[:, 0] = rgba_packed & 0xFF # R + colors_rgba[:, 1] = (rgba_packed >> 8) & 0xFF # G + colors_rgba[:, 2] = (rgba_packed >> 16) & 0xFF # B + colors_rgba[:, 3] = (rgba_packed >> 24) & 0xFF # A + + # Extract RGB (ignore alpha) and normalize to [0, 1] + colors_rgb = colors_rgba[:, :3].astype(np.float64) / 255.0 + + # Filter out invalid points (NaN or inf) + valid = np.isfinite(xyz).all(axis=1) + valid_xyz = xyz[valid] + valid_colors = colors_rgb[valid] + + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + + if len(valid_xyz) > 0: + pcd.points = o3d.utility.Vector3dVector(valid_xyz) + pcd.colors = o3d.utility.Vector3dVector(valid_colors) + + return pcd + else: + logger.warning("Failed to grab frame for point cloud") + return None + + except Exception as e: + logger.error(f"Error capturing point cloud: {e}") + return None + + def get_camera_info(self) -> Dict[str, Any]: + """Get ZED camera information and calibration parameters.""" + if not self.is_opened: + return {} + + try: + info = self.zed.get_camera_information() + calibration = info.camera_configuration.calibration_parameters + + # In ZED SDK 4.0+, the baseline calculation has changed + # Try to get baseline 
from the stereo parameters + try: + # Method 1: Try to get from stereo parameters if available + if hasattr(calibration, "getCameraBaseline"): + baseline = calibration.getCameraBaseline() + else: + # Method 2: Calculate from left and right camera positions + # The baseline is the distance between left and right cameras + left_cam = calibration.left_cam + right_cam = calibration.right_cam + + # Try different ways to get baseline in SDK 4.0+ + if hasattr(info.camera_configuration, "calibration_parameters_raw"): + # Use raw calibration if available + raw_calib = info.camera_configuration.calibration_parameters_raw + if hasattr(raw_calib, "T"): + baseline = abs(raw_calib.T[0]) + else: + baseline = 0.12 # Default ZED-M baseline approximation + else: + # Use default baseline for ZED-M + baseline = 0.12 # ZED-M baseline is approximately 120mm + except: + baseline = 0.12 # Fallback to approximate ZED-M baseline + + return { + "model": str(info.camera_model), + "serial_number": info.serial_number, + "firmware": info.camera_configuration.firmware_version, + "resolution": { + "width": info.camera_configuration.resolution.width, + "height": info.camera_configuration.resolution.height, + }, + "fps": info.camera_configuration.fps, + "left_cam": { + "fx": calibration.left_cam.fx, + "fy": calibration.left_cam.fy, + "cx": calibration.left_cam.cx, + "cy": calibration.left_cam.cy, + "k1": calibration.left_cam.disto[0], + "k2": calibration.left_cam.disto[1], + "p1": calibration.left_cam.disto[2], + "p2": calibration.left_cam.disto[3], + "k3": calibration.left_cam.disto[4], + }, + "right_cam": { + "fx": calibration.right_cam.fx, + "fy": calibration.right_cam.fy, + "cx": calibration.right_cam.cx, + "cy": calibration.right_cam.cy, + "k1": calibration.right_cam.disto[0], + "k2": calibration.right_cam.disto[1], + "p1": calibration.right_cam.disto[2], + "p2": calibration.right_cam.disto[3], + "k3": calibration.right_cam.disto[4], + }, + "baseline": baseline, + } + except Exception as e: + 
logger.error(f"Error getting camera info: {e}") + return {} + + def calculate_intrinsics(self): + """Calculate camera intrinsics from ZED calibration.""" + info = self.get_camera_info() + if not info: + return super().calculate_intrinsics() + + left_cam = info.get("left_cam", {}) + resolution = info.get("resolution", {}) + + return { + "focal_length_x": left_cam.get("fx", 0), + "focal_length_y": left_cam.get("fy", 0), + "principal_point_x": left_cam.get("cx", 0), + "principal_point_y": left_cam.get("cy", 0), + "baseline": info.get("baseline", 0), + "resolution_width": resolution.get("width", 0), + "resolution_height": resolution.get("height", 0), + } + + def __enter__(self): + """Context manager entry.""" + if not self.open(): + raise RuntimeError("Failed to open ZED camera") + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.close() diff --git a/dimos/perception/common/__init__.py b/dimos/perception/common/__init__.py index ad815d3f46..e658a8734c 100644 --- a/dimos/perception/common/__init__.py +++ b/dimos/perception/common/__init__.py @@ -1,3 +1,3 @@ from .detection2d_tracker import target2dTracker, get_tracked_results -from .cuboid_fit import * from .ibvs import * +from .utils import * diff --git a/dimos/perception/common/cuboid_fit.py b/dimos/perception/common/cuboid_fit.py deleted file mode 100644 index 9848332c06..0000000000 --- a/dimos/perception/common/cuboid_fit.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -from sklearn.decomposition import PCA -import matplotlib.pyplot as plt -import cv2 - - -def depth_to_point_cloud(depth_image, camera_matrix, subsample_factor=4): - """ - Convert depth image to point cloud using camera intrinsics. - Subsamples points to reduce density. - - Args: - depth_image: HxW depth image in meters - camera_matrix: 3x3 camera intrinsic matrix - subsample_factor: Factor to subsample points (higher = fewer points) - - Returns: - Nx3 array of 3D points - """ - # Get focal length and principal point from camera matrix - fx = camera_matrix[0, 0] - fy = camera_matrix[1, 1] - cx = camera_matrix[0, 2] - cy = camera_matrix[1, 2] - - # Create pixel coordinate grid - rows, cols = depth_image.shape - x_grid, y_grid = np.meshgrid( - np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor) - ) - - # Flatten grid and depth - x = x_grid.flatten() - y = y_grid.flatten() - z = depth_image[y_grid, x_grid].flatten() - - # Remove points with invalid depth - valid = z > 0 - x = x[valid] - y = y[valid] - z = z[valid] - - # Convert to 3D points - X = (x - cx) * z / fx - Y = (y - cy) * z / fy - Z = z - - return np.column_stack([X, Y, Z]) - - -def fit_cuboid(points, n_iterations=5, inlier_thresh=2.0): - """ - Fit a cuboid to a point cloud using iteratively refined PCA. 
- - Args: - points: Nx3 array of points - n_iterations: Number of refinement iterations - inlier_thresh: Threshold for inlier detection in standard deviations - - Returns: - dict containing: - - center: 3D center point - - dimensions: 3D dimensions - - rotation: 3x3 rotation matrix - - error: fitting error - """ - points = np.asarray(points) - if len(points) < 4: - return None - - # Initial center estimate using median for robustness - best_error = float("inf") - best_params = None - center = np.median(points, axis=0) - current_points = points - center - - for iteration in range(n_iterations): - if len(current_points) < 4: # Need at least 4 points for PCA - break - - # Perform PCA - pca = PCA(n_components=3) - pca.fit(current_points) - - # Get rotation matrix from PCA - rotation = pca.components_ - - # Transform points to PCA space - local_points = current_points @ rotation.T - - # Initialize mask for this iteration - inlier_mask = np.ones(len(current_points), dtype=bool) - dimensions = np.zeros(3) - - # Filter points along each dimension - for dim in range(3): - points_1d = local_points[inlier_mask, dim] - if len(points_1d) < 4: - break - - median = np.median(points_1d) - mad = np.median(np.abs(points_1d - median)) - sigma = mad * 1.4826 # Convert MAD to standard deviation estimate - - # Avoid issues with constant values - if sigma < 1e-6: - continue - - # Update mask for this dimension - dim_inliers = np.abs(points_1d - median) < (inlier_thresh * sigma) - inlier_mask[inlier_mask] = dim_inliers - - # Calculate dimension based on robust statistics - valid_points = points_1d[dim_inliers] - if len(valid_points) > 0: - dimensions[dim] = np.max(valid_points) - np.min(valid_points) - - # Skip if we don't have enough inliers - if np.sum(inlier_mask) < 4: - continue - - # Calculate error for this iteration - # Mean squared distance from points to cuboid surface - half_dims = dimensions / 2 - dx = np.abs(local_points[:, 0]) - half_dims[0] - dy = np.abs(local_points[:, 1]) 
- half_dims[1] - dz = np.abs(local_points[:, 2]) - half_dims[2] - - outside_dist = np.sqrt( - np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2 - ) - inside_dist = np.minimum(np.maximum(np.maximum(dx, dy), dz), 0) - distances = outside_dist + inside_dist - error = np.mean(distances**2) - - if error < best_error: - best_error = error - best_params = { - "center": center, - "rotation": rotation, - "dimensions": dimensions, - "error": error, - } - - # Update points for next iteration - current_points = current_points[inlier_mask] - - return best_params - - -def compute_fitting_error(local_points, dimensions): - """Compute mean squared distance from points to cuboid surface.""" - half_dims = dimensions / 2 - dx = np.abs(local_points[:, 0]) - half_dims[0] - dy = np.abs(local_points[:, 1]) - half_dims[1] - dz = np.abs(local_points[:, 2]) - half_dims[2] - - outside_dist = np.sqrt(np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2) - inside_dist = np.minimum(np.maximum(np.maximum(dx, dy), dz), 0) - - distances = outside_dist + inside_dist - return np.mean(distances**2) - - -def get_cuboid_corners(center, dimensions, rotation): - """Get the 8 corners of a cuboid.""" - half_dims = dimensions / 2 - corners_local = ( - np.array( - [ - [-1, -1, -1], # 0: left bottom back - [-1, -1, 1], # 1: left bottom front - [-1, 1, -1], # 2: left top back - [-1, 1, 1], # 3: left top front - [1, -1, -1], # 4: right bottom back - [1, -1, 1], # 5: right bottom front - [1, 1, -1], # 6: right top back - [1, 1, 1], # 7: right top front - ] - ) - * half_dims - ) - - return corners_local @ rotation + center - - -def visualize_fit(image, cuboid_params, camera_matrix, R=None, t=None): - """ - Draw the fitted cuboid on the image. 
- """ - # Get corners in world coordinates - corners = get_cuboid_corners( - cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] - ) - - # Transform corners if R and t are provided - if R is not None and t is not None: - corners = (R @ corners.T).T + t - - # Project corners to image space - corners_img = ( - cv2.projectPoints( - corners, - np.zeros(3), - np.zeros(3), # Already in camera frame - camera_matrix, - None, - )[0] - .reshape(-1, 2) - .astype(int) - ) - - # Define edges for visualization - edges = [ - # Bottom face - (0, 1), - (1, 5), - (5, 4), - (4, 0), - # Top face - (2, 3), - (3, 7), - (7, 6), - (6, 2), - # Vertical edges - (0, 2), - (1, 3), - (5, 7), - (4, 6), - ] - - # Draw edges - vis_img = image.copy() - for i, j in edges: - cv2.line(vis_img, tuple(corners_img[i]), tuple(corners_img[j]), (0, 255, 0), 2) - - # Add text with dimensions - dims = cuboid_params["dimensions"] - dim_text = f"Dims: {dims[0]:.3f} x {dims[1]:.3f} x {dims[2]:.3f}" - cv2.putText(vis_img, dim_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) - - return vis_img - - -def plot_3d_fit(points, cuboid_params, title="3D Cuboid Fit"): - """Plot points and fitted cuboid in 3D.""" - fig = plt.figure(figsize=(10, 10)) - ax = fig.add_subplot(111, projection="3d") - - # Plot points - ax.scatter( - points[:, 0], points[:, 1], points[:, 2], c="b", marker=".", alpha=0.1, label="Points" - ) - - # Plot fitted cuboid - corners = get_cuboid_corners( - cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] - ) - - # Define edges - edges = [ - # Bottom face - (0, 1), - (1, 5), - (5, 4), - (4, 0), - # Top face - (2, 3), - (3, 7), - (7, 6), - (6, 2), - # Vertical edges - (0, 2), - (1, 3), - (5, 7), - (4, 6), - ] - - # Plot edges - for i, j in edges: - ax.plot3D( - [corners[i, 0], corners[j, 0]], - [corners[i, 1], corners[j, 1]], - [corners[i, 2], corners[j, 2]], - "r-", - ) - - # Set labels and title - ax.set_xlabel("X") - 
ax.set_ylabel("Y") - ax.set_zlabel("Z") - ax.set_title(title) - - # Make scaling uniform - all_points = np.vstack([points, corners]) - max_range = ( - np.array( - [ - all_points[:, 0].max() - all_points[:, 0].min(), - all_points[:, 1].max() - all_points[:, 1].min(), - all_points[:, 2].max() - all_points[:, 2].min(), - ] - ).max() - / 2.0 - ) - - mid_x = (all_points[:, 0].max() + all_points[:, 0].min()) * 0.5 - mid_y = (all_points[:, 1].max() + all_points[:, 1].min()) * 0.5 - mid_z = (all_points[:, 2].max() + all_points[:, 2].min()) * 0.5 - - ax.set_xlim(mid_x - max_range, mid_x + max_range) - ax.set_ylim(mid_y - max_range, mid_y + max_range) - ax.set_zlim(mid_z - max_range, mid_z + max_range) - - ax.set_box_aspect([1, 1, 1]) - plt.legend() - return fig, ax diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py new file mode 100644 index 0000000000..da9cc58fe0 --- /dev/null +++ b/dimos/perception/common/utils.py @@ -0,0 +1,331 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cv2 +import numpy as np +from typing import List, Tuple, Optional +from dimos.types.manipulation import ObjectData +from dimos.types.vector import Vector +from dimos.utils.logging_config import setup_logger +import torch + +logger = setup_logger("dimos.perception.common.utils") + + +def colorize_depth(depth_img: np.ndarray, max_depth: float = 5.0) -> Optional[np.ndarray]: + """ + Normalize and colorize depth image using COLORMAP_JET. + + Args: + depth_img: Depth image (H, W) in meters + max_depth: Maximum depth value for normalization + + Returns: + Colorized depth image (H, W, 3) in RGB format, or None if input is None + """ + if depth_img is None: + return None + + valid_mask = np.isfinite(depth_img) & (depth_img > 0) + depth_norm = np.zeros_like(depth_img) + depth_norm[valid_mask] = np.clip(depth_img[valid_mask] / max_depth, 0, 1) + depth_colored = cv2.applyColorMap((depth_norm * 255).astype(np.uint8), cv2.COLORMAP_JET) + depth_rgb = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB) + + # Make the depth image less bright by scaling down the values + depth_rgb = (depth_rgb * 0.6).astype(np.uint8) + + return depth_rgb + + +def draw_bounding_box( + image: np.ndarray, + bbox: List[float], + color: Tuple[int, int, int] = (0, 255, 0), + thickness: int = 2, + label: Optional[str] = None, + confidence: Optional[float] = None, + object_id: Optional[int] = None, + font_scale: float = 0.6, +) -> np.ndarray: + """ + Draw a bounding box with optional label on an image. 
+ + Args: + image: Image to draw on (H, W, 3) + bbox: Bounding box [x1, y1, x2, y2] + color: RGB color tuple for the box + thickness: Line thickness for the box + label: Optional class label + confidence: Optional confidence score + object_id: Optional object ID + font_scale: Font scale for text + + Returns: + Image with bounding box drawn + """ + x1, y1, x2, y2 = map(int, bbox) + + # Draw bounding box + cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness) + + # Create label text + text_parts = [] + if label is not None: + text_parts.append(str(label)) + if object_id is not None: + text_parts.append(f"ID: {object_id}") + if confidence is not None: + text_parts.append(f"({confidence:.2f})") + + if text_parts: + text = ", ".join(text_parts) + + # Draw text background + text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1)[0] + cv2.rectangle( + image, + (x1, y1 - text_size[1] - 5), + (x1 + text_size[0], y1), + (0, 0, 0), + -1, + ) + + # Draw text + cv2.putText( + image, + text, + (x1, y1 - 5), + cv2.FONT_HERSHEY_SIMPLEX, + font_scale, + (255, 255, 255), + 1, + ) + + return image + + +def draw_segmentation_mask( + image: np.ndarray, + mask: np.ndarray, + color: Tuple[int, int, int] = (0, 200, 200), + alpha: float = 0.5, + draw_contours: bool = True, + contour_thickness: int = 2, +) -> np.ndarray: + """ + Draw segmentation mask overlay on an image. 
+ + Args: + image: Image to draw on (H, W, 3) + mask: Segmentation mask (H, W) - boolean or uint8 + color: RGB color for the mask + alpha: Transparency factor (0.0 = transparent, 1.0 = opaque) + draw_contours: Whether to draw mask contours + contour_thickness: Thickness of contour lines + + Returns: + Image with mask overlay drawn + """ + if mask is None: + return image + + try: + # Ensure mask is uint8 + mask = mask.astype(np.uint8) + + # Create colored mask overlay + colored_mask = np.zeros_like(image) + colored_mask[mask > 0] = color + + # Apply the mask with transparency + mask_area = mask > 0 + image[mask_area] = cv2.addWeighted( + image[mask_area], 1 - alpha, colored_mask[mask_area], alpha, 0 + ) + + # Draw mask contours if requested + if draw_contours: + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(image, contours, -1, color, contour_thickness) + + except Exception as e: + logger.warning(f"Error drawing segmentation mask: {e}") + + return image + + +def draw_object_detection_visualization( + image: np.ndarray, + objects: List[ObjectData], + draw_masks: bool = False, + bbox_color: Tuple[int, int, int] = (0, 255, 0), + mask_color: Tuple[int, int, int] = (0, 200, 200), + font_scale: float = 0.6, +) -> np.ndarray: + """ + Create object detection visualization with bounding boxes and optional masks. 
+ + Args: + image: Base image to draw on (H, W, 3) + objects: List of ObjectData with detection information + draw_masks: Whether to draw segmentation masks + bbox_color: Default color for bounding boxes + mask_color: Default color for segmentation masks + font_scale: Font scale for text labels + + Returns: + Image with detection visualization + """ + viz_image = image.copy() + + for obj in objects: + try: + # Draw segmentation mask first (if enabled and available) + if draw_masks and "segmentation_mask" in obj and obj["segmentation_mask"] is not None: + viz_image = draw_segmentation_mask( + viz_image, obj["segmentation_mask"], color=mask_color, alpha=0.5 + ) + + # Draw bounding box + if "bbox" in obj and obj["bbox"] is not None: + # Use object's color if available, otherwise default + color = bbox_color + if "color" in obj and obj["color"] is not None: + obj_color = obj["color"] + if isinstance(obj_color, np.ndarray): + color = tuple(int(c) for c in obj_color) + elif isinstance(obj_color, (list, tuple)): + color = tuple(int(c) for c in obj_color[:3]) + + viz_image = draw_bounding_box( + viz_image, + obj["bbox"], + color=color, + label=obj.get("label"), + confidence=obj.get("confidence"), + object_id=obj.get("object_id"), + font_scale=font_scale, + ) + + except Exception as e: + logger.warning(f"Error drawing object visualization: {e}") + + return viz_image + + +def detection_results_to_object_data( + bboxes: List[List[float]], + track_ids: List[int], + class_ids: List[int], + confidences: List[float], + names: List[str], + masks: Optional[List[np.ndarray]] = None, + source: str = "detection", +) -> List[ObjectData]: + """ + Convert detection/segmentation results to ObjectData format. 
+ + Args: + bboxes: List of bounding boxes [x1, y1, x2, y2] + track_ids: List of tracking IDs + class_ids: List of class indices + confidences: List of detection confidences + names: List of class names + masks: Optional list of segmentation masks + source: Source type ("detection" or "segmentation") + + Returns: + List of ObjectData dictionaries + """ + objects = [] + + for i in range(len(bboxes)): + # Calculate basic properties from bbox + bbox = bboxes[i] + width = bbox[2] - bbox[0] + height = bbox[3] - bbox[1] + center_x = bbox[0] + width / 2 + center_y = bbox[1] + height / 2 + + # Create ObjectData + object_data: ObjectData = { + "object_id": track_ids[i] if i < len(track_ids) else i, + "bbox": bbox, + "depth": -1.0, # Will be populated by depth estimation or point cloud processing + "confidence": confidences[i] if i < len(confidences) else 1.0, + "class_id": class_ids[i] if i < len(class_ids) else 0, + "label": names[i] if i < len(names) else f"{source}_object", + "movement_tolerance": 1.0, # Default to freely movable + "segmentation_mask": masks[i].cpu().numpy() + if masks and i < len(masks) and isinstance(masks[i], torch.Tensor) + else masks[i] + if masks and i < len(masks) + else None, + # Initialize 3D properties (will be populated by point cloud processing) + "position": Vector(0, 0, 0), + "rotation": Vector(0, 0, 0), + "size": { + "width": 0.0, + "height": 0.0, + "depth": 0.0, + }, + } + objects.append(object_data) + + return objects + + +def combine_object_data( + list1: List[ObjectData], list2: List[ObjectData], overlap_threshold: float = 0.8 +) -> List[ObjectData]: + """ + Combine two ObjectData lists, removing duplicates based on segmentation mask overlap. 
+ """ + combined = list1.copy() + used_ids = set(obj.get("object_id", 0) for obj in list1) + next_id = max(used_ids) + 1 if used_ids else 1 + + for obj2 in list2: + obj_copy = obj2.copy() + + # Handle duplicate object_id + if obj_copy.get("object_id", 0) in used_ids: + obj_copy["object_id"] = next_id + next_id += 1 + used_ids.add(obj_copy["object_id"]) + + # Check mask overlap + mask2 = obj2.get("segmentation_mask") + if mask2 is None or np.sum(mask2 > 0) == 0: + combined.append(obj_copy) + continue + + mask2_area = np.sum(mask2 > 0) + is_duplicate = False + + for obj1 in list1: + mask1 = obj1.get("segmentation_mask") + if mask1 is None: + continue + + intersection = np.sum((mask1 > 0) & (mask2 > 0)) + if intersection / mask2_area >= overlap_threshold: + is_duplicate = True + break + + if not is_duplicate: + combined.append(obj_copy) + + return combined diff --git a/dimos/perception/grasp_generation/__init__.py b/dimos/perception/grasp_generation/__init__.py new file mode 100644 index 0000000000..16281fe0b6 --- /dev/null +++ b/dimos/perception/grasp_generation/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/dimos/perception/grasp_generation/grasp_generation.py b/dimos/perception/grasp_generation/grasp_generation.py new file mode 100644 index 0000000000..89e7a0036c --- /dev/null +++ b/dimos/perception/grasp_generation/grasp_generation.py @@ -0,0 +1,228 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Dimensional-hosted grasp generation for manipulation pipeline. +""" + +import asyncio +import numpy as np +import open3d as o3d +from typing import Dict, List, Optional + +from dimos.types.manipulation import ObjectData +from dimos.utils.logging_config import setup_logger +from dimos.perception.grasp_generation.utils import parse_grasp_results + +logger = setup_logger("dimos.perception.grasp_generation") + + +class HostedGraspGenerator: + """ + Dimensional-hosted grasp generator using WebSocket communication. + """ + + def __init__(self, server_url: str): + """ + Initialize Dimensional-hosted grasp generator. + + Args: + server_url: WebSocket URL for Dimensional-hosted grasp generator server + """ + self.server_url = server_url + logger.info(f"Initialized grasp generator with server: {server_url}") + + def generate_grasps_from_objects( + self, objects: List[ObjectData], full_pcd: o3d.geometry.PointCloud + ) -> List[Dict]: + """ + Generate grasps from ObjectData objects using grasp generator. 
+ + Args: + objects: List of ObjectData with point clouds + full_pcd: Open3D point cloud of full scene + + Returns: + Parsed grasp results as list of dictionaries + """ + try: + # Combine all point clouds + all_points = [] + all_colors = [] + valid_objects = 0 + + for obj in objects: + if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: + continue + + points = obj["point_cloud_numpy"] + if not isinstance(points, np.ndarray) or points.size == 0: + continue + + if len(points.shape) != 2 or points.shape[1] != 3: + continue + + colors = None + if "colors_numpy" in obj and obj["colors_numpy"] is not None: + colors = obj["colors_numpy"] + if isinstance(colors, np.ndarray) and colors.size > 0: + if ( + colors.shape[0] != points.shape[0] + or len(colors.shape) != 2 + or colors.shape[1] != 3 + ): + colors = None + + all_points.append(points) + if colors is not None: + all_colors.append(colors) + valid_objects += 1 + + if not all_points: + return [] + + # Combine point clouds + combined_points = np.vstack(all_points) + combined_colors = None + if len(all_colors) == valid_objects and len(all_colors) > 0: + combined_colors = np.vstack(all_colors) + + # Send grasp request + grasps = self._send_grasp_request_sync(combined_points, combined_colors) + + if not grasps: + return [] + + # Parse and return results in list of dictionaries format + return parse_grasp_results(grasps) + + except Exception as e: + logger.error(f"Grasp generation failed: {e}") + return [] + + def _send_grasp_request_sync( + self, points: np.ndarray, colors: Optional[np.ndarray] + ) -> Optional[List[Dict]]: + """Send synchronous grasp request to grasp server.""" + + try: + # Prepare colors + colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 + + # Ensure correct data types + points = points.astype(np.float32) + colors = colors.astype(np.float32) + + # Validate ranges + if np.any(np.isnan(points)) or np.any(np.isinf(points)): + logger.error("Points contain NaN or Inf values") + 
return None + if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): + logger.error("Colors contain NaN or Inf values") + return None + + colors = np.clip(colors, 0.0, 1.0) + + # Run async request in sync context + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + result = loop.run_until_complete(self._async_grasp_request(points, colors)) + return result + finally: + loop.close() + + except Exception as e: + logger.error(f"Error in synchronous grasp request: {e}") + return None + + async def _async_grasp_request( + self, points: np.ndarray, colors: np.ndarray + ) -> Optional[List[Dict]]: + """Async grasp request helper.""" + import json + import websockets + + try: + async with websockets.connect(self.server_url) as websocket: + request = { + "points": points.tolist(), + "colors": colors.tolist(), + "lims": [-1.0, 1.0, -1.0, 1.0, 0.0, 2.0], + } + + await websocket.send(json.dumps(request)) + response = await websocket.recv() + grasps = json.loads(response) + + if isinstance(grasps, dict) and "error" in grasps: + logger.error(f"Server returned error: {grasps['error']}") + return None + elif isinstance(grasps, (int, float)) and grasps == 0: + return None + elif not isinstance(grasps, list): + logger.error(f"Server returned unexpected response type: {type(grasps)}") + return None + elif len(grasps) == 0: + return None + + return self._convert_grasp_format(grasps) + + except Exception as e: + logger.error(f"Async grasp request failed: {e}") + return None + + def _convert_grasp_format(self, grasps: List[dict]) -> List[dict]: + """Convert Dimensional Grasp format to visualization format.""" + converted = [] + + for i, grasp in enumerate(grasps): + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) + euler_angles = self._rotation_matrix_to_euler(rotation_matrix) + + converted_grasp = { + "id": f"grasp_{i}", + "score": grasp.get("score", 0.0), + "width": grasp.get("width", 0.0), + "height": grasp.get("height", 0.0), + "depth": 
grasp.get("depth", 0.0), + "translation": grasp.get("translation", [0, 0, 0]), + "rotation_matrix": rotation_matrix.tolist(), + "euler_angles": euler_angles, + } + converted.append(converted_grasp) + + converted.sort(key=lambda x: x["score"], reverse=True) + return converted + + def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> Dict[str, float]: + """Convert rotation matrix to Euler angles (in radians).""" + sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) + + singular = sy < 1e-6 + + if not singular: + x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) + else: + x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = 0 + + return {"roll": x, "pitch": y, "yaw": z} + + def cleanup(self): + """Clean up resources.""" + logger.info("Grasp generator cleaned up") diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py new file mode 100644 index 0000000000..5a79a7d3bc --- /dev/null +++ b/dimos/perception/grasp_generation/utils.py @@ -0,0 +1,621 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for grasp generation and visualization.""" + +import numpy as np +import open3d as o3d +import cv2 +from typing import List, Dict, Tuple, Optional, Union + + +def project_3d_points_to_2d( + points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] +) -> np.ndarray: + """ + Project 3D points to 2D image coordinates using camera intrinsics. + + Args: + points_3d: Nx3 array of 3D points (X, Y, Z) + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + + Returns: + Nx2 array of 2D image coordinates (u, v) + """ + if len(points_3d) == 0: + return np.zeros((0, 2), dtype=np.int32) + + # Filter out points with zero or negative depth + valid_mask = points_3d[:, 2] > 0 + if not np.any(valid_mask): + return np.zeros((0, 2), dtype=np.int32) + + valid_points = points_3d[valid_mask] + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + camera_matrix = np.array(camera_intrinsics) + fx = camera_matrix[0, 0] + fy = camera_matrix[1, 1] + cx = camera_matrix[0, 2] + cy = camera_matrix[1, 2] + + # Project to image coordinates + u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx + v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy + + # Round to integer pixel coordinates + points_2d = np.column_stack([u, v]).astype(np.int32) + + return points_2d + + +def euler_to_rotation_matrix(roll: float, pitch: float, yaw: float) -> np.ndarray: + """ + Convert Euler angles to rotation matrix. 
+ + Args: + roll: Roll angle in radians + pitch: Pitch angle in radians + yaw: Yaw angle in radians + + Returns: + 3x3 rotation matrix + """ + Rx = np.array([[1, 0, 0], [0, np.cos(roll), -np.sin(roll)], [0, np.sin(roll), np.cos(roll)]]) + + Ry = np.array( + [[np.cos(pitch), 0, np.sin(pitch)], [0, 1, 0], [-np.sin(pitch), 0, np.cos(pitch)]] + ) + + Rz = np.array([[np.cos(yaw), -np.sin(yaw), 0], [np.sin(yaw), np.cos(yaw), 0], [0, 0, 1]]) + + # Combined rotation matrix + R = Rz @ Ry @ Rx + + return R + + +def create_gripper_geometry( + grasp_data: dict, + finger_length: float = 0.08, + finger_thickness: float = 0.004, +) -> List[o3d.geometry.TriangleMesh]: + """ + Create a simple fork-like gripper geometry from grasp data. + + Args: + grasp_data: Dictionary containing grasp parameters + - translation: 3D position list + - rotation_matrix: 3x3 rotation matrix defining gripper coordinate system + * X-axis: gripper width direction (opening/closing) + * Y-axis: finger length direction + * Z-axis: approach direction (toward object) + - width: Gripper opening width + finger_length: Length of gripper fingers (longer) + finger_thickness: Thickness of gripper fingers + base_height: Height of gripper base (longer) + color: RGB color for the gripper (solid blue) + + Returns: + List of Open3D TriangleMesh geometries for the gripper + """ + + translation = np.array(grasp_data["translation"]) + rotation_matrix = np.array(grasp_data["rotation_matrix"]) + + width = grasp_data.get("width", 0.04) + + # Create transformation matrix + transform = np.eye(4) + transform[:3, :3] = rotation_matrix + transform[:3, 3] = translation + + geometries = [] + + # Gripper dimensions + finger_width = 0.006 # Thickness of each finger + handle_length = 0.05 # Length of handle extending backward + + # Build gripper in local coordinate system: + # X-axis = width direction (left/right finger separation) + # Y-axis = finger length direction (fingers extend along +Y) + # Z-axis = approach direction (toward 
object, handle extends along -Z) + # IMPORTANT: Fingertips should be at origin (translation point) + + # Create left finger extending along +Y, positioned at +X + left_finger = o3d.geometry.TriangleMesh.create_box( + width=finger_width, # Thin finger + height=finger_length, # Extends along Y (finger length direction) + depth=finger_thickness, # Thin in Z direction + ) + left_finger.translate( + [ + width / 2 - finger_width / 2, # Position at +X (half width from center) + -finger_length, # Shift so fingertips are at origin + -finger_thickness / 2, # Center in Z + ] + ) + + # Create right finger extending along +Y, positioned at -X + right_finger = o3d.geometry.TriangleMesh.create_box( + width=finger_width, # Thin finger + height=finger_length, # Extends along Y (finger length direction) + depth=finger_thickness, # Thin in Z direction + ) + right_finger.translate( + [ + -width / 2 - finger_width / 2, # Position at -X (half width from center) + -finger_length, # Shift so fingertips are at origin + -finger_thickness / 2, # Center in Z + ] + ) + + # Create base connecting fingers - flat like a stickman body + base = o3d.geometry.TriangleMesh.create_box( + width=width + finger_width, # Full width plus finger thickness + height=finger_thickness, # Flat like fingers (stickman style) + depth=finger_thickness, # Thin like fingers + ) + base.translate( + [ + -width / 2 - finger_width / 2, # Start from left finger position + -finger_length - finger_thickness, # Behind fingers, adjusted for fingertips at origin + -finger_thickness / 2, # Center in Z + ] + ) + + # Create handle extending backward - flat stick like stickman arm + handle = o3d.geometry.TriangleMesh.create_box( + width=finger_width, # Same width as fingers + height=handle_length, # Extends backward along Y direction (same plane) + depth=finger_thickness, # Thin like fingers (same plane) + ) + handle.translate( + [ + -finger_width / 2, # Center in X + -finger_length + - finger_thickness + - handle_length, # Extend 
backward from base, adjusted for fingertips at origin + -finger_thickness / 2, # Same Z plane as other components + ] + ) + + # Use solid red color for all parts (user changed to red) + solid_color = [1.0, 0.0, 0.0] # Red color + + left_finger.paint_uniform_color(solid_color) + right_finger.paint_uniform_color(solid_color) + base.paint_uniform_color(solid_color) + handle.paint_uniform_color(solid_color) + + # Apply transformation to all parts + left_finger.transform(transform) + right_finger.transform(transform) + base.transform(transform) + handle.transform(transform) + + geometries.extend([left_finger, right_finger, base, handle]) + + return geometries + + +def create_all_gripper_geometries( + grasp_list: List[dict], max_grasps: int = -1 +) -> List[o3d.geometry.TriangleMesh]: + """ + Create gripper geometries for multiple grasps. + + Args: + grasp_list: List of grasp dictionaries + max_grasps: Maximum number of grasps to visualize (-1 for all) + + Returns: + List of all gripper geometries + """ + all_geometries = [] + + grasps_to_show = grasp_list if max_grasps < 0 else grasp_list[:max_grasps] + + for grasp in grasps_to_show: + gripper_parts = create_gripper_geometry(grasp) + all_geometries.extend(gripper_parts) + + return all_geometries + + +def draw_grasps_on_image( + image: np.ndarray, + grasp_data: Union[dict, Dict[Union[int, str], List[dict]], List[dict]], + camera_intrinsics: Union[List[float], np.ndarray], # [fx, fy, cx, cy] or 3x3 matrix + max_grasps: int = -1, # -1 means show all grasps + finger_length: float = 0.08, # Match 3D gripper + finger_thickness: float = 0.004, # Match 3D gripper +) -> np.ndarray: + """ + Draw fork-like gripper visualizations on the image matching 3D gripper design. 
+ + Args: + image: Base image to draw on + grasp_data: Can be: + - A single grasp dict + - A list of grasp dicts + - A dictionary mapping object IDs or "scene" to list of grasps + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + max_grasps: Maximum number of grasps to visualize (-1 for all) + finger_length: Length of gripper fingers (matches 3D design) + finger_thickness: Thickness of gripper fingers (matches 3D design) + + Returns: + Image with grasps drawn + """ + result = image.copy() + + # Convert camera intrinsics to 3x3 matrix if needed + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]) + else: + camera_matrix = np.array(camera_intrinsics) + + # Convert input to standard format + if isinstance(grasp_data, dict) and not any( + key in grasp_data for key in ["scene", 0, 1, 2, 3, 4, 5] + ): + # Single grasp + grasps_to_draw = [(grasp_data, 0)] + elif isinstance(grasp_data, list): + # List of grasps + grasps_to_draw = [(grasp, i) for i, grasp in enumerate(grasp_data)] + else: + # Dictionary of grasps by object ID + grasps_to_draw = [] + for obj_id, grasps in grasp_data.items(): + for i, grasp in enumerate(grasps): + grasps_to_draw.append((grasp, i)) + + # Limit number of grasps if specified + if max_grasps > 0: + grasps_to_draw = grasps_to_draw[:max_grasps] + + # Define grasp colors (solid red to match 3D design) + def get_grasp_color(index: int) -> tuple: + # Use solid red color for all grasps to match 3D design + return (0, 0, 255) # Red in BGR format for OpenCV + + # Draw each grasp + for grasp, index in grasps_to_draw: + try: + color = get_grasp_color(index) + thickness = max(1, 4 - index // 3) + + # Extract grasp parameters (using translation and rotation_matrix) + if "translation" not in grasp or "rotation_matrix" not in grasp: + continue + + translation = np.array(grasp["translation"]) + rotation_matrix = 
np.array(grasp["rotation_matrix"]) + width = grasp.get("width", 0.04) + + # Match 3D gripper dimensions + finger_width = 0.006 # Thickness of each finger (matches 3D) + handle_length = 0.05 # Length of handle extending backward (matches 3D) + + # Create gripper geometry in local coordinate system matching 3D design: + # X-axis = width direction (left/right finger separation) + # Y-axis = finger length direction (fingers extend along +Y) + # Z-axis = approach direction (toward object, handle extends along -Z) + # IMPORTANT: Fingertips should be at origin (translation point) + + # Left finger extending along +Y, positioned at +X + left_finger_points = np.array( + [ + [ + width / 2 - finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back left + [ + width / 2 + finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back right + [ + width / 2 + finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front right (at origin) + [ + width / 2 - finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front left (at origin) + ] + ) + + # Right finger extending along +Y, positioned at -X + right_finger_points = np.array( + [ + [ + -width / 2 - finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back left + [ + -width / 2 + finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Back right + [ + -width / 2 + finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front right (at origin) + [ + -width / 2 - finger_width / 2, + 0, + -finger_thickness / 2, + ], # Front left (at origin) + ] + ) + + # Base connecting fingers - flat rectangle behind fingers + base_points = np.array( + [ + [ + -width / 2 - finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Back left + [ + width / 2 + finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Back right + [ + width / 2 + finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Front right + [ + -width / 2 - 
finger_width / 2, + -finger_length, + -finger_thickness / 2, + ], # Front left + ] + ) + + # Handle extending backward - thin rectangle + handle_points = np.array( + [ + [ + -finger_width / 2, + -finger_length - finger_thickness - handle_length, + -finger_thickness / 2, + ], # Back left + [ + finger_width / 2, + -finger_length - finger_thickness - handle_length, + -finger_thickness / 2, + ], # Back right + [ + finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Front right + [ + -finger_width / 2, + -finger_length - finger_thickness, + -finger_thickness / 2, + ], # Front left + ] + ) + + # Transform all points to world frame + def transform_points(points): + # Apply rotation and translation + world_points = (rotation_matrix @ points.T).T + translation + return world_points + + left_finger_world = transform_points(left_finger_points) + right_finger_world = transform_points(right_finger_points) + base_world = transform_points(base_points) + handle_world = transform_points(handle_points) + + # Project to 2D + left_finger_2d = project_3d_points_to_2d(left_finger_world, camera_matrix) + right_finger_2d = project_3d_points_to_2d(right_finger_world, camera_matrix) + base_2d = project_3d_points_to_2d(base_world, camera_matrix) + handle_2d = project_3d_points_to_2d(handle_world, camera_matrix) + + # Draw left finger + pts = left_finger_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw right finger + pts = right_finger_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw base + pts = base_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw handle + pts = handle_2d.astype(np.int32) + cv2.polylines(result, [pts], True, color, thickness) + + # Draw grasp center (fingertips at origin) + center_2d = project_3d_points_to_2d(translation.reshape(1, -1), camera_matrix)[0] + cv2.circle(result, tuple(center_2d.astype(int)), 3, color, -1) + + except 
Exception as e: + # Skip this grasp if there's an error + continue + + return result + + +def get_standard_coordinate_transform(): + """ + Get a standard coordinate transformation matrix for consistent visualization. + + This transformation ensures that: + - X (red) axis points right + - Y (green) axis points up + - Z (blue) axis points toward viewer + + Returns: + 4x4 transformation matrix + """ + # Standard transformation matrix to ensure consistent coordinate frame orientation + transform = np.array( + [ + [1, 0, 0, 0], # X points right + [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) + [0, 0, -1, 0], # Z points toward viewer (flip depth) + [0, 0, 0, 1], + ] + ) + return transform + + +def visualize_grasps_3d( + point_cloud: o3d.geometry.PointCloud, + grasp_list: List[dict], + max_grasps: int = -1, +): + """ + Visualize grasps in 3D with point cloud. + + Args: + point_cloud: Open3D point cloud + grasp_list: List of grasp dictionaries + max_grasps: Maximum number of grasps to visualize + """ + # Apply standard coordinate transformation + transform = get_standard_coordinate_transform() + + # Transform point cloud + pc_copy = o3d.geometry.PointCloud(point_cloud) + pc_copy.transform(transform) + geometries = [pc_copy] + + # Transform gripper geometries + gripper_geometries = create_all_gripper_geometries(grasp_list, max_grasps) + for geom in gripper_geometries: + geom.transform(transform) + geometries.extend(gripper_geometries) + + # Add transformed coordinate frame + origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) + origin_frame.transform(transform) + geometries.append(origin_frame) + + o3d.visualization.draw_geometries(geometries, window_name="3D Grasp Visualization") + + +def rotation_matrix_to_euler(rotation_matrix: np.ndarray) -> Tuple[float, float, float]: + """ + Convert 3x3 rotation matrix to Euler angles (roll, pitch, yaw). 
+ + Args: + rotation_matrix: 3x3 rotation matrix + + Returns: + Tuple of (roll, pitch, yaw) in radians + """ + sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) + singular = sy < 1e-6 + + if not singular: + x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) # roll + y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch + z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) # yaw + else: + x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) # roll + y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch + z = 0 # yaw + + return x, y, z + + +def parse_grasp_results(grasps: List[Dict]) -> List[Dict]: + """ + Parse grasp results into visualization format. + + Args: + grasps: List of grasp dictionaries + + Returns: + List of dictionaries containing: + - id: Unique grasp identifier + - score: Confidence score (float) + - width: Gripper opening width (float) + - translation: 3D position [x, y, z] + - rotation_matrix: 3x3 rotation matrix as nested list + """ + if not grasps: + return [] + + parsed_grasps = [] + + for i, grasp in enumerate(grasps): + # Extract data from each grasp + translation = grasp.get("translation", [0, 0, 0]) + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) + score = float(grasp.get("score", 0.0)) + width = float(grasp.get("width", 0.08)) + + parsed_grasp = { + "id": f"grasp_{i}", + "score": score, + "width": width, + "translation": translation, + "rotation_matrix": rotation_matrix.tolist(), + } + parsed_grasps.append(parsed_grasp) + + return parsed_grasps + + +def create_grasp_overlay( + rgb_image: np.ndarray, + grasps: List[Dict], + camera_intrinsics: Union[List[float], np.ndarray], +) -> np.ndarray: + """ + Create grasp visualization overlay on RGB image. 
+ + Args: + rgb_image: RGB input image + grasps: List of grasp dictionaries in viz format + camera_intrinsics: Camera parameters + + Returns: + RGB image with grasp overlay + """ + try: + bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + result_bgr = draw_grasps_on_image( + bgr_image, + grasps, + camera_intrinsics, + max_grasps=-1, + ) + return cv2.cvtColor(result_bgr, cv2.COLOR_BGR2RGB) + except Exception as e: + return rgb_image.copy() diff --git a/dimos/perception/manip_aio_pipeline.py b/dimos/perception/manip_aio_pipeline.py new file mode 100644 index 0000000000..7c69e562cf --- /dev/null +++ b/dimos/perception/manip_aio_pipeline.py @@ -0,0 +1,590 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Asynchronous, reactive manipulation pipeline for realtime detection, filtering, and grasp generation. 
+""" + +import asyncio +import json +import logging +import threading +import time +import traceback +import websockets +from typing import Dict, List, Optional, Any +import numpy as np +import reactivex as rx +import reactivex.operators as ops +from dimos.utils.logging_config import setup_logger +from dimos.perception.detection2d.detic_2d_det import Detic2DDetector +from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering +from dimos.perception.object_detection_stream import ObjectDetectionStream +from dimos.perception.grasp_generation.utils import draw_grasps_on_image +from dimos.perception.pointcloud.utils import create_point_cloud_overlay_visualization +from dimos.perception.common.utils import colorize_depth +from dimos.utils.logging_config import setup_logger +import cv2 + +logger = setup_logger("dimos.perception.manip_aio_pipeline") + + +class ManipulationPipeline: + """ + Clean separated stream pipeline with frame buffering. + + - Object detection runs independently on RGB stream + - Point cloud processing subscribes to both detection and ZED streams separately + - Simple frame buffering to match RGB+depth+objects + """ + + def __init__( + self, + camera_intrinsics: List[float], # [fx, fy, cx, cy] + min_confidence: float = 0.6, + max_objects: int = 10, + vocabulary: Optional[str] = None, + grasp_server_url: Optional[str] = None, + enable_grasp_generation: bool = False, + ): + """ + Initialize the manipulation pipeline. 
+ + Args: + camera_intrinsics: [fx, fy, cx, cy] camera parameters + min_confidence: Minimum detection confidence threshold + max_objects: Maximum number of objects to process + vocabulary: Optional vocabulary for Detic detector + grasp_server_url: Optional WebSocket URL for Dimensional Grasp server + enable_grasp_generation: Whether to enable async grasp generation + """ + self.camera_intrinsics = camera_intrinsics + self.min_confidence = min_confidence + + # Grasp generation settings + self.grasp_server_url = grasp_server_url + self.enable_grasp_generation = enable_grasp_generation + + # Asyncio event loop for WebSocket communication + self.grasp_loop = None + self.grasp_loop_thread = None + + # Storage for grasp results and filtered objects + self.latest_grasps: List[dict] = [] # Simplified: just a list of grasps + self.grasps_consumed = False + self.latest_filtered_objects = [] + self.latest_rgb_for_grasps = None # Store RGB image for grasp overlay + self.grasp_lock = threading.Lock() + + # Track pending requests - simplified to single task + self.grasp_task: Optional[asyncio.Task] = None + + # Reactive subjects for streaming filtered objects and grasps + self.filtered_objects_subject = rx.subject.Subject() + self.grasps_subject = rx.subject.Subject() + self.grasp_overlay_subject = rx.subject.Subject() # Add grasp overlay subject + + # Initialize grasp client if enabled + if self.enable_grasp_generation and self.grasp_server_url: + self._start_grasp_loop() + + # Initialize object detector + self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) + + # Initialize point cloud processor + self.pointcloud_filter = PointcloudFiltering( + color_intrinsics=camera_intrinsics, + depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics + max_num_objects=max_objects, + ) + + logger.info(f"Initialized ManipulationPipeline with confidence={min_confidence}") + + def create_streams(self, zed_stream: rx.Observable) -> Dict[str, rx.Observable]: + 
""" + Create streams using exact old main logic. + """ + # Create ZED streams (from old main) + zed_frame_stream = zed_stream.pipe(ops.share()) + + # RGB stream for object detection (from old main) + video_stream = zed_frame_stream.pipe( + ops.map(lambda x: x.get("rgb") if x is not None else None), + ops.filter(lambda x: x is not None), + ops.share(), + ) + object_detector = ObjectDetectionStream( + camera_intrinsics=self.camera_intrinsics, + min_confidence=self.min_confidence, + class_filter=None, + detector=self.detector, + video_stream=video_stream, + disable_depth=True, + ) + + # Store latest frames for point cloud processing (from old main) + latest_rgb = None + latest_depth = None + latest_point_cloud_overlay = None + frame_lock = threading.Lock() + + # Subscribe to combined ZED frames (from old main) + def on_zed_frame(zed_data): + nonlocal latest_rgb, latest_depth + if zed_data is not None: + with frame_lock: + latest_rgb = zed_data.get("rgb") + latest_depth = zed_data.get("depth") + + # Depth stream for point cloud filtering (from old main) + def get_depth_or_overlay(zed_data): + if zed_data is None: + return None + + # Check if we have a point cloud overlay available + with frame_lock: + overlay = latest_point_cloud_overlay + + if overlay is not None: + return overlay + else: + # Return regular colorized depth + return colorize_depth(zed_data.get("depth"), max_depth=10.0) + + depth_stream = zed_frame_stream.pipe( + ops.map(get_depth_or_overlay), ops.filter(lambda x: x is not None), ops.share() + ) + + # Process object detection results with point cloud filtering (from old main) + def on_detection_next(result): + nonlocal latest_point_cloud_overlay + if "objects" in result and result["objects"]: + # Get latest RGB and depth frames + with frame_lock: + rgb = latest_rgb + depth = latest_depth + + if rgb is not None and depth is not None: + try: + filtered_objects = self.pointcloud_filter.process_images( + rgb, depth, result["objects"] + ) + + if 
filtered_objects: + # Store filtered objects + with self.grasp_lock: + self.latest_filtered_objects = filtered_objects + self.filtered_objects_subject.on_next(filtered_objects) + + # Create base image (colorized depth) + base_image = colorize_depth(depth, max_depth=10.0) + + # Create point cloud overlay visualization + overlay_viz = create_point_cloud_overlay_visualization( + base_image=base_image, + objects=filtered_objects, + intrinsics=self.camera_intrinsics, + ) + + # Store the overlay for the stream + with frame_lock: + latest_point_cloud_overlay = overlay_viz + + # Request grasps if enabled + if self.enable_grasp_generation and len(filtered_objects) > 0: + # Save RGB image for later grasp overlay + with frame_lock: + self.latest_rgb_for_grasps = rgb.copy() + + task = self.request_scene_grasps(filtered_objects) + if task: + # Check for results after a delay + def check_grasps_later(): + time.sleep(2.0) # Wait for grasp processing + # Wait for task to complete + if hasattr(self, "grasp_task") and self.grasp_task: + try: + result = self.grasp_task.result( + timeout=3.0 + ) # Get result with timeout + except Exception as e: + logger.warning(f"Grasp task failed or timeout: {e}") + + # Try to get latest grasps and create overlay + with self.grasp_lock: + grasps = self.latest_grasps + + if grasps and hasattr(self, "latest_rgb_for_grasps"): + # Create grasp overlay on the saved RGB image + try: + bgr_image = cv2.cvtColor( + self.latest_rgb_for_grasps, cv2.COLOR_RGB2BGR + ) + result_bgr = draw_grasps_on_image( + bgr_image, + grasps, + self.camera_intrinsics, + max_grasps=-1, # Show all grasps + ) + result_rgb = cv2.cvtColor( + result_bgr, cv2.COLOR_BGR2RGB + ) + + # Emit grasp overlay immediately + self.grasp_overlay_subject.on_next(result_rgb) + + except Exception as e: + logger.error(f"Error creating grasp overlay: {e}") + + # Emit grasps to stream + self.grasps_subject.on_next(grasps) + + threading.Thread(target=check_grasps_later, daemon=True).start() + else: + 
logger.warning("Failed to create grasp task") + except Exception as e: + logger.error(f"Error in point cloud filtering: {e}") + with frame_lock: + latest_point_cloud_overlay = None + + def on_error(error): + logger.error(f"Error in stream: {error}") + + def on_completed(): + logger.info("Stream completed") + + def start_subscriptions(): + """Start subscriptions in background thread (from old main)""" + # Subscribe to combined ZED frames + zed_frame_stream.subscribe(on_next=on_zed_frame) + + # Start subscriptions in background thread (from old main) + subscription_thread = threading.Thread(target=start_subscriptions, daemon=True) + subscription_thread.start() + time.sleep(2) # Give subscriptions time to start + + # Subscribe to object detection stream (from old main) + object_detector.get_stream().subscribe( + on_next=on_detection_next, on_error=on_error, on_completed=on_completed + ) + + # Create visualization stream for web interface (from old main) + viz_stream = object_detector.get_stream().pipe( + ops.map(lambda x: x["viz_frame"] if x is not None else None), + ops.filter(lambda x: x is not None), + ) + + # Create filtered objects stream + filtered_objects_stream = self.filtered_objects_subject + + # Create grasps stream + grasps_stream = self.grasps_subject + + # Create grasp overlay subject for immediate emission + grasp_overlay_stream = self.grasp_overlay_subject + + return { + "detection_viz": viz_stream, + "pointcloud_viz": depth_stream, + "objects": object_detector.get_stream().pipe(ops.map(lambda x: x.get("objects", []))), + "filtered_objects": filtered_objects_stream, + "grasps": grasps_stream, + "grasp_overlay": grasp_overlay_stream, + } + + def _start_grasp_loop(self): + """Start asyncio event loop in a background thread for WebSocket communication.""" + + def run_loop(): + self.grasp_loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.grasp_loop) + self.grasp_loop.run_forever() + + self.grasp_loop_thread = threading.Thread(target=run_loop, 
daemon=True) + self.grasp_loop_thread.start() + + # Wait for loop to start + while self.grasp_loop is None: + time.sleep(0.01) + + async def _send_grasp_request( + self, points: np.ndarray, colors: Optional[np.ndarray] + ) -> Optional[List[dict]]: + """Send grasp request to Dimensional Grasp server.""" + try: + # Comprehensive client-side validation to prevent server errors + + # Validate points array + if points is None: + logger.error("Points array is None") + return None + if not isinstance(points, np.ndarray): + logger.error(f"Points is not numpy array: {type(points)}") + return None + if points.size == 0: + logger.error("Points array is empty") + return None + if len(points.shape) != 2 or points.shape[1] != 3: + logger.error(f"Points has invalid shape {points.shape}, expected (N, 3)") + return None + if points.shape[0] < 100: # Minimum points for stable grasp detection + logger.error(f"Insufficient points for grasp detection: {points.shape[0]} < 100") + return None + + # Validate and prepare colors + if colors is not None: + if not isinstance(colors, np.ndarray): + colors = None + elif colors.size == 0: + colors = None + elif len(colors.shape) != 2 or colors.shape[1] != 3: + colors = None + elif colors.shape[0] != points.shape[0]: + colors = None + + # If no valid colors, create default colors (required by server) + if colors is None: + # Create default white colors for all points + colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 + + # Ensure data types are correct (server expects float32) + points = points.astype(np.float32) + colors = colors.astype(np.float32) + + # Validate ranges (basic sanity checks) + if np.any(np.isnan(points)) or np.any(np.isinf(points)): + logger.error("Points contain NaN or Inf values") + return None + if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): + logger.error("Colors contain NaN or Inf values") + return None + + # Clamp color values to valid range [0, 1] + colors = np.clip(colors, 0.0, 1.0) + + async with 
websockets.connect(self.grasp_server_url) as websocket: + request = { + "points": points.tolist(), + "colors": colors.tolist(), # Always send colors array + "lims": [-0.19, 0.12, 0.02, 0.15, 0.0, 1.0], # Default workspace limits + } + + await websocket.send(json.dumps(request)) + + response = await websocket.recv() + grasps = json.loads(response) + + # Handle server response validation + if isinstance(grasps, dict) and "error" in grasps: + logger.error(f"Server returned error: {grasps['error']}") + return None + elif isinstance(grasps, (int, float)) and grasps == 0: + return None + elif not isinstance(grasps, list): + logger.error( + f"Server returned unexpected response type: {type(grasps)}, value: {grasps}" + ) + return None + elif len(grasps) == 0: + return None + + converted_grasps = self._convert_grasp_format(grasps) + with self.grasp_lock: + self.latest_grasps = converted_grasps + self.grasps_consumed = False # Reset consumed flag + + # Emit to reactive stream + self.grasps_subject.on_next(self.latest_grasps) + + return converted_grasps + except websockets.exceptions.ConnectionClosed as e: + logger.error(f"WebSocket connection closed: {e}") + except websockets.exceptions.WebSocketException as e: + logger.error(f"WebSocket error: {e}") + except json.JSONDecodeError as e: + logger.error(f"Failed to parse server response as JSON: {e}") + except Exception as e: + logger.error(f"Error requesting grasps: {e}") + + return None + + def request_scene_grasps(self, objects: List[dict]) -> Optional[asyncio.Task]: + """Request grasps for entire scene by combining all object point clouds.""" + if not self.grasp_loop or not objects: + return None + + all_points = [] + all_colors = [] + valid_objects = 0 + + for i, obj in enumerate(objects): + # Validate point cloud data + if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: + continue + + points = obj["point_cloud_numpy"] + if not isinstance(points, np.ndarray) or points.size == 0: + continue + + # Ensure 
points have correct shape (N, 3) + if len(points.shape) != 2 or points.shape[1] != 3: + continue + + # Validate colors if present + colors = None + if "colors_numpy" in obj and obj["colors_numpy"] is not None: + colors = obj["colors_numpy"] + if isinstance(colors, np.ndarray) and colors.size > 0: + # Ensure colors match points count and have correct shape + if colors.shape[0] != points.shape[0]: + colors = None # Ignore colors for this object + elif len(colors.shape) != 2 or colors.shape[1] != 3: + colors = None # Ignore colors for this object + + all_points.append(points) + if colors is not None: + all_colors.append(colors) + valid_objects += 1 + + if not all_points: + return None + + try: + combined_points = np.vstack(all_points) + + # Only combine colors if ALL objects have valid colors + combined_colors = None + if len(all_colors) == valid_objects and len(all_colors) > 0: + combined_colors = np.vstack(all_colors) + + # Validate final combined data + if combined_points.size == 0: + logger.warning("Combined point cloud is empty") + return None + + if combined_colors is not None and combined_colors.shape[0] != combined_points.shape[0]: + logger.warning( + f"Color/point count mismatch: {combined_colors.shape[0]} colors vs {combined_points.shape[0]} points, dropping colors" + ) + combined_colors = None + + except Exception as e: + logger.error(f"Failed to combine point clouds: {e}") + return None + + try: + # Check if there's already a grasp task running + if hasattr(self, "grasp_task") and self.grasp_task and not self.grasp_task.done(): + return self.grasp_task + + task = asyncio.run_coroutine_threadsafe( + self._send_grasp_request(combined_points, combined_colors), self.grasp_loop + ) + + self.grasp_task = task + return task + except Exception as e: + logger.warning("Failed to create grasp task") + return None + + def get_latest_grasps(self, timeout: float = 5.0) -> Optional[List[dict]]: + """Get latest grasp results, waiting for new ones if current ones have been 
consumed.""" + # Mark current grasps as consumed and get a reference + with self.grasp_lock: + current_grasps = self.latest_grasps + self.grasps_consumed = True + + # If we already have grasps and they haven't been consumed, return them + if current_grasps is not None and not getattr(self, "grasps_consumed", False): + return current_grasps + + # Wait for new grasps + start_time = time.time() + while time.time() - start_time < timeout: + with self.grasp_lock: + # Check if we have new grasps (different from what we marked as consumed) + if self.latest_grasps is not None and not getattr(self, "grasps_consumed", False): + return self.latest_grasps + time.sleep(0.1) # Check every 100ms + + return None # Timeout reached + + def clear_grasps(self) -> None: + """Clear all stored grasp results.""" + with self.grasp_lock: + self.latest_grasps = [] + + def _prepare_colors(self, colors: Optional[np.ndarray]) -> Optional[np.ndarray]: + """Prepare colors array, converting from various formats if needed.""" + if colors is None: + return None + + if colors.max() > 1.0: + colors = colors / 255.0 + + return colors + + def _convert_grasp_format(self, grasps: List[dict]) -> List[dict]: + """Convert Grasp format to our visualization format.""" + converted = [] + + for i, grasp in enumerate(grasps): + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) + euler_angles = self._rotation_matrix_to_euler(rotation_matrix) + + converted_grasp = { + "id": f"grasp_{i}", + "score": grasp.get("score", 0.0), + "width": grasp.get("width", 0.0), + "height": grasp.get("height", 0.0), + "depth": grasp.get("depth", 0.0), + "translation": grasp.get("translation", [0, 0, 0]), + "rotation_matrix": rotation_matrix.tolist(), + "euler_angles": euler_angles, + } + converted.append(converted_grasp) + + converted.sort(key=lambda x: x["score"], reverse=True) + + return converted + + def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> Dict[str, float]: + """Convert rotation matrix 
to Euler angles (in radians).""" + sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) + + singular = sy < 1e-6 + + if not singular: + x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) + else: + x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) + y = np.arctan2(-rotation_matrix[2, 0], sy) + z = 0 + + return {"roll": x, "pitch": y, "yaw": z} + + def cleanup(self): + """Clean up resources.""" + if hasattr(self.detector, "cleanup"): + self.detector.cleanup() + + if self.grasp_loop and self.grasp_loop_thread: + self.grasp_loop.call_soon_threadsafe(self.grasp_loop.stop) + self.grasp_loop_thread.join(timeout=1.0) + + if hasattr(self.pointcloud_filter, "cleanup"): + self.pointcloud_filter.cleanup() + logger.info("ManipulationPipeline cleaned up") diff --git a/dimos/perception/manip_aio_processer.py b/dimos/perception/manip_aio_processer.py new file mode 100644 index 0000000000..2d80b527ce --- /dev/null +++ b/dimos/perception/manip_aio_processer.py @@ -0,0 +1,411 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Sequential manipulation processor for single-frame processing without reactive streams. 
+""" + +import logging +import time +from typing import Dict, List, Optional, Any, Tuple +import numpy as np +import cv2 + +from dimos.utils.logging_config import setup_logger +from dimos.perception.detection2d.detic_2d_det import Detic2DDetector +from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering +from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter +from dimos.perception.grasp_generation.grasp_generation import HostedGraspGenerator +from dimos.perception.grasp_generation.utils import create_grasp_overlay +from dimos.perception.pointcloud.utils import ( + create_point_cloud_overlay_visualization, + extract_and_cluster_misc_points, + overlay_point_clouds_on_image, +) +from dimos.perception.common.utils import ( + colorize_depth, + detection_results_to_object_data, + combine_object_data, +) + +logger = setup_logger("dimos.perception.manip_aio_processor") + + +class ManipulationProcessor: + """ + Sequential manipulation processor for single-frame processing. + + Processes RGB-D frames through object detection, point cloud filtering, + and grasp generation in a single thread without reactive streams. + """ + + def __init__( + self, + camera_intrinsics: List[float], # [fx, fy, cx, cy] + min_confidence: float = 0.6, + max_objects: int = 20, + vocabulary: Optional[str] = None, + enable_grasp_generation: bool = False, + grasp_server_url: Optional[str] = None, # Required when enable_grasp_generation=True + enable_segmentation: bool = True, + ): + """ + Initialize the manipulation processor. 
+ + Args: + camera_intrinsics: [fx, fy, cx, cy] camera parameters + min_confidence: Minimum detection confidence threshold + max_objects: Maximum number of objects to process + vocabulary: Optional vocabulary for Detic detector + enable_grasp_generation: Whether to enable grasp generation + grasp_server_url: WebSocket URL for Dimensional Grasp server (required when enable_grasp_generation=True) + enable_segmentation: Whether to enable semantic segmentation + segmentation_model: Segmentation model to use (SAM 2 or FastSAM) + """ + self.camera_intrinsics = camera_intrinsics + self.min_confidence = min_confidence + self.max_objects = max_objects + self.enable_grasp_generation = enable_grasp_generation + self.grasp_server_url = grasp_server_url + self.enable_segmentation = enable_segmentation + + # Validate grasp generation requirements + if enable_grasp_generation and not grasp_server_url: + raise ValueError("grasp_server_url is required when enable_grasp_generation=True") + + # Initialize object detector + self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) + + # Initialize point cloud processor + self.pointcloud_filter = PointcloudFiltering( + color_intrinsics=camera_intrinsics, + depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics + max_num_objects=max_objects, + ) + + # Initialize semantic segmentation + self.segmenter = None + if self.enable_segmentation: + self.segmenter = Sam2DSegmenter( + device="cuda", + use_tracker=False, # Disable tracker for simple segmentation + use_analyzer=False, # Disable analyzer for simple segmentation + ) + + # Initialize grasp generator if enabled + self.grasp_generator = None + if self.enable_grasp_generation: + try: + self.grasp_generator = HostedGraspGenerator(server_url=grasp_server_url) + logger.info("Hosted grasp generator initialized successfully") + except Exception as e: + logger.error(f"Failed to initialize hosted grasp generator: {e}") + self.grasp_generator = None + 
self.enable_grasp_generation = False + + logger.info( + f"Initialized ManipulationProcessor with confidence={min_confidence}, " + f"grasp_generation={enable_grasp_generation}" + ) + + def process_frame( + self, rgb_image: np.ndarray, depth_image: np.ndarray, generate_grasps: bool = None + ) -> Dict[str, Any]: + """ + Process a single RGB-D frame through the complete pipeline. + + Args: + rgb_image: RGB image (H, W, 3) + depth_image: Depth image (H, W) in meters + generate_grasps: Override grasp generation setting for this frame + + Returns: + Dictionary containing: + - detection_viz: Visualization of object detection + - pointcloud_viz: Visualization of point cloud overlay + - segmentation_viz: Visualization of semantic segmentation (if enabled) + - detection2d_objects: Raw detection results as ObjectData + - segmentation2d_objects: Raw segmentation results as ObjectData (if enabled) + - detected_objects: Detection (Object Detection) objects with point clouds filtered + - all_objects: Combined objects with intelligent duplicate removal + - full_pointcloud: Complete scene point cloud (if point cloud processing enabled) + - misc_clusters: List of clustered background/miscellaneous point clouds (DBSCAN) + - misc_voxel_grid: Open3D voxel grid approximating all misc/background points + - misc_pointcloud_viz: Visualization of misc/background cluster overlay + - grasps: Grasp results (list of dictionaries, if enabled) + - grasp_overlay: Grasp visualization overlay (if enabled) + - processing_time: Total processing time + """ + start_time = time.time() + results = {} + + try: + # Step 1: Object Detection + step_start = time.time() + detection_results = self.run_object_detection(rgb_image) + results["detection2d_objects"] = detection_results.get("objects", []) + results["detection_viz"] = detection_results.get("viz_frame") + detection_time = time.time() - step_start + + # Step 2: Semantic Segmentation (if enabled) + segmentation_time = 0 + if self.enable_segmentation: + 
step_start = time.time() + segmentation_results = self.run_segmentation(rgb_image) + results["segmentation2d_objects"] = segmentation_results.get("objects", []) + results["segmentation_viz"] = segmentation_results.get("viz_frame") + segmentation_time = time.time() - step_start + + # Step 3: Point Cloud Processing + pointcloud_time = 0 + detection2d_objects = results.get("detection2d_objects", []) + segmentation2d_objects = results.get("segmentation2d_objects", []) + + # Process detection objects if available + detected_objects = [] + if detection2d_objects: + step_start = time.time() + detected_objects = self.run_pointcloud_filtering( + rgb_image, depth_image, detection2d_objects + ) + pointcloud_time += time.time() - step_start + + # Process segmentation objects if available + segmentation_filtered_objects = [] + if segmentation2d_objects: + step_start = time.time() + segmentation_filtered_objects = self.run_pointcloud_filtering( + rgb_image, depth_image, segmentation2d_objects + ) + pointcloud_time += time.time() - step_start + + # Combine all objects using intelligent duplicate removal + all_objects = combine_object_data( + detected_objects, segmentation_filtered_objects, overlap_threshold=0.8 + ) + + # Get full point cloud + full_pcd = self.pointcloud_filter.get_full_point_cloud() + + # Extract misc/background points and create voxel grid + misc_start = time.time() + misc_clusters, misc_voxel_grid = extract_and_cluster_misc_points( + full_pcd, + all_objects, + eps=0.03, + min_points=100, + enable_filtering=True, + voxel_size=0.02, + ) + misc_time = time.time() - misc_start + + # Store results + results.update( + { + "detected_objects": detected_objects, + "all_objects": all_objects, + "full_pointcloud": full_pcd, + "misc_clusters": misc_clusters, + "misc_voxel_grid": misc_voxel_grid, + } + ) + + # Create point cloud visualizations + base_image = colorize_depth(depth_image, max_depth=10.0) + + # Create visualizations + results["pointcloud_viz"] = ( + 
create_point_cloud_overlay_visualization( + base_image=base_image, + objects=all_objects, + intrinsics=self.camera_intrinsics, + ) + if all_objects + else base_image + ) + + results["detected_pointcloud_viz"] = ( + create_point_cloud_overlay_visualization( + base_image=base_image, + objects=detected_objects, + intrinsics=self.camera_intrinsics, + ) + if detected_objects + else base_image + ) + + if misc_clusters: + # Generate consistent colors for clusters + cluster_colors = [ + tuple((np.random.RandomState(i + 100).rand(3) * 255).astype(int)) + for i in range(len(misc_clusters)) + ] + results["misc_pointcloud_viz"] = overlay_point_clouds_on_image( + base_image=base_image, + point_clouds=misc_clusters, + camera_intrinsics=self.camera_intrinsics, + colors=cluster_colors, + point_size=2, + alpha=0.6, + ) + else: + results["misc_pointcloud_viz"] = base_image + + # Step 4: Grasp Generation (if enabled) + should_generate_grasps = ( + generate_grasps if generate_grasps is not None else self.enable_grasp_generation + ) + + if should_generate_grasps and all_objects and full_pcd: + grasps = self.run_grasp_generation(all_objects, full_pcd) + results["grasps"] = grasps + if grasps: + results["grasp_overlay"] = create_grasp_overlay( + rgb_image, grasps, self.camera_intrinsics + ) + + except Exception as e: + logger.error(f"Error processing frame: {e}") + results["error"] = str(e) + + # Add timing information + total_time = time.time() - start_time + results.update( + { + "processing_time": total_time, + "timing_breakdown": { + "detection": detection_time if "detection_time" in locals() else 0, + "segmentation": segmentation_time if "segmentation_time" in locals() else 0, + "pointcloud": pointcloud_time if "pointcloud_time" in locals() else 0, + "misc_extraction": misc_time if "misc_time" in locals() else 0, + "total": total_time, + }, + } + ) + + return results + + def run_object_detection(self, rgb_image: np.ndarray) -> Dict[str, Any]: + """Run object detection on RGB 
image.""" + try: + # Convert RGB to BGR for Detic detector + bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + # Use process_image method from Detic detector + bboxes, track_ids, class_ids, confidences, names, masks = self.detector.process_image( + bgr_image + ) + + # Convert to ObjectData format using utility function + objects = detection_results_to_object_data( + bboxes=bboxes, + track_ids=track_ids, + class_ids=class_ids, + confidences=confidences, + names=names, + masks=masks, + source="detection", + ) + + # Create visualization using detector's built-in method + viz_frame = self.detector.visualize_results( + rgb_image, bboxes, track_ids, class_ids, confidences, names + ) + + return {"objects": objects, "viz_frame": viz_frame} + + except Exception as e: + logger.error(f"Object detection failed: {e}") + return {"objects": [], "viz_frame": rgb_image.copy()} + + def run_pointcloud_filtering( + self, rgb_image: np.ndarray, depth_image: np.ndarray, objects: List[Dict] + ) -> List[Dict]: + """Run point cloud filtering on detected objects.""" + try: + filtered_objects = self.pointcloud_filter.process_images( + rgb_image, depth_image, objects + ) + return filtered_objects if filtered_objects else [] + except Exception as e: + logger.error(f"Point cloud filtering failed: {e}") + return [] + + def run_segmentation(self, rgb_image: np.ndarray) -> Dict[str, Any]: + """Run semantic segmentation on RGB image.""" + if not self.segmenter: + return {"objects": [], "viz_frame": rgb_image.copy()} + + try: + # Convert RGB to BGR for segmenter + bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + # Get segmentation results + masks, bboxes, track_ids, probs, names = self.segmenter.process_image(bgr_image) + + # Convert to ObjectData format using utility function + objects = detection_results_to_object_data( + bboxes=bboxes, + track_ids=track_ids, + class_ids=list(range(len(bboxes))), # Use indices as class IDs for segmentation + confidences=probs, + names=names, + 
masks=masks, + source="segmentation", + ) + + # Create visualization + if masks: + viz_bgr = self.segmenter.visualize_results( + bgr_image, masks, bboxes, track_ids, probs, names + ) + # Convert back to RGB + viz_frame = cv2.cvtColor(viz_bgr, cv2.COLOR_BGR2RGB) + else: + viz_frame = rgb_image.copy() + + return {"objects": objects, "viz_frame": viz_frame} + + except Exception as e: + logger.error(f"Segmentation failed: {e}") + return {"objects": [], "viz_frame": rgb_image.copy()} + + def run_grasp_generation(self, filtered_objects: List[Dict], full_pcd) -> Optional[List[Dict]]: + """Run grasp generation using the configured generator.""" + if not self.grasp_generator: + logger.warning("Grasp generation requested but no generator available") + return None + + try: + # Generate grasps using the configured generator + grasps = self.grasp_generator.generate_grasps_from_objects(filtered_objects, full_pcd) + + # Return parsed results directly (list of grasp dictionaries) + return grasps + + except Exception as e: + logger.error(f"Grasp generation failed: {e}") + return None + + def cleanup(self): + """Clean up resources.""" + if hasattr(self.detector, "cleanup"): + self.detector.cleanup() + if hasattr(self.pointcloud_filter, "cleanup"): + self.pointcloud_filter.cleanup() + if self.segmenter and hasattr(self.segmenter, "cleanup"): + self.segmenter.cleanup() + if self.grasp_generator and hasattr(self.grasp_generator, "cleanup"): + self.grasp_generator.cleanup() + logger.info("ManipulationProcessor cleaned up") diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py index 3284d99f8b..4fb8fc2691 100644 --- a/dimos/perception/object_detection_stream.py +++ b/dimos/perception/object_detection_stream.py @@ -33,6 +33,7 @@ calculate_object_size_from_bbox, calculate_position_rotation_from_bbox, ) +from dimos.perception.common.utils import draw_object_detection_visualization from dimos.types.vector import Vector from typing import 
Optional, Union, Callable from dimos.types.manipulation import ObjectData @@ -234,68 +235,10 @@ def process_frame(frame): objects.append(object_data) - # Add visualization - x1, y1, x2, y2 = map(int, bbox) - color = (0, 255, 0) # Green for detected objects - mask_color = (0, 200, 200) # Yellow-green for masks - - # Draw segmentation mask if available and valid - try: - if self.draw_masks and object_data["segmentation_mask"] is not None: - # Create a colored mask overlay - mask = object_data["segmentation_mask"].astype(np.uint8) - colored_mask = np.zeros_like(viz_frame) - colored_mask[mask > 0] = mask_color - - # Apply the mask with transparency - alpha = 0.5 # transparency factor - mask_area = mask > 0 - viz_frame[mask_area] = cv2.addWeighted( - viz_frame[mask_area], 1 - alpha, colored_mask[mask_area], alpha, 0 - ) - - # Draw mask contour - contours, _ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE - ) - cv2.drawContours(viz_frame, contours, -1, mask_color, 2) - except Exception as e: - logger.warning(f"Error drawing segmentation mask: {e}") - - # Draw bounding box with metadata - try: - cv2.rectangle(viz_frame, (x1, y1), (x2, y2), color, 1) - - # Add text for class only (removed position data) - # Handle possible None values for class_name or track_ids[i] - class_text = class_name if class_name is not None else "Unknown" - id_text = ( - track_ids[i] if i < len(track_ids) and track_ids[i] is not None else "?" 
- ) - text = f"{class_text}, ID: {id_text}" - - # Draw text background with smaller font - text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.3, 1)[0] - cv2.rectangle( - viz_frame, - (x1, y1 - text_size[1] - 5), - (x1 + text_size[0], y1), - (0, 0, 0), - -1, - ) - - # Draw text with smaller font - cv2.putText( - viz_frame, - text, - (x1, y1 - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.3, - (255, 255, 255), - 1, - ) - except Exception as e: - logger.warning(f"Error drawing bounding box or text: {e}") + # Create visualization using common function + viz_frame = draw_object_detection_visualization( + viz_frame, objects, draw_masks=self.draw_masks, font_scale=1.5 + ) return {"frame": frame, "viz_frame": viz_frame, "objects": objects} diff --git a/dimos/perception/pointcloud/__init__.py b/dimos/perception/pointcloud/__init__.py new file mode 100644 index 0000000000..1f282bb738 --- /dev/null +++ b/dimos/perception/pointcloud/__init__.py @@ -0,0 +1,3 @@ +from .utils import * +from .cuboid_fit import * +from .pointcloud_filtering import * diff --git a/dimos/perception/pointcloud/cuboid_fit.py b/dimos/perception/pointcloud/cuboid_fit.py new file mode 100644 index 0000000000..d567f40395 --- /dev/null +++ b/dimos/perception/pointcloud/cuboid_fit.py @@ -0,0 +1,414 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def fit_cuboid(
    points: Union[np.ndarray, o3d.geometry.PointCloud], method: str = "minimal"
) -> Optional[Dict]:
    """
    Fit a cuboid to a point cloud using Open3D's built-in bounding boxes.

    Args:
        points: Nx3 array of points or Open3D PointCloud.
        method: Fitting method:
            - 'minimal': Minimal oriented bounding box (best fit)
            - 'oriented': PCA-based oriented bounding box
            - 'axis_aligned': Axis-aligned bounding box

    Returns:
        Dictionary with keys ``center``, ``dimensions``, ``rotation``,
        ``error``, ``bounding_box`` (Open3D OrientedBoundingBox) and
        ``method``, or None if there are fewer than 4 points or Open3D
        fitting fails.

    Raises:
        ValueError: If method is invalid or inputs are malformed.
    """
    valid_methods = ["minimal", "oriented", "axis_aligned"]
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}, got '{method}'")

    # Normalize input to both an Nx3 array and an Open3D point cloud.
    if isinstance(points, np.ndarray):
        points = np.asarray(points)
        if len(points.shape) != 2 or points.shape[1] != 3:
            raise ValueError(f"points array must be Nx3, got shape {points.shape}")
        if len(points) < 4:
            return None  # a cuboid needs at least 4 non-degenerate points
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(points)
    elif isinstance(points, o3d.geometry.PointCloud):
        pcd = points
        points = np.asarray(pcd.points)
        if len(points) < 4:
            return None
    else:
        raise ValueError(f"points must be numpy array or Open3D PointCloud, got {type(points)}")

    try:
        if method == "minimal":
            obb = pcd.get_minimal_oriented_bounding_box(robust=True)
        elif method == "oriented":
            obb = pcd.get_oriented_bounding_box(robust=True)
        else:  # axis_aligned: wrap the AABB as an OBB for a uniform return type
            aabb = pcd.get_axis_aligned_bounding_box()
            obb = o3d.geometry.OrientedBoundingBox()
            obb.center = aabb.get_center()
            obb.extent = aabb.get_extent()
            obb.R = np.eye(3)  # identity rotation for axis-aligned

        center = np.asarray(obb.center)
        dimensions = np.asarray(obb.extent)
        rotation = np.asarray(obb.R)
        error = _compute_fitting_error(points, center, dimensions, rotation)

        return {
            "center": center,
            "dimensions": dimensions,
            "rotation": rotation,
            "error": error,
            "bounding_box": obb,
            "method": method,
        }

    except Exception as e:
        # Log error but don't crash - return None for graceful handling.
        print(f"Warning: Cuboid fitting failed with method '{method}': {e}")
        return None


def fit_cuboid_simple(points: Union[np.ndarray, o3d.geometry.PointCloud]) -> Optional[Dict]:
    """Convenience wrapper: fit with the minimal oriented bounding box.

    Returns the same dictionary as :func:`fit_cuboid`, or None on failure.
    """
    return fit_cuboid(points, method="minimal")


def _compute_fitting_error(
    points: np.ndarray, center: np.ndarray, dimensions: np.ndarray, rotation: np.ndarray
) -> float:
    """
    Compute fitting error as mean squared distance from points to the cuboid surface.

    Args:
        points: Nx3 array of points.
        center: 3D center point.
        dimensions: 3D dimensions (full extents).
        rotation: 3x3 rotation matrix (columns are the box's local axes).

    Returns:
        Mean squared point-to-surface distance (0.0 for no points).
    """
    if len(points) == 0:
        return 0.0

    # World -> local box frame: R^T (p - c), written as row-vector matmul.
    local_points = (points - center) @ rotation
    half_dims = dimensions / 2

    # Per-axis signed excess beyond each face (negative while inside).
    dx = np.abs(local_points[:, 0]) - half_dims[0]
    dy = np.abs(local_points[:, 1]) - half_dims[1]
    dz = np.abs(local_points[:, 2]) - half_dims[2]

    # Outside points: Euclidean distance to the nearest face/edge/corner.
    outside_dist = np.sqrt(np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2)

    # Bug fix: an interior point's distance to the surface is the gap to its
    # *nearest* face, i.e. -max(dx, dy, dz). The old code used -min(...),
    # which measured the farthest face and overstated the error for any
    # off-center interior point.
    inside_dist = -np.maximum(np.maximum(dx, dy), dz)
    distances = np.where((dx > 0) | (dy > 0) | (dz > 0), outside_dist, inside_dist)

    return float(np.mean(distances**2))
+ + Args: + center: 3D center point + dimensions: 3D dimensions + rotation: 3x3 rotation matrix + + Returns: + 8x3 array of corner coordinates + """ + half_dims = dimensions / 2 + corners_local = ( + np.array( + [ + [-1, -1, -1], # 0: left bottom back + [-1, -1, 1], # 1: left bottom front + [-1, 1, -1], # 2: left top back + [-1, 1, 1], # 3: left top front + [1, -1, -1], # 4: right bottom back + [1, -1, 1], # 5: right bottom front + [1, 1, -1], # 6: right top back + [1, 1, 1], # 7: right top front + ] + ) + * half_dims + ) + + # Apply rotation and translation + return corners_local @ rotation.T + center + + +def visualize_cuboid_on_image( + image: np.ndarray, + cuboid_params: Dict, + camera_matrix: np.ndarray, + extrinsic_rotation: Optional[np.ndarray] = None, + extrinsic_translation: Optional[np.ndarray] = None, + color: Tuple[int, int, int] = (0, 255, 0), + thickness: int = 2, + show_dimensions: bool = True, +) -> np.ndarray: + """ + Draw a fitted cuboid on an image using camera projection. 
+ + Args: + image: Input image to draw on + cuboid_params: Dictionary containing cuboid parameters + camera_matrix: Camera intrinsic matrix (3x3) + extrinsic_rotation: Optional external rotation (3x3) + extrinsic_translation: Optional external translation (3x1) + color: Line color as (B, G, R) tuple + thickness: Line thickness + show_dimensions: Whether to display dimension text + + Returns: + Image with cuboid visualization + + Raises: + ValueError: If required parameters are missing or invalid + """ + # Validate inputs + required_keys = ["center", "dimensions", "rotation"] + if not all(key in cuboid_params for key in required_keys): + raise ValueError(f"cuboid_params must contain keys: {required_keys}") + + if camera_matrix.shape != (3, 3): + raise ValueError(f"camera_matrix must be 3x3, got {camera_matrix.shape}") + + # Get corners in world coordinates + corners = get_cuboid_corners( + cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] + ) + + # Transform corners if extrinsic parameters are provided + if extrinsic_rotation is not None and extrinsic_translation is not None: + if extrinsic_rotation.shape != (3, 3): + raise ValueError(f"extrinsic_rotation must be 3x3, got {extrinsic_rotation.shape}") + if extrinsic_translation.shape not in [(3,), (3, 1)]: + raise ValueError( + f"extrinsic_translation must be (3,) or (3,1), got {extrinsic_translation.shape}" + ) + + extrinsic_translation = extrinsic_translation.flatten() + corners = (extrinsic_rotation @ corners.T).T + extrinsic_translation + + try: + # Project 3D corners to image coordinates + corners_img, _ = cv2.projectPoints( + corners.astype(np.float32), + np.zeros(3), + np.zeros(3), # No additional rotation/translation + camera_matrix.astype(np.float32), + None, # No distortion + ) + corners_img = corners_img.reshape(-1, 2).astype(int) + + # Check if corners are within image bounds + h, w = image.shape[:2] + valid_corners = ( + (corners_img[:, 0] >= 0) + & (corners_img[:, 0] < w) + 
& (corners_img[:, 1] >= 0) + & (corners_img[:, 1] < h) + ) + + if not np.any(valid_corners): + print("Warning: All cuboid corners are outside image bounds") + return image.copy() + + except Exception as e: + print(f"Warning: Failed to project cuboid corners: {e}") + return image.copy() + + # Define edges for wireframe visualization + edges = [ + # Bottom face + (0, 1), + (1, 5), + (5, 4), + (4, 0), + # Top face + (2, 3), + (3, 7), + (7, 6), + (6, 2), + # Vertical edges + (0, 2), + (1, 3), + (5, 7), + (4, 6), + ] + + # Draw edges + vis_img = image.copy() + for i, j in edges: + # Only draw edge if both corners are valid + if valid_corners[i] and valid_corners[j]: + cv2.line(vis_img, tuple(corners_img[i]), tuple(corners_img[j]), color, thickness) + + # Add dimension text if requested + if show_dimensions and np.any(valid_corners): + dims = cuboid_params["dimensions"] + dim_text = f"Dims: {dims[0]:.3f} x {dims[1]:.3f} x {dims[2]:.3f}" + + # Find a good position for text (top-left of image) + text_pos = (10, 30) + font_scale = 0.7 + + # Add background rectangle for better readability + text_size = cv2.getTextSize(dim_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 2)[0] + cv2.rectangle( + vis_img, + (text_pos[0] - 5, text_pos[1] - text_size[1] - 5), + (text_pos[0] + text_size[0] + 5, text_pos[1] + 5), + (0, 0, 0), + -1, + ) + + cv2.putText(vis_img, dim_text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, 2) + + return vis_img + + +def compute_cuboid_volume(cuboid_params: Dict) -> float: + """ + Compute the volume of a cuboid. + + Args: + cuboid_params: Dictionary containing cuboid parameters + + Returns: + Volume in cubic units + """ + if "dimensions" not in cuboid_params: + raise ValueError("cuboid_params must contain 'dimensions' key") + + dims = cuboid_params["dimensions"] + return float(np.prod(dims)) + + +def compute_cuboid_surface_area(cuboid_params: Dict) -> float: + """ + Compute the surface area of a cuboid. 
+ + Args: + cuboid_params: Dictionary containing cuboid parameters + + Returns: + Surface area in square units + """ + if "dimensions" not in cuboid_params: + raise ValueError("cuboid_params must contain 'dimensions' key") + + dims = cuboid_params["dimensions"] + return 2.0 * (dims[0] * dims[1] + dims[1] * dims[2] + dims[2] * dims[0]) + + +def check_cuboid_quality(cuboid_params: Dict, points: np.ndarray) -> Dict: + """ + Assess the quality of a cuboid fit. + + Args: + cuboid_params: Dictionary containing cuboid parameters + points: Original points used for fitting + + Returns: + Dictionary with quality metrics + """ + if len(points) == 0: + return {"error": "No points provided"} + + # Basic metrics + volume = compute_cuboid_volume(cuboid_params) + surface_area = compute_cuboid_surface_area(cuboid_params) + error = cuboid_params.get("error", 0.0) + + # Aspect ratio analysis + dims = cuboid_params["dimensions"] + aspect_ratios = [ + dims[0] / dims[1] if dims[1] > 0 else float("inf"), + dims[1] / dims[2] if dims[2] > 0 else float("inf"), + dims[2] / dims[0] if dims[0] > 0 else float("inf"), + ] + max_aspect_ratio = max(aspect_ratios) + + # Volume ratio (cuboid volume vs convex hull volume) + try: + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points) + hull, _ = pcd.compute_convex_hull() + hull_volume = hull.get_volume() + volume_ratio = volume / hull_volume if hull_volume > 0 else float("inf") + except: + volume_ratio = None + + return { + "fitting_error": error, + "volume": volume, + "surface_area": surface_area, + "max_aspect_ratio": max_aspect_ratio, + "volume_ratio": volume_ratio, + "num_points": len(points), + "method": cuboid_params.get("method", "unknown"), + } + + +# Backward compatibility +def visualize_fit(image, cuboid_params, camera_matrix, R=None, t=None): + """ + Legacy function for backward compatibility. + Use visualize_cuboid_on_image instead. 
+ """ + return visualize_cuboid_on_image( + image, cuboid_params, camera_matrix, R, t, show_dimensions=True + ) diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py new file mode 100644 index 0000000000..64fd8953a6 --- /dev/null +++ b/dimos/perception/pointcloud/pointcloud_filtering.py @@ -0,0 +1,674 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import cv2 +import os +import torch +import open3d as o3d +import argparse +import pickle +from typing import Dict, List, Optional, Union +import time +from dimos.types.manipulation import ObjectData +from dimos.types.vector import Vector +from dimos.perception.pointcloud.utils import ( + load_camera_matrix_from_yaml, + create_point_cloud_and_extract_masks, + o3d_point_cloud_to_numpy, +) +from dimos.perception.pointcloud.cuboid_fit import fit_cuboid + + +class PointcloudFiltering: + """ + A production-ready point cloud filtering pipeline for segmented objects. + + This class takes segmentation results and produces clean, filtered point clouds + for each object with consistent coloring and optional outlier removal. 
+ """ + + def __init__( + self, + color_intrinsics: Optional[Union[str, List[float], np.ndarray]] = None, + depth_intrinsics: Optional[Union[str, List[float], np.ndarray]] = None, + color_weight: float = 0.3, + enable_statistical_filtering: bool = True, + statistical_neighbors: int = 20, + statistical_std_ratio: float = 1.5, + enable_radius_filtering: bool = True, + radius_filtering_radius: float = 0.015, + radius_filtering_min_neighbors: int = 25, + enable_subsampling: bool = True, + voxel_size: float = 0.005, + max_num_objects: int = 10, + min_points_for_cuboid: int = 10, + cuboid_method: str = "oriented", + max_bbox_size_percent: float = 30.0, + ): + """ + Initialize the point cloud filtering pipeline. + + Args: + color_intrinsics: Camera intrinsics for color image + depth_intrinsics: Camera intrinsics for depth image + color_weight: Weight for blending generated color with original (0.0-1.0) + enable_statistical_filtering: Enable/disable statistical outlier filtering + statistical_neighbors: Number of neighbors for statistical filtering + statistical_std_ratio: Standard deviation ratio for statistical filtering + enable_radius_filtering: Enable/disable radius outlier filtering + radius_filtering_radius: Search radius for radius filtering (meters) + radius_filtering_min_neighbors: Min neighbors within radius + enable_subsampling: Enable/disable point cloud subsampling + voxel_size: Voxel size for downsampling (meters, when subsampling enabled) + max_num_objects: Maximum number of objects to process (top N by confidence) + min_points_for_cuboid: Minimum points required for cuboid fitting + cuboid_method: Method for cuboid fitting ('minimal', 'oriented', 'axis_aligned') + max_bbox_size_percent: Maximum percentage of image size for object bboxes (0-100) + + Raises: + ValueError: If invalid parameters are provided + """ + # Validate parameters + if not 0.0 <= color_weight <= 1.0: + raise ValueError(f"color_weight must be between 0.0 and 1.0, got {color_weight}") + 
if not 0.0 <= max_bbox_size_percent <= 100.0: + raise ValueError( + f"max_bbox_size_percent must be between 0.0 and 100.0, got {max_bbox_size_percent}" + ) + + # Store settings + self.color_weight = color_weight + self.enable_statistical_filtering = enable_statistical_filtering + self.statistical_neighbors = statistical_neighbors + self.statistical_std_ratio = statistical_std_ratio + self.enable_radius_filtering = enable_radius_filtering + self.radius_filtering_radius = radius_filtering_radius + self.radius_filtering_min_neighbors = radius_filtering_min_neighbors + self.enable_subsampling = enable_subsampling + self.voxel_size = voxel_size + self.max_num_objects = max_num_objects + self.min_points_for_cuboid = min_points_for_cuboid + self.cuboid_method = cuboid_method + self.max_bbox_size_percent = max_bbox_size_percent + + # Load camera matrices + self.color_camera_matrix = load_camera_matrix_from_yaml(color_intrinsics) + self.depth_camera_matrix = load_camera_matrix_from_yaml(depth_intrinsics) + + # Store the full point cloud + self.full_pcd = None + + def generate_color_from_id(self, object_id: int) -> np.ndarray: + """Generate a consistent color for a given object ID.""" + np.random.seed(object_id) + color = np.random.randint(0, 255, 3, dtype=np.uint8) + np.random.seed(None) + return color + + def _validate_inputs( + self, color_img: np.ndarray, depth_img: np.ndarray, objects: List[ObjectData] + ): + """Validate input parameters.""" + if color_img.shape[:2] != depth_img.shape: + raise ValueError("Color and depth image dimensions don't match") + + def _prepare_masks(self, masks: List[np.ndarray], target_shape: tuple) -> List[np.ndarray]: + """Prepare and validate masks to match target shape.""" + processed_masks = [] + for mask in masks: + # Convert mask to numpy if it's a tensor + if hasattr(mask, "cpu"): + mask = mask.cpu().numpy() + + mask = mask.astype(bool) + + # Handle shape mismatches + if mask.shape != target_shape: + if len(mask.shape) > 2: + mask = 
mask[:, :, 0] + + if mask.shape != target_shape: + mask = cv2.resize( + mask.astype(np.uint8), + (target_shape[1], target_shape[0]), + interpolation=cv2.INTER_NEAREST, + ).astype(bool) + + processed_masks.append(mask) + + return processed_masks + + def _apply_color_mask( + self, pcd: o3d.geometry.PointCloud, rgb_color: np.ndarray + ) -> o3d.geometry.PointCloud: + """Apply weighted color mask to point cloud.""" + if len(np.asarray(pcd.colors)) > 0: + original_colors = np.asarray(pcd.colors) + generated_color = rgb_color.astype(np.float32) / 255.0 + colored_mask = ( + 1.0 - self.color_weight + ) * original_colors + self.color_weight * generated_color + colored_mask = np.clip(colored_mask, 0.0, 1.0) + pcd.colors = o3d.utility.Vector3dVector(colored_mask) + return pcd + + def _apply_filtering(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: + """Apply optional filtering to point cloud based on enabled flags.""" + current_pcd = pcd + + # Apply statistical filtering if enabled + if self.enable_statistical_filtering: + current_pcd, _ = current_pcd.remove_statistical_outlier( + nb_neighbors=self.statistical_neighbors, std_ratio=self.statistical_std_ratio + ) + + # Apply radius filtering if enabled + if self.enable_radius_filtering: + current_pcd, _ = current_pcd.remove_radius_outlier( + nb_points=self.radius_filtering_min_neighbors, radius=self.radius_filtering_radius + ) + + return current_pcd + + def _apply_subsampling(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: + """Apply subsampling to limit point cloud size using Open3D's voxel downsampling.""" + if self.enable_subsampling: + return pcd.voxel_down_sample(self.voxel_size) + return pcd + + def _extract_masks_from_objects(self, objects: List[ObjectData]) -> List[np.ndarray]: + """Extract segmentation masks from ObjectData objects.""" + return [obj["segmentation_mask"] for obj in objects] + + def get_full_point_cloud(self) -> o3d.geometry.PointCloud: + """Get the full point cloud.""" + 
return self._apply_subsampling(self.full_pcd) + + def process_images( + self, color_img: np.ndarray, depth_img: np.ndarray, objects: List[ObjectData] + ) -> List[ObjectData]: + """ + Process color and depth images with object detection results to create filtered point clouds. + + Args: + color_img: RGB image as numpy array (H, W, 3) + depth_img: Depth image as numpy array (H, W) in meters + objects: List of ObjectData from object detection stream + + Returns: + List of updated ObjectData with pointcloud and 3D information. Each ObjectData + dictionary is enhanced with the following new fields: + + **3D Spatial Information** (added when sufficient points for cuboid fitting): + - "position": Vector(x, y, z) - 3D center position in world coordinates (meters) + - "rotation": Vector(roll, pitch, yaw) - 3D orientation as Euler angles (radians) + - "size": {"width": float, "height": float, "depth": float} - 3D bounding box dimensions (meters) + + **Point Cloud Data**: + - "point_cloud": o3d.geometry.PointCloud - Filtered Open3D point cloud with colors + - "color": np.ndarray - Consistent RGB color [R,G,B] (0-255) generated from object_id + + **Grasp Generation Arrays** (Dimensional grasp format): + - "point_cloud_numpy": np.ndarray - Nx3 XYZ coordinates as float32 (meters) + - "colors_numpy": np.ndarray - Nx3 RGB colors as float32 (0.0-1.0 range) + + Raises: + ValueError: If inputs are invalid + RuntimeError: If processing fails + """ + # Validate inputs + self._validate_inputs(color_img, depth_img, objects) + + if not objects: + return [] + + # Filter to top N objects by confidence + if len(objects) > self.max_num_objects: + # Sort objects by confidence (highest first), handle None confidences + sorted_objects = sorted( + objects, + key=lambda obj: obj.get("confidence", 0.0) + if obj.get("confidence") is not None + else 0.0, + reverse=True, + ) + objects = sorted_objects[: self.max_num_objects] + + # Filter out objects with bboxes too large + image_area = 
color_img.shape[0] * color_img.shape[1] + max_bbox_area = image_area * (self.max_bbox_size_percent / 100.0) + + filtered_objects = [] + for obj in objects: + if "bbox" in obj and obj["bbox"] is not None: + bbox = obj["bbox"] + # Calculate bbox area (assuming bbox format [x1, y1, x2, y2]) + bbox_area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if bbox_area <= max_bbox_area: + filtered_objects.append(obj) + else: + filtered_objects.append(obj) + + objects = filtered_objects + + # Extract masks from ObjectData + masks = self._extract_masks_from_objects(objects) + + # Prepare masks + processed_masks = self._prepare_masks(masks, depth_img.shape) + + # Create point clouds efficiently + self.full_pcd, masked_pcds = create_point_cloud_and_extract_masks( + color_img, depth_img, processed_masks, self.depth_camera_matrix, depth_scale=1.0 + ) + + # Process each object and update ObjectData + updated_objects = [] + + for i, (obj, mask, pcd) in enumerate(zip(objects, processed_masks, masked_pcds)): + # Skip empty point clouds + if len(np.asarray(pcd.points)) == 0: + continue + + # Create a copy of the object data to avoid modifying the original + updated_obj = obj.copy() + + # Generate consistent color + object_id = obj.get("object_id", i) + rgb_color = self.generate_color_from_id(object_id) + + # Apply color mask + pcd = self._apply_color_mask(pcd, rgb_color) + + # Apply subsampling to control point cloud size + pcd = self._apply_subsampling(pcd) + + # Apply filtering (optional based on flags) + pcd_filtered = self._apply_filtering(pcd) + + # Fit cuboid and extract 3D information + points = np.asarray(pcd_filtered.points) + if len(points) >= self.min_points_for_cuboid: + cuboid_params = fit_cuboid(points, method=self.cuboid_method) + if cuboid_params is not None: + # Update position, rotation, and size from cuboid + center = cuboid_params["center"] + dimensions = cuboid_params["dimensions"] + rotation_matrix = cuboid_params["rotation"] + + # Convert rotation matrix to euler 
angles (roll, pitch, yaw) + sy = np.sqrt( + rotation_matrix[0, 0] * rotation_matrix[0, 0] + + rotation_matrix[1, 0] * rotation_matrix[1, 0] + ) + singular = sy < 1e-6 + + if not singular: + roll = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) + pitch = np.arctan2(-rotation_matrix[2, 0], sy) + yaw = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) + else: + roll = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) + pitch = np.arctan2(-rotation_matrix[2, 0], sy) + yaw = 0 + + # Update position, rotation, and size from cuboid + updated_obj["position"] = Vector(center[0], center[1], center[2]) + updated_obj["rotation"] = Vector(roll, pitch, yaw) + updated_obj["size"] = { + "width": float(dimensions[0]), + "height": float(dimensions[1]), + "depth": float(dimensions[2]), + } + + # Add point cloud data to ObjectData + updated_obj["point_cloud"] = pcd_filtered + updated_obj["color"] = rgb_color + + # Extract numpy arrays for grasp generation + points_array = np.asarray(pcd_filtered.points).astype(np.float32) # Nx3 XYZ coordinates + if pcd_filtered.has_colors(): + colors_array = np.asarray(pcd_filtered.colors).astype( + np.float32 + ) # Nx3 RGB (0-1 range) + else: + # If no colors, create array of zeros + colors_array = np.zeros((len(points_array), 3), dtype=np.float32) + + updated_obj["point_cloud_numpy"] = points_array + updated_obj["colors_numpy"] = colors_array + + updated_objects.append(updated_obj) + + return updated_objects + + def cleanup(self): + """Clean up resources.""" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +def create_test_pipeline(data_dir: str) -> tuple: + """ + Create a test pipeline with default settings. 
def create_test_pipeline(data_dir: str) -> tuple:
    """
    Create a test pipeline with default settings.

    Args:
        data_dir: Directory containing camera info files

    Returns:
        Tuple of (filter_pipeline, color_info_path, depth_info_path)
    """
    color_info_path = os.path.join(data_dir, "color_camera_info.yaml")
    depth_info_path = os.path.join(data_dir, "depth_camera_info.yaml")

    # Default pipeline with subsampling disabled by default
    filter_pipeline = PointcloudFiltering(
        color_intrinsics=color_info_path,
        depth_intrinsics=depth_info_path,
    )

    return filter_pipeline, color_info_path, depth_info_path


def load_test_images(data_dir: str) -> tuple:
    """
    Load the first available test images from data directory.

    Args:
        data_dir: Directory containing color and depth subdirectories

    Returns:
        Tuple of (color_img, depth_img) — color as RGB, depth in meters

    Raises:
        FileNotFoundError: If no color/depth images exist or fail to load
    """

    def find_first_image(directory):
        """Find the first image file in the given directory (sorted order)."""
        if not os.path.exists(directory):
            return None

        image_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
        for filename in sorted(os.listdir(directory)):
            if any(filename.lower().endswith(ext) for ext in image_extensions):
                return os.path.join(directory, filename)
        return None

    color_dir = os.path.join(data_dir, "color")
    depth_dir = os.path.join(data_dir, "depth")

    color_img_path = find_first_image(color_dir)
    depth_img_path = find_first_image(depth_dir)

    if not color_img_path or not depth_img_path:
        raise FileNotFoundError(f"Could not find color or depth images in {data_dir}")

    # Load color image (OpenCV reads BGR; convert to RGB for the pipeline).
    color_img = cv2.imread(color_img_path)
    if color_img is None:
        raise FileNotFoundError(f"Could not load color image from {color_img_path}")
    color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB)

    # Load depth image unchanged so 16-bit data is preserved.
    depth_img = cv2.imread(depth_img_path, cv2.IMREAD_UNCHANGED)
    if depth_img is None:
        raise FileNotFoundError(f"Could not load depth image from {depth_img_path}")

    # uint16 depth is assumed to be in millimeters; convert to meters.
    if depth_img.dtype == np.uint16:
        depth_img = depth_img.astype(np.float32) / 1000.0

    return color_img, depth_img


def run_segmentation(color_img: np.ndarray, device: str = "auto") -> List[ObjectData]:
    """
    Run segmentation on color image and return ObjectData objects.

    Args:
        color_img: RGB color image
        device: Device to use ('auto', 'cuda', or 'cpu')

    Returns:
        List of ObjectData objects with segmentation results; 3D fields
        (depth/position/rotation/size) are placeholders to be populated by
        the pointcloud filtering stage.
    """
    if device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Import here to avoid circular imports
    from dimos.perception.segmentation import Sam2DSegmenter

    segmenter = Sam2DSegmenter(
        model_path="FastSAM-s.pt", device=device, use_tracker=False, use_analyzer=False
    )

    try:
        masks, bboxes, target_ids, probs, names = segmenter.process_image(np.array(color_img))

        # Build one ObjectData per detection, tolerating length mismatches
        # between the parallel result lists.
        objects = []
        for i in range(len(bboxes)):
            obj_data: ObjectData = {
                "object_id": target_ids[i] if i < len(target_ids) else i,
                "bbox": bboxes[i],
                "depth": -1.0,  # Will be populated by pointcloud filtering
                "confidence": probs[i] if i < len(probs) else 1.0,
                "class_id": i,
                "label": names[i] if i < len(names) else "",
                "segmentation_mask": masks[i].cpu().numpy()
                if hasattr(masks[i], "cpu")
                else masks[i],
                "position": Vector(0, 0, 0),  # Will be populated by pointcloud filtering
                "rotation": Vector(0, 0, 0),  # Will be populated by pointcloud filtering
                "size": {
                    "width": 0.0,
                    "height": 0.0,
                    "depth": 0.0,
                },  # Will be populated by pointcloud filtering
            }
            objects.append(obj_data)

        return objects

    finally:
        # Always release model resources, even if segmentation raised.
        segmenter.cleanup()


def visualize_results(objects: List[ObjectData]):
    """
    Visualize point cloud filtering results with 3D bounding boxes.

    Args:
        objects: List of ObjectData with point clouds
    """
    all_pcds = []

    for obj in objects:
        if "point_cloud" in obj and obj["point_cloud"] is not None:
            pcd = obj["point_cloud"]
            all_pcds.append(pcd)

            # Draw a 3D bounding box if position, rotation, and size are available.
            if (
                "position" in obj
                and "rotation" in obj
                and "size" in obj
                and obj["position"] is not None
                and obj["rotation"] is not None
                and obj["size"] is not None
            ):
                try:
                    position = obj["position"]
                    rotation = obj["rotation"]
                    size = obj["size"]

                    # Position may be a Vector or a plain dict.
                    if hasattr(position, "x"):  # Vector object
                        center = np.array([position.x, position.y, position.z])
                    else:  # Dictionary
                        center = np.array([position["x"], position["y"], position["z"]])

                    # Rotation may be a Vector (roll, pitch, yaw) or a dict.
                    if hasattr(rotation, "x"):  # Vector object (roll, pitch, yaw)
                        roll, pitch, yaw = rotation.x, rotation.y, rotation.z
                    else:  # Dictionary
                        roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"]

                    # Rotation matrix from ZYX Euler angles:
                    # R = Rz(yaw) @ Ry(pitch) @ Rx(roll)
                    cos_r, sin_r = np.cos(roll), np.sin(roll)
                    cos_p, sin_p = np.cos(pitch), np.sin(pitch)
                    cos_y, sin_y = np.cos(yaw), np.sin(yaw)

                    R = np.array(
                        [
                            [
                                cos_y * cos_p,
                                cos_y * sin_p * sin_r - sin_y * cos_r,
                                cos_y * sin_p * cos_r + sin_y * sin_r,
                            ],
                            [
                                sin_y * cos_p,
                                sin_y * sin_p * sin_r + cos_y * cos_r,
                                sin_y * sin_p * cos_r - cos_y * sin_r,
                            ],
                            [-sin_p, cos_p * sin_r, cos_p * cos_r],
                        ]
                    )

                    # Box extents (fall back to 10 cm per side if missing).
                    width = size.get("width", 0.1)
                    height = size.get("height", 0.1)
                    depth = size.get("depth", 0.1)
                    extent = np.array([width, height, depth])

                    obb = o3d.geometry.OrientedBoundingBox(center=center, R=R, extent=extent)
                    obb.color = [1, 0, 0]  # Red bounding boxes
                    all_pcds.append(obb)

                except Exception as e:
                    print(
                        f"Warning: Failed to create bounding box for object {obj.get('object_id', 'unknown')}: {e}"
                    )

    # Add coordinate frame
    coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1)
    all_pcds.append(coordinate_frame)

    # Visualize
    if all_pcds:
        o3d.visualization.draw_geometries(
            all_pcds,
            window_name="Filtered Point Clouds with 3D Bounding Boxes",
            width=1280,
            height=720,
        )


def main():
    """Main function to demonstrate the PointcloudFiltering pipeline."""
    parser = argparse.ArgumentParser(description="Point cloud filtering pipeline demonstration")
    parser.add_argument(
        "--save-pickle",
        type=str,
        help="Save generated ObjectData to pickle file (provide filename)",
    )
    parser.add_argument(
        "--data-dir", type=str, help="Directory containing RGBD data (default: auto-detect)"
    )
    args = parser.parse_args()

    try:
        # Setup paths
        if args.data_dir:
            data_dir = args.data_dir
        else:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            dimos_dir = os.path.abspath(os.path.join(script_dir, "../../../"))
            data_dir = os.path.join(dimos_dir, "assets/rgbd_data")

        # Load test data
        print("Loading test images...")
        color_img, depth_img = load_test_images(data_dir)
        print(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}")

        # Run segmentation
        print("Running segmentation...")
        objects = run_segmentation(color_img)
        print(f"Found {len(objects)} objects")

        # Create filtering pipeline
        print("Creating filtering pipeline...")
        filter_pipeline, _, _ = create_test_pipeline(data_dir)

        # Process images
        print("Processing point clouds...")
        updated_objects = filter_pipeline.process_images(color_img, depth_img, objects)

        # Print results (was an f-string with no placeholders)
        print("Processing complete:")
        print(f"  Objects processed: {len(updated_objects)}/{len(objects)}")

        # Print per-object stats
        for i, obj in enumerate(updated_objects):
            if "point_cloud" in obj and obj["point_cloud"] is not None:
                num_points = len(np.asarray(obj["point_cloud"].points))
                position = obj.get("position", Vector(0, 0, 0))
                size = obj.get("size", {})
                print(f"  Object {i + 1} (ID: {obj['object_id']}): {num_points} points")
                print(f"    Position: ({position.x:.2f}, {position.y:.2f}, {position.z:.2f})")
                print(
                    f"    Size: {size.get('width', 0):.3f} x {size.get('height', 0):.3f} x {size.get('depth', 0):.3f}"
                )

        # Save to pickle file if requested
        if args.save_pickle:
            pickle_path = args.save_pickle
            if not pickle_path.endswith(".pkl"):
                pickle_path += ".pkl"

            print(f"Saving ObjectData to {pickle_path}...")

            # Create serializable objects (exclude Open3D point clouds)
            serializable_objects = []
            for obj in updated_objects:
                obj_copy = obj.copy()
                # Remove the Open3D point cloud object (can't be pickled)
                if "point_cloud" in obj_copy:
                    del obj_copy["point_cloud"]
                serializable_objects.append(obj_copy)

            with open(pickle_path, "wb") as f:
                pickle.dump(serializable_objects, f)

            print(f"Successfully saved {len(serializable_objects)} objects to {pickle_path}")
            print("To load: objects = pickle.load(open('filename.pkl', 'rb'))")
            print(
                "Note: Open3D point clouds excluded - use point_cloud_numpy and colors_numpy for processing"
            )

        # Visualize results
        print("Visualizing results...")
        visualize_results(updated_objects)

    except Exception as e:
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# File: dimos/perception/pointcloud/utils.py
# ---------------------------------------------------------------------------
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Point cloud utilities for RGBD data processing. + +This module provides efficient utilities for creating and manipulating point clouds +from RGBD images using Open3D. +""" + +import numpy as np +import yaml +import os +import cv2 +import open3d as o3d +from typing import List, Optional, Tuple, Union, Dict +from scipy.spatial import cKDTree + + +def depth_to_point_cloud(depth_image, camera_intrinsics, subsample_factor=4): + """ + Convert depth image to point cloud using camera intrinsics. + Subsamples points to reduce density. + + Args: + depth_image: HxW depth image in meters + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + subsample_factor: Factor to subsample points (higher = fewer points) + + Returns: + Nx3 array of 3D points + """ + # Filter out inf and nan values from depth image + depth_filtered = depth_image.copy() + + # Create mask for valid depth values (finite, positive, non-zero) + valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) + + # Set invalid values to 0 + depth_filtered[~valid_mask] = 0.0 + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + fx = camera_intrinsics[0, 0] + fy = camera_intrinsics[1, 1] + cx = camera_intrinsics[0, 2] + cy = camera_intrinsics[1, 2] + + # Create pixel coordinate grid + rows, cols = depth_filtered.shape + x_grid, y_grid = np.meshgrid( + np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor) + ) + + # Flatten grid and depth + x = 
x_grid.flatten() + y = y_grid.flatten() + z = depth_filtered[y_grid, x_grid].flatten() + + # Remove points with invalid depth (after filtering, this catches zeros) + valid = z > 0 + x = x[valid] + y = y[valid] + z = z[valid] + + # Convert to 3D points + X = (x - cx) * z / fx + Y = (y - cy) * z / fy + Z = z + + return np.column_stack([X, Y, Z]) + + +def load_camera_matrix_from_yaml( + camera_info: Optional[Union[str, List[float], np.ndarray, dict]], +) -> Optional[np.ndarray]: + """ + Load camera intrinsic matrix from various input formats. + + Args: + camera_info: Can be: + - Path to YAML file containing camera parameters + - List of [fx, fy, cx, cy] + - 3x3 numpy array (returned as-is) + - Dict with camera parameters + - None (returns None) + + Returns: + 3x3 camera intrinsic matrix or None if input is None + + Raises: + ValueError: If camera_info format is invalid or file cannot be read + FileNotFoundError: If YAML file path doesn't exist + """ + if camera_info is None: + return None + + # Handle case where camera_info is already a matrix + if isinstance(camera_info, np.ndarray) and camera_info.shape == (3, 3): + return camera_info.astype(np.float32) + + # Handle case where camera_info is [fx, fy, cx, cy] format + if isinstance(camera_info, list) and len(camera_info) == 4: + fx, fy, cx, cy = camera_info + return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) + + # Handle case where camera_info is a dict + if isinstance(camera_info, dict): + return _extract_matrix_from_dict(camera_info) + + # Handle case where camera_info is a path to a YAML file + if isinstance(camera_info, str): + if not os.path.isfile(camera_info): + raise FileNotFoundError(f"Camera info file not found: {camera_info}") + + try: + with open(camera_info, "r") as f: + data = yaml.safe_load(f) + return _extract_matrix_from_dict(data) + except Exception as e: + raise ValueError(f"Failed to read camera info from {camera_info}: {e}") + + raise ValueError( + f"Invalid camera_info 
format. Expected str, list, dict, or numpy array, got {type(camera_info)}" + ) + + +def _extract_matrix_from_dict(data: dict) -> np.ndarray: + """Extract camera matrix from dictionary with various formats.""" + # ROS format with 'K' field (most common) + if "K" in data: + k_data = data["K"] + if len(k_data) == 9: + return np.array(k_data, dtype=np.float32).reshape(3, 3) + + # Standard format with 'camera_matrix' + if "camera_matrix" in data: + if "data" in data["camera_matrix"]: + matrix_data = data["camera_matrix"]["data"] + if len(matrix_data) == 9: + return np.array(matrix_data, dtype=np.float32).reshape(3, 3) + + # Explicit intrinsics format + if all(k in data for k in ["fx", "fy", "cx", "cy"]): + fx, fy = float(data["fx"]), float(data["fy"]) + cx, cy = float(data["cx"]), float(data["cy"]) + return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) + + # Error case - provide helpful debug info + available_keys = list(data.keys()) + if "K" in data: + k_info = f"K field length: {len(data['K']) if hasattr(data['K'], '__len__') else 'unknown'}" + else: + k_info = "K field not found" + + raise ValueError( + f"Cannot extract camera matrix from data. " + f"Available keys: {available_keys}. {k_info}. " + f"Expected formats: 'K' (9 elements), 'camera_matrix.data' (9 elements), " + f"or individual 'fx', 'fy', 'cx', 'cy' fields." + ) + + +def create_o3d_point_cloud_from_rgbd( + color_img: np.ndarray, + depth_img: np.ndarray, + intrinsic: np.ndarray, + depth_scale: float = 1.0, + depth_trunc: float = 3.0, +) -> o3d.geometry.PointCloud: + """ + Create an Open3D point cloud from RGB and depth images. 
+ + Args: + color_img: RGB image as numpy array (H, W, 3) + depth_img: Depth image as numpy array (H, W) + intrinsic: Camera intrinsic matrix (3x3 numpy array) + depth_scale: Scale factor to convert depth to meters + depth_trunc: Maximum depth in meters + + Returns: + Open3D point cloud object + + Raises: + ValueError: If input dimensions are invalid + """ + # Validate inputs + if len(color_img.shape) != 3 or color_img.shape[2] != 3: + raise ValueError(f"color_img must be (H, W, 3), got {color_img.shape}") + if len(depth_img.shape) != 2: + raise ValueError(f"depth_img must be (H, W), got {depth_img.shape}") + if color_img.shape[:2] != depth_img.shape: + raise ValueError( + f"Color and depth image dimensions don't match: {color_img.shape[:2]} vs {depth_img.shape}" + ) + if intrinsic.shape != (3, 3): + raise ValueError(f"intrinsic must be (3, 3), got {intrinsic.shape}") + + # Convert to Open3D format + color_o3d = o3d.geometry.Image(color_img.astype(np.uint8)) + + # Filter out inf and nan values from depth image + depth_filtered = depth_img.copy() + + # Create mask for valid depth values (finite, positive, non-zero) + valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) + + # Set invalid values to 0 (which Open3D treats as no depth) + depth_filtered[~valid_mask] = 0.0 + + depth_o3d = o3d.geometry.Image(depth_filtered.astype(np.float32)) + + # Create Open3D intrinsic object + height, width = color_img.shape[:2] + fx, fy = intrinsic[0, 0], intrinsic[1, 1] + cx, cy = intrinsic[0, 2], intrinsic[1, 2] + intrinsic_o3d = o3d.camera.PinholeCameraIntrinsic( + width, + height, + fx, + fy, # fx, fy + cx, + cy, # cx, cy + ) + + # Create RGBD image + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, + depth_o3d, + depth_scale=depth_scale, + depth_trunc=depth_trunc, + convert_rgb_to_intensity=False, + ) + + # Create point cloud + pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic_o3d) + + return pcd + + +def 
o3d_point_cloud_to_numpy(pcd: o3d.geometry.PointCloud) -> np.ndarray: + """ + Convert Open3D point cloud to numpy array of XYZRGB points. + + Args: + pcd: Open3D point cloud object + + Returns: + Nx6 array of XYZRGB points (empty array if no points) + """ + points = np.asarray(pcd.points) + if len(points) == 0: + return np.zeros((0, 6), dtype=np.float32) + + # Get colors if available + if pcd.has_colors(): + colors = np.asarray(pcd.colors) * 255.0 # Convert from [0,1] to [0,255] + return np.column_stack([points, colors]).astype(np.float32) + else: + # No colors available, return points with zero colors + zeros = np.zeros((len(points), 3), dtype=np.float32) + return np.column_stack([points, zeros]).astype(np.float32) + + +def numpy_to_o3d_point_cloud(points_rgb: np.ndarray) -> o3d.geometry.PointCloud: + """ + Convert numpy array of XYZRGB points to Open3D point cloud. + + Args: + points_rgb: Nx6 array of XYZRGB points or Nx3 array of XYZ points + + Returns: + Open3D point cloud object + + Raises: + ValueError: If array shape is invalid + """ + if len(points_rgb) == 0: + return o3d.geometry.PointCloud() + + if points_rgb.shape[1] < 3: + raise ValueError( + f"points_rgb must have at least 3 columns (XYZ), got {points_rgb.shape[1]}" + ) + + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points_rgb[:, :3]) + + # Add colors if available + if points_rgb.shape[1] >= 6: + colors = points_rgb[:, 3:6] / 255.0 # Convert from [0,255] to [0,1] + colors = np.clip(colors, 0.0, 1.0) # Ensure valid range + pcd.colors = o3d.utility.Vector3dVector(colors) + + return pcd + + +def create_masked_point_cloud(color_img, depth_img, mask, intrinsic, depth_scale=1.0): + """ + Create a point cloud for a masked region of RGBD data using Open3D. 
+ + Args: + color_img: RGB image (H, W, 3) + depth_img: Depth image (H, W) + mask: Boolean mask of the same size as color_img and depth_img + intrinsic: Camera intrinsic matrix (3x3 numpy array) + depth_scale: Scale factor to convert depth to meters + + Returns: + Open3D point cloud object for the masked region + """ + # Filter out inf and nan values from depth image + depth_filtered = depth_img.copy() + + # Create mask for valid depth values (finite, positive, non-zero) + valid_depth_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) + + # Set invalid values to 0 + depth_filtered[~valid_depth_mask] = 0.0 + + # Create masked color and depth images + masked_color = color_img.copy() + masked_depth = depth_filtered.copy() + + # Apply mask + if not mask.shape[:2] == color_img.shape[:2]: + raise ValueError(f"Mask shape {mask.shape} doesn't match image shape {color_img.shape[:2]}") + + # Create a boolean mask that is properly expanded for the RGB channels + # For RGB image, we need to properly broadcast the mask to all 3 channels + if len(color_img.shape) == 3 and color_img.shape[2] == 3: + # Properly broadcast mask to match the RGB dimensions + mask_rgb = np.broadcast_to(mask[:, :, np.newaxis], color_img.shape) + masked_color[~mask_rgb] = 0 + else: + # For grayscale images + masked_color[~mask] = 0 + + # Apply mask to depth image + masked_depth[~mask] = 0 + + # Create point cloud + pcd = create_o3d_point_cloud_from_rgbd(masked_color, masked_depth, intrinsic, depth_scale) + + # Remove points with coordinates at origin (0,0,0) which are likely from masked out regions + points = np.asarray(pcd.points) + if len(points) > 0: + # Find points that are not at origin + dist_from_origin = np.sum(points**2, axis=1) + valid_indices = dist_from_origin > 1e-6 + + # Filter points and colors + pcd = pcd.select_by_index(np.where(valid_indices)[0]) + + return pcd + + +def create_point_cloud_and_extract_masks( + color_img: np.ndarray, + depth_img: np.ndarray, + masks: 
List[np.ndarray], + intrinsic: np.ndarray, + depth_scale: float = 1.0, + depth_trunc: float = 3.0, +) -> Tuple[o3d.geometry.PointCloud, List[o3d.geometry.PointCloud]]: + """ + Efficiently create a point cloud once and extract multiple masked regions. + + Args: + color_img: RGB image (H, W, 3) + depth_img: Depth image (H, W) + masks: List of boolean masks, each of shape (H, W) + intrinsic: Camera intrinsic matrix (3x3 numpy array) + depth_scale: Scale factor to convert depth to meters + depth_trunc: Maximum depth in meters + + Returns: + Tuple of (full_point_cloud, list_of_masked_point_clouds) + """ + if not masks: + return o3d.geometry.PointCloud(), [] + + # Create the full point cloud + full_pcd = create_o3d_point_cloud_from_rgbd( + color_img, depth_img, intrinsic, depth_scale, depth_trunc + ) + + if len(np.asarray(full_pcd.points)) == 0: + return full_pcd, [o3d.geometry.PointCloud() for _ in masks] + + # Create pixel-to-point mapping + valid_depth_mask = np.isfinite(depth_img) & (depth_img > 0) & (depth_img <= depth_trunc) + + valid_depth = valid_depth_mask.flatten() + if not np.any(valid_depth): + return full_pcd, [o3d.geometry.PointCloud() for _ in masks] + + pixel_to_point = np.full(len(valid_depth), -1, dtype=np.int32) + pixel_to_point[valid_depth] = np.arange(np.sum(valid_depth)) + + # Extract point clouds for each mask + masked_pcds = [] + max_points = len(np.asarray(full_pcd.points)) + + for mask in masks: + if mask.shape != depth_img.shape: + masked_pcds.append(o3d.geometry.PointCloud()) + continue + + mask_flat = mask.flatten() + valid_mask_indices = mask_flat & valid_depth + point_indices = pixel_to_point[valid_mask_indices] + valid_point_indices = point_indices[point_indices >= 0] + + if len(valid_point_indices) > 0: + valid_point_indices = np.clip(valid_point_indices, 0, max_points - 1) + valid_point_indices = np.unique(valid_point_indices) + masked_pcd = full_pcd.select_by_index(valid_point_indices.tolist()) + else: + masked_pcd = 
o3d.geometry.PointCloud() + + masked_pcds.append(masked_pcd) + + return full_pcd, masked_pcds + + +def extract_masked_point_cloud_efficient( + full_pcd: o3d.geometry.PointCloud, depth_img: np.ndarray, mask: np.ndarray +) -> o3d.geometry.PointCloud: + """ + Extract a masked region from an existing point cloud efficiently. + + This assumes the point cloud was created from the given depth image. + Use this when you have a pre-computed full point cloud and want to extract + individual masked regions. + + Args: + full_pcd: Complete Open3D point cloud + depth_img: Depth image used to create the point cloud (H, W) + mask: Boolean mask (H, W) + + Returns: + Open3D point cloud for the masked region + + Raises: + ValueError: If mask shape doesn't match depth image + """ + if mask.shape != depth_img.shape: + raise ValueError( + f"Mask shape {mask.shape} doesn't match depth image shape {depth_img.shape}" + ) + + # Early return if no points in full point cloud + if len(np.asarray(full_pcd.points)) == 0: + return o3d.geometry.PointCloud() + + # Get valid depth mask + valid_depth = depth_img.flatten() > 0 + mask_flat = mask.flatten() + + # Find pixels that are both valid and in the mask + valid_mask_indices = mask_flat & valid_depth + + # Get indices of valid points + point_indices = np.where(valid_mask_indices[valid_depth])[0] + + # Extract the masked point cloud + if len(point_indices) > 0: + return full_pcd.select_by_index(point_indices) + else: + return o3d.geometry.PointCloud() + + +def segment_and_remove_plane(pcd, distance_threshold=0.02, ransac_n=3, num_iterations=1000): + """ + Segment the dominant plane from a point cloud using RANSAC and remove it. + Often used to remove table tops, floors, walls, or other planar surfaces. 
+ + Args: + pcd: Open3D point cloud object + distance_threshold: Maximum distance a point can be from the plane to be considered an inlier (in meters) + ransac_n: Number of points to sample for each RANSAC iteration + num_iterations: Number of RANSAC iterations + + Returns: + Open3D point cloud with the dominant plane removed + """ + # Make a copy of the input point cloud to avoid modifying the original + pcd_filtered = o3d.geometry.PointCloud() + pcd_filtered.points = o3d.utility.Vector3dVector(np.asarray(pcd.points)) + if pcd.has_colors(): + pcd_filtered.colors = o3d.utility.Vector3dVector(np.asarray(pcd.colors)) + if pcd.has_normals(): + pcd_filtered.normals = o3d.utility.Vector3dVector(np.asarray(pcd.normals)) + + # Check if point cloud has enough points + if len(pcd_filtered.points) < ransac_n: + return pcd_filtered + + # Run RANSAC to find the largest plane + _, inliers = pcd_filtered.segment_plane( + distance_threshold=distance_threshold, ransac_n=ransac_n, num_iterations=num_iterations + ) + + # Remove the dominant plane (regardless of orientation) + pcd_without_dominant_plane = pcd_filtered.select_by_index(inliers, invert=True) + return pcd_without_dominant_plane + + +def filter_point_cloud_statistical( + pcd: o3d.geometry.PointCloud, nb_neighbors: int = 20, std_ratio: float = 2.0 +) -> Tuple[o3d.geometry.PointCloud, np.ndarray]: + """ + Apply statistical outlier filtering to point cloud. 
+ + Args: + pcd: Input point cloud + nb_neighbors: Number of neighbors to analyze for each point + std_ratio: Threshold level based on standard deviation + + Returns: + Tuple of (filtered_point_cloud, outlier_indices) + """ + if len(np.asarray(pcd.points)) == 0: + return pcd, np.array([]) + + return pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio) + + +def filter_point_cloud_radius( + pcd: o3d.geometry.PointCloud, nb_points: int = 16, radius: float = 0.05 +) -> Tuple[o3d.geometry.PointCloud, np.ndarray]: + """ + Apply radius-based outlier filtering to point cloud. + + Args: + pcd: Input point cloud + nb_points: Minimum number of points within radius + radius: Search radius in meters + + Returns: + Tuple of (filtered_point_cloud, outlier_indices) + """ + if len(np.asarray(pcd.points)) == 0: + return pcd, np.array([]) + + return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius) + + +def compute_point_cloud_bounds(pcd: o3d.geometry.PointCloud) -> dict: + """ + Compute bounding box information for a point cloud. + + Args: + pcd: Input point cloud + + Returns: + Dictionary with bounds information + """ + points = np.asarray(pcd.points) + if len(points) == 0: + return { + "min": np.array([0, 0, 0]), + "max": np.array([0, 0, 0]), + "center": np.array([0, 0, 0]), + "size": np.array([0, 0, 0]), + "volume": 0.0, + } + + min_bound = points.min(axis=0) + max_bound = points.max(axis=0) + center = (min_bound + max_bound) / 2 + size = max_bound - min_bound + volume = np.prod(size) + + return {"min": min_bound, "max": max_bound, "center": center, "size": size, "volume": volume} + + +def project_3d_points_to_2d( + points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] +) -> np.ndarray: + """ + Project 3D points to 2D image coordinates using camera intrinsics. 
+ + Args: + points_3d: Nx3 array of 3D points (X, Y, Z) + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + + Returns: + Nx2 array of 2D image coordinates (u, v) + """ + if len(points_3d) == 0: + return np.zeros((0, 2), dtype=np.int32) + + # Filter out points with zero or negative depth + valid_mask = points_3d[:, 2] > 0 + if not np.any(valid_mask): + return np.zeros((0, 2), dtype=np.int32) + + valid_points = points_3d[valid_mask] + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + fx = camera_intrinsics[0, 0] + fy = camera_intrinsics[1, 1] + cx = camera_intrinsics[0, 2] + cy = camera_intrinsics[1, 2] + + # Project to image coordinates + u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx + v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy + + # Round to integer pixel coordinates + points_2d = np.column_stack([u, v]).astype(np.int32) + + return points_2d + + +def overlay_point_clouds_on_image( + base_image: np.ndarray, + point_clouds: List[o3d.geometry.PointCloud], + camera_intrinsics: Union[List[float], np.ndarray], + colors: List[Tuple[int, int, int]], + point_size: int = 2, + alpha: float = 0.7, +) -> np.ndarray: + """ + Overlay multiple colored point clouds onto an image. + + Args: + base_image: Base image to overlay onto (H, W, 3) - assumed to be RGB + point_clouds: List of Open3D point cloud objects + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + colors: List of RGB color tuples for each point cloud. If None, generates distinct colors. 
+ point_size: Size of points to draw (in pixels) + alpha: Blending factor for overlay (0.0 = fully transparent, 1.0 = fully opaque) + + Returns: + Image with overlaid point clouds (H, W, 3) + """ + if len(point_clouds) == 0: + return base_image.copy() + + # Create overlay image + overlay = base_image.copy() + height, width = base_image.shape[:2] + + # Process each point cloud + for i, pcd in enumerate(point_clouds): + if pcd is None: + continue + + points_3d = np.asarray(pcd.points) + if len(points_3d) == 0: + continue + + # Project 3D points to 2D + points_2d = project_3d_points_to_2d(points_3d, camera_intrinsics) + + if len(points_2d) == 0: + continue + + # Filter points within image bounds + valid_mask = ( + (points_2d[:, 0] >= 0) + & (points_2d[:, 0] < width) + & (points_2d[:, 1] >= 0) + & (points_2d[:, 1] < height) + ) + valid_points_2d = points_2d[valid_mask] + + if len(valid_points_2d) == 0: + continue + + # Get color for this point cloud + color = colors[i % len(colors)] + + # Ensure color is a tuple of integers for OpenCV + if isinstance(color, (list, tuple, np.ndarray)): + color = tuple(int(c) for c in color[:3]) + else: + color = (255, 255, 255) + + # Draw points on overlay + for point in valid_points_2d: + u, v = point + # Draw a small filled circle for each point + cv2.circle(overlay, (u, v), point_size, color, -1) + + # Blend overlay with base image + result = cv2.addWeighted(base_image, 1 - alpha, overlay, alpha, 0) + + return result + + +def create_point_cloud_overlay_visualization( + base_image: np.ndarray, + objects: List[dict], + intrinsics: np.ndarray, +) -> np.ndarray: + """ + Create a visualization showing object point clouds and bounding boxes overlaid on a base image. 
+ + Args: + base_image: Base image to overlay onto (H, W, 3) + objects: List of object dictionaries containing 'point_cloud', 'color', 'position', 'rotation', 'size' keys + intrinsics: Camera intrinsics as [fx, fy, cx, cy] or 3x3 matrix + + Returns: + Visualization image with overlaid point clouds and bounding boxes (H, W, 3) + """ + # Extract point clouds and colors from objects + point_clouds = [] + colors = [] + for obj in objects: + if "point_cloud" in obj and obj["point_cloud"] is not None: + point_clouds.append(obj["point_cloud"]) + + # Convert color to tuple + color = obj["color"] + if isinstance(color, np.ndarray): + color = tuple(int(c) for c in color) + elif isinstance(color, (list, tuple)): + color = tuple(int(c) for c in color[:3]) + colors.append(color) + + # Create visualization + if point_clouds: + result = overlay_point_clouds_on_image( + base_image=base_image, + point_clouds=point_clouds, + camera_intrinsics=intrinsics, + colors=colors, + point_size=3, + alpha=0.8, + ) + else: + result = base_image.copy() + + # Draw 3D bounding boxes + height_img, width_img = result.shape[:2] + for i, obj in enumerate(objects): + if all(key in obj and obj[key] is not None for key in ["position", "rotation", "size"]): + try: + # Create and project 3D bounding box + corners_3d = create_3d_bounding_box_corners( + obj["position"], obj["rotation"], obj["size"] + ) + corners_2d = project_3d_points_to_2d(corners_3d, intrinsics) + + # Check if any corners are visible + valid_mask = ( + (corners_2d[:, 0] >= 0) + & (corners_2d[:, 0] < width_img) + & (corners_2d[:, 1] >= 0) + & (corners_2d[:, 1] < height_img) + ) + + if np.any(valid_mask): + # Get color + bbox_color = colors[i] if i < len(colors) else (255, 255, 255) + draw_3d_bounding_box_on_image(result, corners_2d, bbox_color, thickness=2) + except: + continue + + return result + + +def create_3d_bounding_box_corners(position, rotation, size): + """ + Create 8 corners of a 3D bounding box from position, rotation, and size. 
+ + Args: + position: Vector or dict with x, y, z coordinates + rotation: Vector or dict with roll, pitch, yaw angles + size: Dict with width, height, depth + + Returns: + 8x3 numpy array of corner coordinates + """ + # Convert position to numpy array + if hasattr(position, "x"): # Vector object + center = np.array([position.x, position.y, position.z]) + else: # Dictionary + center = np.array([position["x"], position["y"], position["z"]]) + + # Convert rotation (euler angles) to rotation matrix + if hasattr(rotation, "x"): # Vector object (roll, pitch, yaw) + roll, pitch, yaw = rotation.x, rotation.y, rotation.z + else: # Dictionary + roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"] + + # Create rotation matrix from euler angles (ZYX order) + cos_r, sin_r = np.cos(roll), np.sin(roll) + cos_p, sin_p = np.cos(pitch), np.sin(pitch) + cos_y, sin_y = np.cos(yaw), np.sin(yaw) + + # Rotation matrix for ZYX euler angles + R = np.array( + [ + [ + cos_y * cos_p, + cos_y * sin_p * sin_r - sin_y * cos_r, + cos_y * sin_p * cos_r + sin_y * sin_r, + ], + [ + sin_y * cos_p, + sin_y * sin_p * sin_r + cos_y * cos_r, + sin_y * sin_p * cos_r - cos_y * sin_r, + ], + [-sin_p, cos_p * sin_r, cos_p * cos_r], + ] + ) + + # Get dimensions + width = size.get("width", 0.1) + height = size.get("height", 0.1) + depth = size.get("depth", 0.1) + + # Create 8 corners of the bounding box (before rotation) + corners = np.array( + [ + [-width / 2, -height / 2, -depth / 2], # 0 + [width / 2, -height / 2, -depth / 2], # 1 + [width / 2, height / 2, -depth / 2], # 2 + [-width / 2, height / 2, -depth / 2], # 3 + [-width / 2, -height / 2, depth / 2], # 4 + [width / 2, -height / 2, depth / 2], # 5 + [width / 2, height / 2, depth / 2], # 6 + [-width / 2, height / 2, depth / 2], # 7 + ] + ) + + # Apply rotation and translation + rotated_corners = corners @ R.T + center + + return rotated_corners + + +def draw_3d_bounding_box_on_image(image, corners_2d, color, thickness=2): + """ + Draw 
a 3D bounding box on an image using projected 2D corners. + + Args: + image: Image to draw on + corners_2d: 8x2 array of 2D corner coordinates + color: RGB color tuple + thickness: Line thickness + """ + # Define the 12 edges of a cube (connecting corner indices) + edges = [ + (0, 1), + (1, 2), + (2, 3), + (3, 0), # Bottom face + (4, 5), + (5, 6), + (6, 7), + (7, 4), # Top face + (0, 4), + (1, 5), + (2, 6), + (3, 7), # Vertical edges + ] + + # Draw each edge + for start_idx, end_idx in edges: + start_point = tuple(corners_2d[start_idx].astype(int)) + end_point = tuple(corners_2d[end_idx].astype(int)) + cv2.line(image, start_point, end_point, color, thickness) + + +def extract_and_cluster_misc_points( + full_pcd: o3d.geometry.PointCloud, + all_objects: List[dict], + eps: float = 0.03, + min_points: int = 100, + enable_filtering: bool = True, + voxel_size: float = 0.02, +) -> Tuple[List[o3d.geometry.PointCloud], o3d.geometry.VoxelGrid]: + """ + Extract miscellaneous/background points and cluster them using DBSCAN. 
+ + Args: + full_pcd: Complete scene point cloud + all_objects: List of objects with point clouds to subtract + eps: DBSCAN epsilon parameter (max distance between points in cluster) + min_points: DBSCAN min_samples parameter (min points to form cluster) + enable_filtering: Whether to apply statistical and radius filtering + voxel_size: Size of voxels for voxel grid generation + + Returns: + Tuple of (clustered_point_clouds, voxel_grid) + """ + if full_pcd is None or len(np.asarray(full_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + if not all_objects: + # If no objects detected, cluster the full point cloud + clusters = _cluster_point_cloud_dbscan(full_pcd, eps, min_points) + voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) + return clusters, voxel_grid + + try: + # Start with a copy of the full point cloud + misc_pcd = o3d.geometry.PointCloud(full_pcd) + + # Remove object points by combining all object point clouds + all_object_points = [] + for obj in all_objects: + if "point_cloud" in obj and obj["point_cloud"] is not None: + obj_points = np.asarray(obj["point_cloud"].points) + if len(obj_points) > 0: + all_object_points.append(obj_points) + + if not all_object_points: + # No object points to remove, cluster full point cloud + clusters = _cluster_point_cloud_dbscan(misc_pcd, eps, min_points) + voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) + return clusters, voxel_grid + + # Combine all object points + combined_obj_points = np.vstack(all_object_points) + + # For efficiency, downsample both point clouds + misc_downsampled = misc_pcd.voxel_down_sample(voxel_size=0.005) + + # Create object point cloud for efficient operations + obj_pcd = o3d.geometry.PointCloud() + obj_pcd.points = o3d.utility.Vector3dVector(combined_obj_points) + obj_downsampled = obj_pcd.voxel_down_sample(voxel_size=0.005) + + misc_points = np.asarray(misc_downsampled.points) + obj_points_down = np.asarray(obj_downsampled.points) + + if 
len(misc_points) == 0 or len(obj_points_down) == 0: + clusters = _cluster_point_cloud_dbscan(misc_downsampled, eps, min_points) + voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) + return clusters, voxel_grid + + # Build tree for object points + obj_tree = cKDTree(obj_points_down) + + # Find distances from misc points to nearest object points + distances, _ = obj_tree.query(misc_points, k=1) + + # Keep points that are far enough from any object point + threshold = 0.015 # 1.5cm threshold + keep_mask = distances > threshold + + if not np.any(keep_mask): + return [], o3d.geometry.VoxelGrid() + + # Filter misc points + misc_indices = np.where(keep_mask)[0] + final_misc_pcd = misc_downsampled.select_by_index(misc_indices) + + if len(np.asarray(final_misc_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + # Apply additional filtering if enabled + if enable_filtering: + # Apply statistical outlier filtering + filtered_misc_pcd, _ = filter_point_cloud_statistical( + final_misc_pcd, nb_neighbors=30, std_ratio=2.0 + ) + + if len(np.asarray(filtered_misc_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + # Apply radius outlier filtering + final_filtered_misc_pcd, _ = filter_point_cloud_radius( + filtered_misc_pcd, + nb_points=20, + radius=0.03, # 3cm radius + ) + + if len(np.asarray(final_filtered_misc_pcd.points)) == 0: + return [], o3d.geometry.VoxelGrid() + + final_misc_pcd = final_filtered_misc_pcd + + # Cluster the misc points using DBSCAN + clusters = _cluster_point_cloud_dbscan(final_misc_pcd, eps, min_points) + + # Create voxel grid from all misc points (before clustering) + voxel_grid = _create_voxel_grid_from_point_cloud(final_misc_pcd, voxel_size) + + return clusters, voxel_grid + + except Exception as e: + print(f"Error in misc point extraction and clustering: {e}") + # Fallback: return downsampled full point cloud as single cluster + try: + downsampled = full_pcd.voxel_down_sample(voxel_size=0.02) + if 
len(np.asarray(downsampled.points)) > 0: + voxel_grid = _create_voxel_grid_from_point_cloud(downsampled, voxel_size) + return [downsampled], voxel_grid + else: + return [], o3d.geometry.VoxelGrid() + except: + return [], o3d.geometry.VoxelGrid() + + +def _create_voxel_grid_from_point_cloud( + pcd: o3d.geometry.PointCloud, voxel_size: float = 0.02 +) -> o3d.geometry.VoxelGrid: + """ + Create a voxel grid from a point cloud. + + Args: + pcd: Input point cloud + voxel_size: Size of each voxel + + Returns: + Open3D VoxelGrid object + """ + if len(np.asarray(pcd.points)) == 0: + return o3d.geometry.VoxelGrid() + + try: + # Create voxel grid from point cloud + voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size) + + # Color the voxels with a semi-transparent gray + for voxel in voxel_grid.get_voxels(): + voxel.color = [0.5, 0.5, 0.5] # Gray color + + print( + f"Created voxel grid with {len(voxel_grid.get_voxels())} voxels (voxel_size={voxel_size})" + ) + return voxel_grid + + except Exception as e: + print(f"Error creating voxel grid: {e}") + return o3d.geometry.VoxelGrid() + + +def _create_voxel_grid_from_clusters( + clusters: List[o3d.geometry.PointCloud], voxel_size: float = 0.02 +) -> o3d.geometry.VoxelGrid: + """ + Create a voxel grid from multiple clustered point clouds. 
+ + Args: + clusters: List of clustered point clouds + voxel_size: Size of each voxel + + Returns: + Open3D VoxelGrid object + """ + if not clusters: + return o3d.geometry.VoxelGrid() + + # Combine all clusters into one point cloud + combined_points = [] + for cluster in clusters: + points = np.asarray(cluster.points) + if len(points) > 0: + combined_points.append(points) + + if not combined_points: + return o3d.geometry.VoxelGrid() + + # Create combined point cloud + all_points = np.vstack(combined_points) + combined_pcd = o3d.geometry.PointCloud() + combined_pcd.points = o3d.utility.Vector3dVector(all_points) + + return _create_voxel_grid_from_point_cloud(combined_pcd, voxel_size) + + +def _cluster_point_cloud_dbscan( + pcd: o3d.geometry.PointCloud, eps: float = 0.05, min_points: int = 50 +) -> List[o3d.geometry.PointCloud]: + """ + Cluster a point cloud using DBSCAN and return list of clustered point clouds. + + Args: + pcd: Point cloud to cluster + eps: DBSCAN epsilon parameter + min_points: DBSCAN min_samples parameter + + Returns: + List of point clouds, one for each cluster + """ + if len(np.asarray(pcd.points)) == 0: + return [] + + try: + # Apply DBSCAN clustering + labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_points)) + + # Get unique cluster labels (excluding noise points labeled as -1) + unique_labels = np.unique(labels) + cluster_pcds = [] + + for label in unique_labels: + if label == -1: # Skip noise points + continue + + # Get indices for this cluster + cluster_indices = np.where(labels == label)[0] + + if len(cluster_indices) > 0: + # Create point cloud for this cluster + cluster_pcd = pcd.select_by_index(cluster_indices) + + # Assign a random color to this cluster + cluster_color = np.random.rand(3) # Random RGB color + cluster_pcd.paint_uniform_color(cluster_color) + + cluster_pcds.append(cluster_pcd) + + print( + f"DBSCAN clustering found {len(cluster_pcds)} clusters from {len(np.asarray(pcd.points))} points" + ) + return 
cluster_pcds + + except Exception as e: + print(f"Error in DBSCAN clustering: {e}") + return [pcd] # Return original point cloud as fallback + + +def get_standard_coordinate_transform(): + """ + Get a standard coordinate transformation matrix for consistent visualization. + + This transformation ensures that: + - X (red) axis points right + - Y (green) axis points up + - Z (blue) axis points toward viewer + + Returns: + 4x4 transformation matrix + """ + # Standard transformation matrix to ensure consistent coordinate frame orientation + transform = np.array( + [ + [1, 0, 0, 0], # X points right + [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) + [0, 0, -1, 0], # Z points toward viewer (flip depth) + [0, 0, 0, 1], + ] + ) + return transform + + +def visualize_clustered_point_clouds( + clustered_pcds: List[o3d.geometry.PointCloud], + window_name: str = "Clustered Point Clouds", + point_size: float = 2.0, + show_coordinate_frame: bool = True, + coordinate_frame_size: float = 0.1, +) -> None: + """ + Visualize multiple clustered point clouds with different colors. 
+ + Args: + clustered_pcds: List of point clouds (already colored) + window_name: Name of the visualization window + point_size: Size of points in the visualization + show_coordinate_frame: Whether to show coordinate frame + coordinate_frame_size: Size of the coordinate frame + """ + if not clustered_pcds: + print("Warning: No clustered point clouds to visualize") + return + + # Apply standard coordinate transformation + transform = get_standard_coordinate_transform() + geometries = [] + for pcd in clustered_pcds: + pcd_copy = o3d.geometry.PointCloud(pcd) + pcd_copy.transform(transform) + geometries.append(pcd_copy) + + # Add coordinate frame + if show_coordinate_frame: + coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=coordinate_frame_size + ) + coordinate_frame.transform(transform) + geometries.append(coordinate_frame) + + total_points = sum(len(np.asarray(pcd.points)) for pcd in clustered_pcds) + print(f"Visualizing {len(clustered_pcds)} clusters with {total_points} total points") + + try: + vis = o3d.visualization.Visualizer() + vis.create_window(window_name=window_name, width=1280, height=720) + for geom in geometries: + vis.add_geometry(geom) + render_option = vis.get_render_option() + render_option.point_size = point_size + vis.run() + vis.destroy_window() + except Exception as e: + print(f"Failed to create interactive visualization: {e}") + o3d.visualization.draw_geometries( + geometries, window_name=window_name, width=1280, height=720 + ) + + +def visualize_pcd( + pcd: o3d.geometry.PointCloud, + window_name: str = "Point Cloud Visualization", + point_size: float = 1.0, + show_coordinate_frame: bool = True, + coordinate_frame_size: float = 0.1, +) -> None: + """ + Visualize an Open3D point cloud using Open3D's visualization window. 
+ + Args: + pcd: Open3D point cloud to visualize + window_name: Name of the visualization window + point_size: Size of points in the visualization + show_coordinate_frame: Whether to show coordinate frame + coordinate_frame_size: Size of the coordinate frame + """ + if pcd is None: + print("Warning: Point cloud is None, nothing to visualize") + return + + if len(np.asarray(pcd.points)) == 0: + print("Warning: Point cloud is empty, nothing to visualize") + return + + # Apply standard coordinate transformation + transform = get_standard_coordinate_transform() + pcd_copy = o3d.geometry.PointCloud(pcd) + pcd_copy.transform(transform) + geometries = [pcd_copy] + + # Add coordinate frame + if show_coordinate_frame: + coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=coordinate_frame_size + ) + coordinate_frame.transform(transform) + geometries.append(coordinate_frame) + + print(f"Visualizing point cloud with {len(np.asarray(pcd.points))} points") + + try: + vis = o3d.visualization.Visualizer() + vis.create_window(window_name=window_name, width=1280, height=720) + for geom in geometries: + vis.add_geometry(geom) + render_option = vis.get_render_option() + render_option.point_size = point_size + vis.run() + vis.destroy_window() + except Exception as e: + print(f"Failed to create interactive visualization: {e}") + o3d.visualization.draw_geometries( + geometries, window_name=window_name, width=1280, height=720 + ) + + +def visualize_voxel_grid( + voxel_grid: o3d.geometry.VoxelGrid, + window_name: str = "Voxel Grid Visualization", + show_coordinate_frame: bool = True, + coordinate_frame_size: float = 0.1, +) -> None: + """ + Visualize an Open3D voxel grid using Open3D's visualization window. 
+ + Args: + voxel_grid: Open3D voxel grid to visualize + window_name: Name of the visualization window + show_coordinate_frame: Whether to show coordinate frame + coordinate_frame_size: Size of the coordinate frame + """ + if voxel_grid is None: + print("Warning: Voxel grid is None, nothing to visualize") + return + + if len(voxel_grid.get_voxels()) == 0: + print("Warning: Voxel grid is empty, nothing to visualize") + return + + # VoxelGrid doesn't support transform, so we need to transform the source points instead + # For now, just visualize as-is with transformed coordinate frame + geometries = [voxel_grid] + + # Add coordinate frame + if show_coordinate_frame: + coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( + size=coordinate_frame_size + ) + coordinate_frame.transform(get_standard_coordinate_transform()) + geometries.append(coordinate_frame) + + print(f"Visualizing voxel grid with {len(voxel_grid.get_voxels())} voxels") + + try: + vis = o3d.visualization.Visualizer() + vis.create_window(window_name=window_name, width=1280, height=720) + for geom in geometries: + vis.add_geometry(geom) + vis.run() + vis.destroy_window() + except Exception as e: + print(f"Failed to create interactive visualization: {e}") + o3d.visualization.draw_geometries( + geometries, window_name=window_name, width=1280, height=720 + ) + + +def combine_object_pointclouds( + point_clouds: Union[List[np.ndarray], List[o3d.geometry.PointCloud]], + colors: Optional[List[np.ndarray]] = None, +) -> o3d.geometry.PointCloud: + """ + Combine multiple point clouds into a single Open3D point cloud. 
+
+    Args:
+        point_clouds: List of point clouds as numpy arrays or Open3D point clouds
+        colors: List of colors as numpy arrays
+    Returns:
+        Combined Open3D point cloud
+    """
+    all_points = []
+    all_colors = []
+
+    for i, pcd in enumerate(point_clouds):
+        if isinstance(pcd, np.ndarray):
+            points = pcd[:, :3]
+            all_points.append(points)
+            if colors:
+                all_colors.append(colors[i])
+
+        elif isinstance(pcd, o3d.geometry.PointCloud):
+            points = np.asarray(pcd.points)
+            all_points.append(points)
+            if pcd.has_colors():
+                cloud_colors = np.asarray(pcd.colors)
+                all_colors.append(cloud_colors)
+
+    if not all_points:
+        return o3d.geometry.PointCloud()
+
+    combined_pcd = o3d.geometry.PointCloud()
+    combined_pcd.points = o3d.utility.Vector3dVector(np.vstack(all_points))
+
+    if all_colors:
+        combined_pcd.colors = o3d.utility.Vector3dVector(np.vstack(all_colors))
+
+    return combined_pcd
diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py
index d33c7faa0d..fcf27584e6 100644
--- a/dimos/perception/segmentation/sam_2d_seg.py
+++ b/dimos/perception/segmentation/sam_2d_seg.py
@@ -181,7 +181,17 @@ def process_image(self, image):
                 tracked_names,
             )
         else:
-            # Return filtered results directly if tracker is disabled
+            # When tracker disabled, just use the filtered results directly
+            if self.use_analyzer:
+                # Add unanalyzed IDs to the analysis queue
+                for track_id in filtered_track_ids:
+                    if (
+                        track_id not in self.object_names
+                        and track_id not in self.to_be_analyzed
+                    ):
+                        self.to_be_analyzed.append(track_id)
+
+            # Simply return filtered results
             return (
                 filtered_masks,
                 filtered_bboxes,
@@ -299,6 +309,9 @@ def main():
     # Example 4: Basic segmentation only (both tracker and analyzer disabled)
     # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=False)
 
+    # Example 5: Analyzer without tracker (new capability)
+    # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=True)
+
     try:
         while cap.isOpened():
             ret, frame = cap.read()
@@ -311,7 +324,7
@@ def main(): masks, bboxes, target_ids, probs, names = segmenter.process_image(frame) # Run analysis if enabled - if segmenter.use_tracker and segmenter.use_analyzer: + if segmenter.use_analyzer: segmenter.run_analysis(frame, bboxes, target_ids) names = segmenter.get_object_names(target_ids, names) diff --git a/dimos/perception/segmentation/utils.py b/dimos/perception/segmentation/utils.py index c96a7d4a64..4101edfa40 100644 --- a/dimos/perception/segmentation/utils.py +++ b/dimos/perception/segmentation/utils.py @@ -241,6 +241,12 @@ def plot_results(image, masks, bboxes, track_ids, probs, names, alpha=0.5): if isinstance(mask, torch.Tensor): mask = mask.cpu().numpy() + # Ensure mask is in proper format for OpenCV resize + if mask.dtype == bool: + mask = mask.astype(np.uint8) + elif mask.dtype != np.uint8 and mask.dtype != np.float32: + mask = mask.astype(np.float32) + mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_LINEAR) # Generate consistent color based on track_id diff --git a/dimos/stream/stereo_camera_streams/zed.py b/dimos/stream/stereo_camera_streams/zed.py new file mode 100644 index 0000000000..ceb7eac58b --- /dev/null +++ b/dimos/stream/stereo_camera_streams/zed.py @@ -0,0 +1,164 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import time +from typing import Optional, Dict + +from reactivex import Observable, create +from reactivex import operators as ops + +from dimos.hardware.zed_camera import ZEDCamera + +try: + import pyzed.sl as sl +except ImportError: + sl = None + logging.warning("ZED SDK not found. Please install pyzed to use ZED camera functionality.") + +logger = logging.getLogger(__name__) + + +class ZEDCameraStream: + """ZED Camera stream that provides RGB and depth data as observables.""" + + def __init__( + self, + camera_id: int = 0, + resolution: Optional["sl.RESOLUTION"] = None, + depth_mode: Optional["sl.DEPTH_MODE"] = None, + fps: int = 30, + ): + """ + Initialize ZED camera stream. + + Args: + camera_id: Camera ID (0 for first ZED) + resolution: ZED camera resolution (defaults to HD720) + depth_mode: Depth computation mode (defaults to NEURAL) + fps: Camera frame rate (default: 30) + """ + if sl is None: + raise ImportError("ZED SDK not installed. Please install pyzed package.") + + self.camera_id = camera_id + self.fps = fps + + # Set default values if not provided + if resolution is None: + resolution = sl.RESOLUTION.HD720 + if depth_mode is None: + depth_mode = sl.DEPTH_MODE.NEURAL + + self.resolution = resolution + self.depth_mode = depth_mode + + # Initialize ZED camera + self.zed_camera = ZEDCamera( + camera_id=camera_id, resolution=resolution, depth_mode=depth_mode, fps=fps + ) + + self.is_opened = False + + def _initialize_camera(self) -> None: + """Initialize the ZED camera if not already initialized.""" + if not self.is_opened: + if not self.zed_camera.open(): + raise RuntimeError(f"Failed to open ZED camera {self.camera_id}") + self.is_opened = True + logger.info(f"ZED camera {self.camera_id} opened successfully") + + def create_stream(self) -> Observable: + """ + Create an observable stream of RGB and depth frames. + + Returns: + Observable: An observable emitting dictionaries with 'rgb' and 'depth' keys. 
+ """ + + def emit_frames(observer, scheduler): + try: + # Initialize camera + if not self.is_opened: + self._initialize_camera() + + while True: + # Capture frame directly + left_img, right_img, depth_img = self.zed_camera.capture_frame() + + if left_img is not None and depth_img is not None: + frame_data = { + "rgb": left_img, + "depth": depth_img, + "right": right_img, + "timestamp": time.time(), + } + + observer.on_next(frame_data) + + except Exception as e: + logger.error(f"Error during ZED frame emission: {e}") + observer.on_error(e) + finally: + # Clean up resources + self._cleanup_camera() + observer.on_completed() + + return create(emit_frames).pipe( + ops.share(), # Share the stream among multiple subscribers + ) + + def get_camera_info(self) -> Dict[str, float]: + """ + Get ZED camera intrinsics (fx, fy, cx, cy). + + Returns: + Dictionary containing camera intrinsics: fx, fy, cx, cy + """ + if not self.is_opened: + self._initialize_camera() + + try: + camera_info = self.zed_camera.get_camera_info() + left_cam = camera_info.get("left_cam", {}) + + return { + "fx": left_cam.get("fx", 0.0), + "fy": left_cam.get("fy", 0.0), + "cx": left_cam.get("cx", 0.0), + "cy": left_cam.get("cy", 0.0), + } + except Exception as e: + logger.error(f"Error getting camera info: {e}") + return {} + + def _cleanup_camera(self) -> None: + """Clean up camera resources.""" + if self.is_opened: + self.zed_camera.close() + self.is_opened = False + logger.info("ZED camera resources cleaned up") + + def cleanup(self) -> None: + """Clean up all resources.""" + self._cleanup_camera() + + def __enter__(self): + """Context manager entry.""" + self._initialize_camera() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self._cleanup_camera() diff --git a/dimos/types/manipulation.py b/dimos/types/manipulation.py index d61d73a7ed..fee4e69ebb 100644 --- a/dimos/types/manipulation.py +++ b/dimos/types/manipulation.py @@ -13,7 +13,7 @@ # limitations 
under the License. from enum import Enum -from typing import Dict, List, Optional, Any, Union, TypedDict, Tuple, Literal +from typing import Dict, List, Optional, Any, Union, TypedDict, Tuple, Literal, TYPE_CHECKING from dataclasses import dataclass, field, fields from abc import ABC, abstractmethod import uuid @@ -21,6 +21,9 @@ import time from dimos.types.vector import Vector +if TYPE_CHECKING: + import open3d as o3d + class ConstraintType(Enum): """Types of manipulation constraints.""" @@ -72,6 +75,7 @@ class ForceConstraint(AbstractConstraint): class ObjectData(TypedDict, total=False): """Data about an object in the manipulation scene.""" + # Basic detection information object_id: int # Unique ID for the object bbox: List[float] # Bounding box [x1, y1, x2, y2] depth: float # Depth in meters from Metric3d @@ -80,9 +84,16 @@ class ObjectData(TypedDict, total=False): label: str # Semantic label (e.g., 'cup', 'table') movement_tolerance: float # (0.0 = immovable, 1.0 = freely movable) segmentation_mask: np.ndarray # Binary mask of the object's pixels - position: Dict[str, float] # 3D position {x, y, z} - rotation: Dict[str, float] # 3D rotation {roll, pitch, yaw} - size: Dict[str, float] # Object dimensions {width, height} + + # 3D pose and dimensions + position: Union[Dict[str, float], Vector] # 3D position {x, y, z} or Vector + rotation: Union[Dict[str, float], Vector] # 3D rotation {roll, pitch, yaw} or Vector + size: Dict[str, float] # Object dimensions {width, height, depth} + + # Point cloud data + point_cloud: "o3d.geometry.PointCloud" # Open3D point cloud object + point_cloud_numpy: np.ndarray # Nx6 array of XYZRGB points + color: np.ndarray # RGB color for visualization [R, G, B] class ManipulationMetadata(TypedDict, total=False): diff --git a/tests/test_manipulation_perception_pipeline.py.py b/tests/test_manipulation_perception_pipeline.py.py new file mode 100644 index 0000000000..8b333ec310 --- /dev/null +++ 
b/tests/test_manipulation_perception_pipeline.py.py @@ -0,0 +1,167 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import time +import threading +from reactivex import operators as ops + +import tests.test_header + +from pyzed import sl +from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream +from dimos.web.robot_web_interface import RobotWebInterface +from dimos.utils.logging_config import logger +from dimos.perception.manip_aio_pipeline import ManipulationPipeline + + +def monitor_grasps(pipeline): + """Monitor and print grasp updates in a separate thread.""" + print(" Grasp monitor started...") + + last_grasp_count = 0 + last_update_time = time.time() + + while True: + try: + # Get latest grasps using the getter function + grasps = pipeline.get_latest_grasps(timeout=0.5) + current_time = time.time() + + if grasps is not None: + current_count = len(grasps) + if current_count != last_grasp_count: + print(f" Grasps received: {current_count} (at {time.strftime('%H:%M:%S')})") + if current_count > 0: + best_score = max(grasp.get("score", 0.0) for grasp in grasps) + print(f" Best grasp score: {best_score:.3f}") + last_grasp_count = current_count + last_update_time = current_time + else: + # Show periodic "still waiting" message + if current_time - last_update_time > 10.0: + print(f" Still waiting for grasps... 
({time.strftime('%H:%M:%S')})") + last_update_time = current_time + + time.sleep(1.0) # Check every second + + except Exception as e: + print(f" Error in grasp monitor: {e}") + time.sleep(2.0) + + +def main(): + """Test point cloud filtering with grasp generation using ManipulationPipeline.""" + print(" Testing point cloud filtering + grasp generation with ManipulationPipeline...") + + # Configuration + min_confidence = 0.6 + web_port = 5555 + grasp_server_url = "ws://18.224.39.74:8000/ws/grasp" + + try: + # Initialize ZED camera stream + zed_stream = ZEDCameraStream(resolution=sl.RESOLUTION.HD1080, fps=10) + + # Get camera intrinsics + camera_intrinsics_dict = zed_stream.get_camera_info() + camera_intrinsics = [ + camera_intrinsics_dict["fx"], + camera_intrinsics_dict["fy"], + camera_intrinsics_dict["cx"], + camera_intrinsics_dict["cy"], + ] + + # Create the concurrent manipulation pipeline WITH grasp generation + pipeline = ManipulationPipeline( + camera_intrinsics=camera_intrinsics, + min_confidence=min_confidence, + max_objects=10, + grasp_server_url=grasp_server_url, + enable_grasp_generation=True, # Enable grasp generation + ) + + # Create ZED stream + zed_frame_stream = zed_stream.create_stream().pipe(ops.share()) + + # Create concurrent processing streams + streams = pipeline.create_streams(zed_frame_stream) + detection_viz_stream = streams["detection_viz"] + pointcloud_viz_stream = streams["pointcloud_viz"] + grasps_stream = streams.get("grasps") # Get grasp stream if available + grasp_overlay_stream = streams.get("grasp_overlay") # Get grasp overlay stream if available + + except ImportError: + print("Error: ZED SDK not installed. 
Please install pyzed package.") + sys.exit(1) + except RuntimeError as e: + print(f"Error: Failed to open ZED camera: {e}") + sys.exit(1) + + try: + # Set up web interface with concurrent visualization streams + print("Initializing web interface...") + web_interface = RobotWebInterface( + port=web_port, + object_detection=detection_viz_stream, + pointcloud_stream=pointcloud_viz_stream, + grasp_overlay_stream=grasp_overlay_stream, + ) + + # Start grasp monitoring in background thread + grasp_monitor_thread = threading.Thread( + target=monitor_grasps, args=(pipeline,), daemon=True + ) + grasp_monitor_thread.start() + + print(f"\n Point Cloud + Grasp Generation Test Running:") + print(f" Web Interface: http://localhost:{web_port}") + print(f" Object Detection View: RGB with bounding boxes") + print(f" Point Cloud View: Depth with colored point clouds and 3D bounding boxes") + print(f" Confidence threshold: {min_confidence}") + print(f" Grasp server: {grasp_server_url}") + print(f" Available streams: {list(streams.keys())}") + print("\nPress Ctrl+C to stop the test\n") + + # Start web server (blocking call) + web_interface.run() + + except KeyboardInterrupt: + print("\nTest interrupted by user") + except Exception as e: + print(f"Error during test: {e}") + finally: + print("Cleaning up resources...") + if "zed_stream" in locals(): + zed_stream.cleanup() + if "pipeline" in locals(): + pipeline.cleanup() + print("Test completed") + + +if __name__ == "__main__": + main() diff --git a/tests/test_manipulation_pipeline_single_frame.py b/tests/test_manipulation_pipeline_single_frame.py new file mode 100644 index 0000000000..1fc42d0810 --- /dev/null +++ b/tests/test_manipulation_pipeline_single_frame.py @@ -0,0 +1,248 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test manipulation processor with direct visualization and grasp data output.""" + +import os +import sys +import cv2 +import numpy as np +import time +import argparse +import matplotlib + +# Try to use TkAgg backend for live display, fallback to Agg if not available +try: + matplotlib.use("TkAgg") +except: + try: + matplotlib.use("Qt5Agg") + except: + matplotlib.use("Agg") # Fallback to non-interactive +import matplotlib.pyplot as plt +import open3d as o3d +from typing import Dict, List + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid +from dimos.perception.manip_aio_processer import ManipulationProcessor +from dimos.perception.pointcloud.utils import ( + load_camera_matrix_from_yaml, + visualize_pcd, + combine_object_pointclouds, +) +from dimos.utils.logging_config import setup_logger + +from dimos.perception.grasp_generation.utils import visualize_grasps_3d, create_grasp_overlay + +logger = setup_logger("test_pipeline_viz") + + +def load_first_frame(data_dir: str): + """Load first RGB-D frame and camera intrinsics.""" + # Load images + color_img = cv2.imread(os.path.join(data_dir, "color", "00000.png")) + color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) + + depth_img = cv2.imread(os.path.join(data_dir, "depth", "00000.png"), cv2.IMREAD_ANYDEPTH) + if depth_img.dtype == np.uint16: + depth_img = depth_img.astype(np.float32) / 1000.0 + # Load intrinsics + camera_matrix = 
load_camera_matrix_from_yaml(os.path.join(data_dir, "color_camera_info.yaml")) + intrinsics = [ + camera_matrix[0, 0], + camera_matrix[1, 1], + camera_matrix[0, 2], + camera_matrix[1, 2], + ] + + return color_img, depth_img, intrinsics + + +def create_point_cloud(color_img, depth_img, intrinsics): + """Create Open3D point cloud.""" + fx, fy, cx, cy = intrinsics + height, width = depth_img.shape + + o3d_intrinsics = o3d.camera.PinholeCameraIntrinsic(width, height, fx, fy, cx, cy) + color_o3d = o3d.geometry.Image(color_img) + depth_o3d = o3d.geometry.Image((depth_img * 1000).astype(np.uint16)) + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1000.0, convert_rgb_to_intensity=False + ) + + return o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, o3d_intrinsics) + + +def run_processor(color_img, depth_img, intrinsics, grasp_server_url=None): + """Run processor and collect results.""" + processor_kwargs = { + "camera_intrinsics": intrinsics, + "enable_grasp_generation": True, + "enable_segmentation": True, + } + + if grasp_server_url: + processor_kwargs["grasp_server_url"] = grasp_server_url + + processor = ManipulationProcessor(**processor_kwargs) + + # Process frame without grasp generation + results = processor.process_frame(color_img, depth_img, generate_grasps=False) + + # Run grasp generation separately + grasps = processor.run_grasp_generation(results["all_objects"], results["full_pointcloud"]) + results["grasps"] = grasps + results["grasp_overlay"] = create_grasp_overlay(color_img, grasps, intrinsics) + + processor.cleanup() + return results + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data-dir", default="assets/rgbd_data") + parser.add_argument("--wait-time", type=float, default=5.0) + parser.add_argument( + "--grasp-server-url", + default="ws://18.224.39.74:8000/ws/grasp", + help="WebSocket URL for Dimensional Grasp server", + ) + args = parser.parse_args() + + # Load data + 
color_img, depth_img, intrinsics = load_first_frame(args.data_dir) + logger.info(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}") + + # Run processor + results = run_processor(color_img, depth_img, intrinsics, args.grasp_server_url) + + # Print results summary + print(f"Processing time: {results.get('processing_time', 0):.3f}s") + print(f"Detection objects: {len(results.get('detected_objects', []))}") + print(f"All objects processed: {len(results.get('all_objects', []))}") + + # Print grasp summary + grasp_data = results["grasps"] + total_grasps = len(grasp_data) if isinstance(grasp_data, list) else 0 + best_score = max(grasp["score"] for grasp in grasp_data) if grasp_data else 0 + + print(f"Grasps: {total_grasps} total (best score: {best_score:.3f})") + + # Create visualizations + plot_configs = [] + if results["detection_viz"] is not None: + plot_configs.append(("detection_viz", "Object Detection")) + if results["segmentation_viz"] is not None: + plot_configs.append(("segmentation_viz", "Semantic Segmentation")) + if results["pointcloud_viz"] is not None: + plot_configs.append(("pointcloud_viz", "All Objects Point Cloud")) + if results["detected_pointcloud_viz"] is not None: + plot_configs.append(("detected_pointcloud_viz", "Detection Objects Point Cloud")) + if results["misc_pointcloud_viz"] is not None: + plot_configs.append(("misc_pointcloud_viz", "Misc/Background Points")) + if results["grasp_overlay"] is not None: + plot_configs.append(("grasp_overlay", "Grasp Overlay")) + + # Create subplot layout + num_plots = len(plot_configs) + if num_plots <= 3: + fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 5)) + else: + rows = 2 + cols = (num_plots + 1) // 2 + fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 5 * rows)) + + if num_plots == 1: + axes = [axes] + elif num_plots > 2: + axes = axes.flatten() + + # Plot each result + for i, (key, title) in enumerate(plot_configs): + axes[i].imshow(results[key]) + 
axes[i].set_title(title) + axes[i].axis("off") + + # Hide unused subplots + if num_plots > 3: + for i in range(num_plots, len(axes)): + axes[i].axis("off") + + plt.tight_layout() + plt.savefig("manipulation_results.png", dpi=150, bbox_inches="tight") + plt.show(block=True) + plt.close() + + point_clouds = [obj["point_cloud"] for obj in results["all_objects"]] + colors = [obj["color"] for obj in results["all_objects"]] + combined_pcd = combine_object_pointclouds(point_clouds, colors) + + # 3D Grasp visualization + if grasp_data: + # Convert grasp format to visualization format for 3D display + viz_grasps = [] + for grasp in grasp_data: + translation = grasp.get("translation", [0, 0, 0]) + rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3).tolist())) + score = grasp.get("score", 0.0) + width = grasp.get("width", 0.08) + + viz_grasp = { + "translation": translation, + "rotation_matrix": rotation_matrix, + "width": width, + "score": score, + } + viz_grasps.append(viz_grasp) + + # Use unified 3D visualization + visualize_grasps_3d(combined_pcd, viz_grasps) + + # Visualize full point cloud + visualize_pcd( + results["full_pointcloud"], + window_name="Full Scene Point Cloud", + point_size=2.0, + show_coordinate_frame=True, + ) + + # Visualize all objects point cloud + visualize_pcd( + combined_pcd, + window_name="All Objects Point Cloud", + point_size=3.0, + show_coordinate_frame=True, + ) + + # Visualize misc clusters + visualize_clustered_point_clouds( + results["misc_clusters"], + window_name="Misc/Background Clusters (DBSCAN)", + point_size=3.0, + show_coordinate_frame=True, + ) + + # Visualize voxel grid + visualize_voxel_grid( + results["misc_voxel_grid"], + window_name="Misc/Background Voxel Grid", + show_coordinate_frame=True, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/test_manipulation_pipeline_single_frame_lcm.py b/tests/test_manipulation_pipeline_single_frame_lcm.py new file mode 100644 index 0000000000..d8090575eb --- 
/dev/null +++ b/tests/test_manipulation_pipeline_single_frame_lcm.py @@ -0,0 +1,431 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test manipulation processor with LCM topic subscription.""" + +import os +import sys +import cv2 +import numpy as np +import time +import argparse +import threading +import pickle +import matplotlib +import json +import copy + +# Try to use TkAgg backend for live display, fallback to Agg if not available +try: + matplotlib.use("TkAgg") +except: + try: + matplotlib.use("Qt5Agg") + except: + matplotlib.use("Agg") # Fallback to non-interactive +import matplotlib.pyplot as plt +import open3d as o3d +from typing import Dict, List, Optional + +# LCM imports +import lcm +from lcm_msgs.sensor_msgs import Image as LCMImage +from lcm_msgs.sensor_msgs import CameraInfo as LCMCameraInfo + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid +from dimos.perception.manip_aio_processer import ManipulationProcessor +from dimos.perception.grasp_generation.utils import visualize_grasps_3d +from dimos.perception.pointcloud.utils import visualize_pcd +from dimos.utils.logging_config import setup_logger + +logger = setup_logger("test_pipeline_lcm") + + +class LCMDataCollector: + """Collects one message from each required LCM topic.""" + + def __init__(self, lcm_url: str = 
"udpm://239.255.76.67:7667?ttl=1"): + self.lcm = lcm.LCM(lcm_url) + + # Data storage + self.rgb_data: Optional[np.ndarray] = None + self.depth_data: Optional[np.ndarray] = None + self.camera_intrinsics: Optional[List[float]] = None + + # Synchronization + self.data_lock = threading.Lock() + self.data_ready_event = threading.Event() + + # Flags to track received messages + self.rgb_received = False + self.depth_received = False + self.camera_info_received = False + + # Subscribe to topics + self.lcm.subscribe("head_cam_rgb#sensor_msgs.Image", self._handle_rgb_message) + self.lcm.subscribe("head_cam_depth#sensor_msgs.Image", self._handle_depth_message) + self.lcm.subscribe("head_cam_info#sensor_msgs.CameraInfo", self._handle_camera_info_message) + + logger.info("LCM Data Collector initialized") + logger.info("Subscribed to topics:") + logger.info(" - head_cam_rgb#sensor_msgs.Image") + logger.info(" - head_cam_depth#sensor_msgs.Image") + logger.info(" - head_cam_info#sensor_msgs.CameraInfo") + + def _handle_rgb_message(self, channel: str, data: bytes): + """Handle RGB image message.""" + if self.rgb_received: + return # Already got one, ignore subsequent messages + + try: + msg = LCMImage.decode(data) + + # Convert message data to numpy array + if msg.encoding == "rgb8": + # RGB8 format: 3 bytes per pixel + rgb_array = np.frombuffer(msg.data[: msg.data_length], dtype=np.uint8) + rgb_image = rgb_array.reshape((msg.height, msg.width, 3)) + + with self.data_lock: + self.rgb_data = rgb_image + self.rgb_received = True + logger.info( + f"RGB message received: {msg.width}x{msg.height}, encoding: {msg.encoding}" + ) + self._check_all_data_received() + + else: + logger.warning(f"Unsupported RGB encoding: {msg.encoding}") + + except Exception as e: + logger.error(f"Error processing RGB message: {e}") + + def _handle_depth_message(self, channel: str, data: bytes): + """Handle depth image message.""" + if self.depth_received: + return # Already got one, ignore subsequent 
messages + + try: + msg = LCMImage.decode(data) + + # Convert message data to numpy array + if msg.encoding == "32FC1": + # 32FC1 format: 4 bytes (float32) per pixel + depth_array = np.frombuffer(msg.data[: msg.data_length], dtype=np.float32) + depth_image = depth_array.reshape((msg.height, msg.width)) + + with self.data_lock: + self.depth_data = depth_image + self.depth_received = True + logger.info( + f"Depth message received: {msg.width}x{msg.height}, encoding: {msg.encoding}" + ) + logger.info( + f"Depth range: {depth_image.min():.3f} - {depth_image.max():.3f} meters" + ) + self._check_all_data_received() + + else: + logger.warning(f"Unsupported depth encoding: {msg.encoding}") + + except Exception as e: + logger.error(f"Error processing depth message: {e}") + + def _handle_camera_info_message(self, channel: str, data: bytes): + """Handle camera info message.""" + if self.camera_info_received: + return # Already got one, ignore subsequent messages + + try: + msg = LCMCameraInfo.decode(data) + + # Extract intrinsics from K matrix: [fx, 0, cx, 0, fy, cy, 0, 0, 1] + K = msg.K + fx = K[0] # K[0,0] + fy = K[4] # K[1,1] + cx = K[2] # K[0,2] + cy = K[5] # K[1,2] + + intrinsics = [fx, fy, cx, cy] + + with self.data_lock: + self.camera_intrinsics = intrinsics + self.camera_info_received = True + logger.info(f"Camera info received: {msg.width}x{msg.height}") + logger.info(f"Intrinsics: fx={fx:.1f}, fy={fy:.1f}, cx={cx:.1f}, cy={cy:.1f}") + self._check_all_data_received() + + except Exception as e: + logger.error(f"Error processing camera info message: {e}") + + def _check_all_data_received(self): + """Check if all required data has been received.""" + if self.rgb_received and self.depth_received and self.camera_info_received: + logger.info("✅ All required data received!") + self.data_ready_event.set() + + def wait_for_data(self, timeout: float = 30.0) -> bool: + """Wait for all data to be received.""" + logger.info("Waiting for RGB, depth, and camera info messages...") + 
+ # Start LCM handling in a separate thread + lcm_thread = threading.Thread(target=self._lcm_handle_loop, daemon=True) + lcm_thread.start() + + # Wait for data with timeout + return self.data_ready_event.wait(timeout) + + def _lcm_handle_loop(self): + """LCM message handling loop.""" + try: + while not self.data_ready_event.is_set(): + self.lcm.handle_timeout(100) # 100ms timeout + except Exception as e: + logger.error(f"Error in LCM handling loop: {e}") + + def get_data(self): + """Get the collected data.""" + with self.data_lock: + return self.rgb_data, self.depth_data, self.camera_intrinsics + + +def create_point_cloud(color_img, depth_img, intrinsics): + """Create Open3D point cloud.""" + fx, fy, cx, cy = intrinsics + height, width = depth_img.shape + + o3d_intrinsics = o3d.camera.PinholeCameraIntrinsic(width, height, fx, fy, cx, cy) + color_o3d = o3d.geometry.Image(color_img) + depth_o3d = o3d.geometry.Image((depth_img * 1000).astype(np.uint16)) + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1000.0, convert_rgb_to_intensity=False + ) + + return o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, o3d_intrinsics) + + +def run_processor(color_img, depth_img, intrinsics): + """Run processor and collect results.""" + # Create processor + processor = ManipulationProcessor( + camera_intrinsics=intrinsics, + grasp_server_url="ws://18.224.39.74:8000/ws/grasp", + enable_grasp_generation=False, + enable_segmentation=True, + ) + + # Process single frame directly + results = processor.process_frame(color_img, depth_img) + + # Debug: print available results + print(f"Available results: {list(results.keys())}") + + processor.cleanup() + + return results + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lcm-url", default="udpm://239.255.76.67:7667?ttl=1", help="LCM URL for subscription" + ) + parser.add_argument( + "--timeout", type=float, default=30.0, help="Timeout in seconds to wait for 
messages" + ) + parser.add_argument( + "--save-images", action="store_true", help="Save received RGB and depth images to files" + ) + args = parser.parse_args() + + # Create data collector + collector = LCMDataCollector(args.lcm_url) + + # Wait for data + if not collector.wait_for_data(args.timeout): + logger.error(f"Timeout waiting for data after {args.timeout} seconds") + logger.error("Make sure Unity is running and publishing to the LCM topics") + return + + # Get the collected data + color_img, depth_img, intrinsics = collector.get_data() + + logger.info(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}") + logger.info(f"Intrinsics: {intrinsics}") + + # Save images if requested + if args.save_images: + try: + cv2.imwrite("received_rgb.png", cv2.cvtColor(color_img, cv2.COLOR_RGB2BGR)) + # Save depth as 16-bit for visualization + depth_viz = (np.clip(depth_img * 1000, 0, 65535)).astype(np.uint16) + cv2.imwrite("received_depth.png", depth_viz) + logger.info("Saved received_rgb.png and received_depth.png") + except Exception as e: + logger.warning(f"Failed to save images: {e}") + + # Run processor + results = run_processor(color_img, depth_img, intrinsics) + + # Debug: Print what we received + print(f"\n✅ Processor Results:") + print(f" Available results: {list(results.keys())}") + print(f" Processing time: {results.get('processing_time', 0):.3f}s") + + # Show timing breakdown if available + if "timing_breakdown" in results: + breakdown = results["timing_breakdown"] + print(f" Timing breakdown:") + print(f" - Detection: {breakdown.get('detection', 0):.3f}s") + print(f" - Segmentation: {breakdown.get('segmentation', 0):.3f}s") + print(f" - Point cloud: {breakdown.get('pointcloud', 0):.3f}s") + print(f" - Misc extraction: {breakdown.get('misc_extraction', 0):.3f}s") + + # Print object information + detected_count = len(results.get("detected_objects", [])) + all_count = len(results.get("all_objects", [])) + + print(f" Detection objects: 
{detected_count}") + print(f" All objects processed: {all_count}") + + # Print misc clusters information + if "misc_clusters" in results and results["misc_clusters"]: + cluster_count = len(results["misc_clusters"]) + total_misc_points = sum( + len(np.asarray(cluster.points)) for cluster in results["misc_clusters"] + ) + print(f" Misc clusters: {cluster_count} clusters with {total_misc_points} total points") + else: + print(f" Misc clusters: None") + + # Print grasp summary + if "grasps" in results and results["grasps"]: + total_grasps = 0 + best_score = 0 + for grasp in results["grasps"]: + score = grasp.get("score", 0) + if score > best_score: + best_score = score + total_grasps += 1 + print(f" Grasps generated: {total_grasps} (best score: {best_score:.3f})") + else: + print(" Grasps: None generated") + + # Save results to pickle file + pickle_path = "manipulation_results.pkl" + print(f"\nSaving results to pickle file: {pickle_path}") + + def serialize_point_cloud(pcd): + """Convert Open3D PointCloud to serializable format.""" + if pcd is None: + return None + data = { + "points": np.asarray(pcd.points).tolist() if hasattr(pcd, "points") else [], + "colors": np.asarray(pcd.colors).tolist() + if hasattr(pcd, "colors") and pcd.colors + else [], + } + return data + + def serialize_voxel_grid(voxel_grid): + """Convert Open3D VoxelGrid to serializable format.""" + if voxel_grid is None: + return None + + # Extract voxel data + voxels = voxel_grid.get_voxels() + data = { + "voxel_size": voxel_grid.voxel_size, + "origin": np.asarray(voxel_grid.origin).tolist(), + "voxels": [ + ( + v.grid_index[0], + v.grid_index[1], + v.grid_index[2], + v.color[0], + v.color[1], + v.color[2], + ) + for v in voxels + ], + } + return data + + # Create a copy of results with non-picklable objects converted + pickle_data = { + "color_img": color_img, + "depth_img": depth_img, + "intrinsics": intrinsics, + "results": {}, + } + + # Convert and store all results, properly handling Open3D 
objects + for key, value in results.items(): + if key.endswith("_viz") or key in [ + "processing_time", + "timing_breakdown", + "detection2d_objects", + "segmentation2d_objects", + ]: + # These are already serializable + pickle_data["results"][key] = value + elif key == "full_pointcloud": + # Serialize PointCloud object + pickle_data["results"][key] = serialize_point_cloud(value) + print(f"Serialized {key}") + elif key == "misc_voxel_grid": + # Serialize VoxelGrid object + pickle_data["results"][key] = serialize_voxel_grid(value) + print(f"Serialized {key}") + elif key == "misc_clusters": + # List of PointCloud objects + if value: + serialized_clusters = [serialize_point_cloud(cluster) for cluster in value] + pickle_data["results"][key] = serialized_clusters + print(f"Serialized {key} ({len(serialized_clusters)} clusters)") + elif key == "detected_objects" or key == "all_objects": + # Objects with PointCloud attributes + serialized_objects = [] + for obj in value: + obj_dict = {k: v for k, v in obj.items() if k != "point_cloud"} + if "point_cloud" in obj: + obj_dict["point_cloud"] = serialize_point_cloud(obj.get("point_cloud")) + serialized_objects.append(obj_dict) + pickle_data["results"][key] = serialized_objects + print(f"Serialized {key} ({len(serialized_objects)} objects)") + else: + try: + # Try to pickle as is + pickle_data["results"][key] = value + print(f"Preserved {key} as is") + except (TypeError, ValueError): + print(f"Warning: Could not serialize {key}, skipping") + + with open(pickle_path, "wb") as f: + pickle.dump(pickle_data, f) + + print(f"Results saved successfully with all 3D data serialized!") + print(f"Pickled data keys: {list(pickle_data['results'].keys())}") + + # Visualization code has been moved to visualization_script.py + # The results have been pickled and can be loaded from there + print("\nVisualization code has been moved to visualization_script.py") + print("Run 'python visualization_script.py' to visualize the results") + + +if 
__name__ == "__main__": + main() diff --git a/tests/test_pointcloud_filtering.py b/tests/test_pointcloud_filtering.py new file mode 100644 index 0000000000..308b4fc6ac --- /dev/null +++ b/tests/test_pointcloud_filtering.py @@ -0,0 +1,105 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time +import threading +from reactivex import operators as ops + +import tests.test_header + +from pyzed import sl +from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream +from dimos.web.robot_web_interface import RobotWebInterface +from dimos.utils.logging_config import logger +from dimos.perception.manip_aio_pipeline import ManipulationPipeline + + +def main(): + """Test point cloud filtering using the concurrent stream-based ManipulationPipeline.""" + print("Testing point cloud filtering with ManipulationPipeline...") + + # Configuration + min_confidence = 0.6 + web_port = 5555 + + try: + # Initialize ZED camera stream + zed_stream = ZEDCameraStream(resolution=sl.RESOLUTION.HD1080, fps=10) + + # Get camera intrinsics + camera_intrinsics_dict = zed_stream.get_camera_info() + camera_intrinsics = [ + camera_intrinsics_dict["fx"], + camera_intrinsics_dict["fy"], + camera_intrinsics_dict["cx"], + camera_intrinsics_dict["cy"], + ] + + # Create the concurrent manipulation pipeline + pipeline = ManipulationPipeline( + camera_intrinsics=camera_intrinsics, + min_confidence=min_confidence, + max_objects=10, + ) + + # Create 
ZED stream + zed_frame_stream = zed_stream.create_stream().pipe(ops.share()) + + # Create concurrent processing streams + streams = pipeline.create_streams(zed_frame_stream) + detection_viz_stream = streams["detection_viz"] + pointcloud_viz_stream = streams["pointcloud_viz"] + + except ImportError: + print("Error: ZED SDK not installed. Please install pyzed package.") + sys.exit(1) + except RuntimeError as e: + print(f"Error: Failed to open ZED camera: {e}") + sys.exit(1) + + try: + # Set up web interface with concurrent visualization streams + print("Initializing web interface...") + web_interface = RobotWebInterface( + port=web_port, + object_detection=detection_viz_stream, + pointcloud_stream=pointcloud_viz_stream, + ) + + print(f"\nPoint Cloud Filtering Test Running:") + print(f"Web Interface: http://localhost:{web_port}") + print(f"Object Detection View: RGB with bounding boxes") + print(f"Point Cloud View: Depth with colored point clouds and 3D bounding boxes") + print(f"Confidence threshold: {min_confidence}") + print("\nPress Ctrl+C to stop the test\n") + + # Start web server (blocking call) + web_interface.run() + + except KeyboardInterrupt: + print("\nTest interrupted by user") + except Exception as e: + print(f"Error during test: {e}") + finally: + print("Cleaning up resources...") + if "zed_stream" in locals(): + zed_stream.cleanup() + if "pipeline" in locals(): + pipeline.cleanup() + print("Test completed") + + +if __name__ == "__main__": + main() diff --git a/tests/test_zed_setup.py b/tests/test_zed_setup.py new file mode 100755 index 0000000000..ca50bb63fb --- /dev/null +++ b/tests/test_zed_setup.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Simple test script to verify ZED camera setup and basic functionality. +""" + +import sys +from pathlib import Path + + +def test_imports(): + """Test that all required modules can be imported.""" + print("Testing imports...") + + try: + import numpy as np + + print("✓ NumPy imported successfully") + except ImportError as e: + print(f"✗ NumPy import failed: {e}") + return False + + try: + import cv2 + + print("✓ OpenCV imported successfully") + except ImportError as e: + print(f"✗ OpenCV import failed: {e}") + return False + + try: + from PIL import Image, ImageDraw, ImageFont + + print("✓ PIL imported successfully") + except ImportError as e: + print(f"✗ PIL import failed: {e}") + return False + + try: + import pyzed.sl as sl + + print("✓ ZED SDK (pyzed) imported successfully") + # Note: SDK version method varies between versions + except ImportError as e: + print(f"✗ ZED SDK import failed: {e}") + print(" Please install ZED SDK and pyzed package") + return False + + try: + from dimos.hardware.zed_camera import ZEDCamera + + print("✓ ZEDCamera class imported successfully") + except ImportError as e: + print(f"✗ ZEDCamera import failed: {e}") + return False + + try: + from dimos.perception.zed_visualizer import ZEDVisualizer + + print("✓ ZEDVisualizer class imported successfully") + except ImportError as e: + print(f"✗ ZEDVisualizer import failed: {e}") + return False + + return True + + +def test_camera_detection(): + """Test if ZED cameras are detected.""" + print("\nTesting camera detection...") + + try: + import pyzed.sl as sl + + # List 
available cameras + cameras = sl.Camera.get_device_list() + print(f"Found {len(cameras)} ZED camera(s):") + + for i, camera_info in enumerate(cameras): + print(f" Camera {i}:") + print(f" Model: {camera_info.camera_model}") + print(f" Serial: {camera_info.serial_number}") + print(f" State: {camera_info.camera_state}") + + return len(cameras) > 0 + + except Exception as e: + print(f"Error detecting cameras: {e}") + return False + + +def test_basic_functionality(): + """Test basic ZED camera functionality without actually opening the camera.""" + print("\nTesting basic functionality...") + + try: + import pyzed.sl as sl + from dimos.hardware.zed_camera import ZEDCamera + from dimos.perception.zed_visualizer import ZEDVisualizer + + # Test camera initialization (without opening) + camera = ZEDCamera( + camera_id=0, + resolution=sl.RESOLUTION.HD720, + depth_mode=sl.DEPTH_MODE.NEURAL, + ) + print("✓ ZEDCamera instance created successfully") + + # Test visualizer initialization + visualizer = ZEDVisualizer(max_depth=10.0) + print("✓ ZEDVisualizer instance created successfully") + + # Test creating a dummy visualization + dummy_rgb = np.zeros((480, 640, 3), dtype=np.uint8) + dummy_depth = np.ones((480, 640), dtype=np.float32) * 2.0 + + vis = visualizer.create_side_by_side_image(dummy_rgb, dummy_depth) + print("✓ Dummy visualization created successfully") + + return True + + except Exception as e: + print(f"✗ Basic functionality test failed: {e}") + return False + + +def main(): + """Run all tests.""" + print("ZED Camera Setup Test") + print("=" * 50) + + # Test imports + if not test_imports(): + print("\n❌ Import tests failed. Please install missing dependencies.") + return False + + # Test camera detection + cameras_found = test_camera_detection() + if not cameras_found: + print( + "\n⚠️ No ZED cameras detected. Please connect a ZED camera to test capture functionality." 
+ ) + + # Test basic functionality + if not test_basic_functionality(): + print("\n❌ Basic functionality tests failed.") + return False + + print("\n" + "=" * 50) + if cameras_found: + print("✅ All tests passed! You can now run the ZED demo:") + print(" python examples/zed_neural_depth_demo.py --display-time 10") + else: + print("✅ Setup is ready, but no camera detected.") + print(" Connect a ZED camera and run:") + print(" python examples/zed_neural_depth_demo.py --display-time 10") + + return True + + +if __name__ == "__main__": + # Add the project root to Python path + sys.path.append(str(Path(__file__).parent)) + + # Import numpy after path setup + import numpy as np + + success = main() + sys.exit(0 if success else 1) diff --git a/tests/visualization_script.py b/tests/visualization_script.py new file mode 100644 index 0000000000..d0c4c6af84 --- /dev/null +++ b/tests/visualization_script.py @@ -0,0 +1,1041 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Visualize pickled manipulation pipeline results.""" + +import os +import sys +import pickle +import numpy as np +import json +import matplotlib + +# Try to use TkAgg backend for live display, fallback to Agg if not available +try: + matplotlib.use("TkAgg") +except: + try: + matplotlib.use("Qt5Agg") + except: + matplotlib.use("Agg") # Fallback to non-interactive +import matplotlib.pyplot as plt +import open3d as o3d + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid +from dimos.perception.grasp_generation.utils import visualize_grasps_3d +from dimos.perception.pointcloud.utils import visualize_pcd +from dimos.utils.logging_config import setup_logger +import trimesh + +import tf_lcm_py +import cv2 +from contextlib import contextmanager +import lcm_msgs +from lcm_msgs.sensor_msgs import JointState, PointCloud2, CameraInfo, PointCloud2, PointField +from lcm_msgs.std_msgs import Header +from typing import List, Tuple, Optional +import atexit +from datetime import datetime +import time + +from pydrake.all import ( + AddMultibodyPlantSceneGraph, + CoulombFriction, + Diagram, + DiagramBuilder, + InverseKinematics, + MeshcatVisualizer, + MeshcatVisualizerParams, + MultibodyPlant, + Parser, + RigidTransform, + RollPitchYaw, + RotationMatrix, + JointIndex, + Solve, + StartMeshcat, +) +from pydrake.geometry import ( + CollisionFilterDeclaration, + Mesh, + ProximityProperties, + InMemoryMesh, + Box, + Cylinder, +) +from pydrake.math import RigidTransform as DrakeRigidTransform +from pydrake.common import MemoryFile + +from pydrake.all import ( + MinimumDistanceLowerBoundConstraint, + MultibodyPlant, + Parser, + DiagramBuilder, + AddMultibodyPlantSceneGraph, + MeshcatVisualizer, + StartMeshcat, + RigidTransform, + Role, + RollPitchYaw, + RotationMatrix, + Solve, + InverseKinematics, + MeshcatVisualizerParams, + 
MinimumDistanceLowerBoundConstraint, + DoDifferentialInverseKinematics, + DifferentialInverseKinematicsStatus, + DifferentialInverseKinematicsParameters, + DepthImageToPointCloud, +) +from manipulation.scenarios import AddMultibodyTriad +from manipulation.meshcat_utils import ( # TODO(russt): switch to pydrake version + _MeshcatPoseSliders, +) +from manipulation.scenarios import AddIiwa, AddShape, AddWsg + +logger = setup_logger("visualization_script") + + +def create_point_cloud(color_img, depth_img, intrinsics): + """Create Open3D point cloud from RGB and depth images.""" + fx, fy, cx, cy = intrinsics + height, width = depth_img.shape + + o3d_intrinsics = o3d.camera.PinholeCameraIntrinsic(width, height, fx, fy, cx, cy) + color_o3d = o3d.geometry.Image(color_img) + depth_o3d = o3d.geometry.Image((depth_img * 1000).astype(np.uint16)) + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1000.0, convert_rgb_to_intensity=False + ) + + return o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, o3d_intrinsics) + + +def deserialize_point_cloud(data): + """Reconstruct Open3D PointCloud from serialized data.""" + if data is None: + return None + + pcd = o3d.geometry.PointCloud() + if "points" in data and data["points"]: + pcd.points = o3d.utility.Vector3dVector(np.array(data["points"])) + if "colors" in data and data["colors"]: + pcd.colors = o3d.utility.Vector3dVector(np.array(data["colors"])) + return pcd + + +def deserialize_voxel_grid(data): + """Reconstruct Open3D VoxelGrid from serialized data.""" + if data is None: + return None + + # Create a point cloud to convert to voxel grid + pcd = o3d.geometry.PointCloud() + voxel_size = data["voxel_size"] + origin = np.array(data["origin"]) + + # Create points from voxel indices + points = [] + colors = [] + for voxel in data["voxels"]: + # Each voxel is (i, j, k, r, g, b) + i, j, k, r, g, b = voxel + # Convert voxel grid index to 3D point + point = origin + np.array([i, j, k]) 
* voxel_size + points.append(point) + colors.append([r, g, b]) + + if points: + pcd.points = o3d.utility.Vector3dVector(np.array(points)) + pcd.colors = o3d.utility.Vector3dVector(np.array(colors)) + + # Convert to voxel grid + voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size) + return voxel_grid + + +def visualize_results(pickle_path="manipulation_results.pkl"): + """Load pickled results and visualize them.""" + print(f"Loading results from {pickle_path}...") + try: + with open(pickle_path, "rb") as f: + data = pickle.load(f) + + results = data["results"] + color_img = data["color_img"] + depth_img = data["depth_img"] + intrinsics = data["intrinsics"] + + print(f"Loaded results with keys: {list(results.keys())}") + + except FileNotFoundError: + print(f"Error: Pickle file {pickle_path} not found.") + print("Make sure to run test_manipulation_pipeline_single_frame_lcm.py first.") + return + except Exception as e: + print(f"Error loading pickle file: {e}") + return + + # Determine number of subplots based on what results we have + num_plots = 0 + plot_configs = [] + + if "detection_viz" in results and results["detection_viz"] is not None: + plot_configs.append(("detection_viz", "Object Detection")) + num_plots += 1 + + if "segmentation_viz" in results and results["segmentation_viz"] is not None: + plot_configs.append(("segmentation_viz", "Semantic Segmentation")) + num_plots += 1 + + if "pointcloud_viz" in results and results["pointcloud_viz"] is not None: + plot_configs.append(("pointcloud_viz", "All Objects Point Cloud")) + num_plots += 1 + + if "detected_pointcloud_viz" in results and results["detected_pointcloud_viz"] is not None: + plot_configs.append(("detected_pointcloud_viz", "Detection Objects Point Cloud")) + num_plots += 1 + + if "misc_pointcloud_viz" in results and results["misc_pointcloud_viz"] is not None: + plot_configs.append(("misc_pointcloud_viz", "Misc/Background Points")) + num_plots += 1 + + if "grasp_overlay" in 
results and results["grasp_overlay"] is not None: + plot_configs.append(("grasp_overlay", "Grasp Overlay")) + num_plots += 1 + + if num_plots == 0: + print("No visualization results to display") + return + + # Create subplot layout + if num_plots <= 3: + fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 5)) + else: + rows = 2 + cols = (num_plots + 1) // 2 + fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 5 * rows)) + + # Ensure axes is always a list for consistent indexing + if num_plots == 1: + axes = [axes] + elif num_plots > 2: + axes = axes.flatten() + + # Plot each result + for i, (key, title) in enumerate(plot_configs): + axes[i].imshow(results[key]) + axes[i].set_title(title) + axes[i].axis("off") + + # Hide unused subplots if any + if num_plots > 3: + for i in range(num_plots, len(axes)): + axes[i].axis("off") + + plt.tight_layout() + + # Save and show the plot + output_path = "visualization_results.png" + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f"Results visualization saved to: {output_path}") + + # Show plot live as well + plt.show(block=True) + plt.close() + + # Deserialize and reconstruct 3D objects from the pickle file + print("\nReconstructing 3D visualization objects from serialized data...") + + # Reconstruct full point cloud if available + full_pcd = None + if "full_pointcloud" in results and results["full_pointcloud"] is not None: + full_pcd = deserialize_point_cloud(results["full_pointcloud"]) + print(f"Reconstructed full point cloud with {len(np.asarray(full_pcd.points))} points") + + # Visualize reconstructed full point cloud + try: + visualize_pcd( + full_pcd, + window_name="Reconstructed Full Scene Point Cloud", + point_size=2.0, + show_coordinate_frame=True, + ) + except (KeyboardInterrupt, EOFError): + print("\nSkipping full point cloud visualization") + except Exception as e: + print(f"Error in point cloud visualization: {e}") + else: + print("No full point cloud available for visualization") + + 
# Reconstruct misc clusters if available + if "misc_clusters" in results and results["misc_clusters"]: + misc_clusters = [deserialize_point_cloud(cluster) for cluster in results["misc_clusters"]] + cluster_count = len(misc_clusters) + total_misc_points = sum(len(np.asarray(cluster.points)) for cluster in misc_clusters) + print(f"Reconstructed {cluster_count} misc clusters with {total_misc_points} total points") + + # Visualize reconstructed misc clusters + try: + visualize_clustered_point_clouds( + misc_clusters, + window_name="Reconstructed Misc/Background Clusters (DBSCAN)", + point_size=3.0, + show_coordinate_frame=True, + ) + except (KeyboardInterrupt, EOFError): + print("\nSkipping misc clusters visualization") + except Exception as e: + print(f"Error in misc clusters visualization: {e}") + else: + print("No misc clusters available for visualization") + + # Reconstruct voxel grid if available + if "misc_voxel_grid" in results and results["misc_voxel_grid"] is not None: + misc_voxel_grid = deserialize_voxel_grid(results["misc_voxel_grid"]) + if misc_voxel_grid: + voxel_count = len(misc_voxel_grid.get_voxels()) + print(f"Reconstructed voxel grid with {voxel_count} voxels") + + # Visualize reconstructed voxel grid + try: + visualize_voxel_grid( + misc_voxel_grid, + window_name="Reconstructed Misc/Background Voxel Grid", + show_coordinate_frame=True, + ) + except (KeyboardInterrupt, EOFError): + print("\nSkipping voxel grid visualization") + except Exception as e: + print(f"Error in voxel grid visualization: {e}") + else: + print("Failed to reconstruct voxel grid") + else: + print("No voxel grid available for visualization") + + +class DrakeKinematicsEnv: + def __init__( + self, + urdf_path: str, + kinematic_chain_joints: List[str], + links_to_ignore: Optional[List[str]] = None, + ): + self._resources_to_cleanup = [] + + # Register cleanup at exit + atexit.register(self.cleanup_resources) + + # Initialize tf resources once and reuse them + self.buffer = 
tf_lcm_py.Buffer(30.0) + self._resources_to_cleanup.append(self.buffer) + with self.safe_lcm_instance() as lcm_instance: + self.tf_lcm_instance = lcm_instance + self._resources_to_cleanup.append(self.tf_lcm_instance) + # Create TransformListener with our LCM instance and buffer + self.listener = tf_lcm_py.TransformListener(self.tf_lcm_instance, self.buffer) + self._resources_to_cleanup.append(self.listener) + + # Check if URDF file exists + if not os.path.exists(urdf_path): + raise FileNotFoundError(f"URDF file not found: {urdf_path}") + + # Drake utils initialization + self.meshcat = StartMeshcat() + print(f"Meshcat started at: {self.meshcat.web_url()}") + + self.urdf_path = urdf_path + self.builder = DiagramBuilder() + + self.plant, self.scene_graph = AddMultibodyPlantSceneGraph(self.builder, time_step=0.01) + self.parser = Parser(self.plant) + + # Load the robot URDF + print(f"Loading URDF from: {self.urdf_path}") + self.model_instances = self.parser.AddModelsFromUrl(f"file://{self.urdf_path}") + self.kinematic_chain_joints = kinematic_chain_joints + self.model_instance = self.model_instances[0] if self.model_instances else None + + if not self.model_instances: + raise RuntimeError("Failed to load any model instances from URDF") + + print(f"Loaded {len(self.model_instances)} model instances") + + # Set up collision filtering + if links_to_ignore: + bodies = [] + for link_name in links_to_ignore: + try: + body = self.plant.GetBodyByName(link_name) + if body is not None: + bodies.extend(self.plant.GetBodiesWeldedTo(body)) + except RuntimeError: + print(f"Warning: Link '{link_name}' not found in URDF") + + if bodies: + arm_geoms = self.plant.CollectRegisteredGeometries(bodies) + decl = CollisionFilterDeclaration().ExcludeWithin(arm_geoms) + manager = self.scene_graph.collision_filter_manager() + manager.Apply(decl) + + # Load and process point cloud data + self._load_and_process_point_clouds() + + # Finalize the plant before adding visualizer + 
self.plant.Finalize() + + # Print some debug info about the plant + print(f"Plant has {self.plant.num_bodies()} bodies") + print(f"Plant has {self.plant.num_joints()} joints") + for i in range(self.plant.num_joints()): + joint = self.plant.get_joint(JointIndex(i)) + print(f" Joint {i}: {joint.name()} (type: {joint.type_name()})") + + # Add visualizer + self.visualizer = MeshcatVisualizer.AddToBuilder( + self.builder, self.scene_graph, self.meshcat, params=MeshcatVisualizerParams() + ) + + # Build the diagram + self.diagram = self.builder.Build() + self.diagram_context = self.diagram.CreateDefaultContext() + self.plant_context = self.plant.GetMyContextFromRoot(self.diagram_context) + + # Set up joint indices + self.joint_indices = [] + for joint_name in self.kinematic_chain_joints: + try: + joint = self.plant.GetJointByName(joint_name) + if joint.num_positions() > 0: + start_index = joint.position_start() + for i in range(joint.num_positions()): + self.joint_indices.append(start_index + i) + print( + f"Added joint '{joint_name}' at indices {start_index} to {start_index + joint.num_positions() - 1}" + ) + except RuntimeError: + print(f"Warning: Joint '{joint_name}' not found in URDF.") + + # Get important frames/bodies + try: + self.end_effector_link = self.plant.GetBodyByName("link6") + self.end_effector_frame = self.plant.GetFrameByName("link6") + print("Found end effector link6") + except RuntimeError: + print("Warning: link6 not found") + self.end_effector_link = None + self.end_effector_frame = None + + try: + self.camera_link = self.plant.GetBodyByName("camera_center_link") + print("Found camera_center_link") + except RuntimeError: + print("Warning: camera_center_link not found") + self.camera_link = None + + # Set robot to a reasonable initial configuration + self._set_initial_configuration() + + # Force initial visualization update + self._update_visualization() + + print("Drake environment initialization complete!") + print(f"Visit {self.meshcat.web_url()} 
to see the visualization") + + def _load_and_process_point_clouds(self): + """Load point cloud data from pickle file and add to scene""" + pickle_path = "manipulation_results.pkl" + try: + with open(pickle_path, "rb") as f: + data = pickle.load(f) + + results = data["results"] + print(f"Loaded results with keys: {list(results.keys())}") + + except FileNotFoundError: + print(f"Warning: Pickle file {pickle_path} not found.") + print("Skipping point cloud loading.") + return + except Exception as e: + print(f"Warning: Error loading pickle file: {e}") + return + + full_detected_pcd = o3d.geometry.PointCloud() + for obj in results["detected_objects"]: + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(obj["point_cloud_numpy"]) + full_detected_pcd += pcd + + self.process_and_add_object_class("all_objects", results) + self.process_and_add_object_class("misc_clusters", results) + misc_clusters = results["misc_clusters"] + print(type(misc_clusters[0]["points"])) + print(np.asarray(misc_clusters[0]["points"]).shape) + + def process_and_add_object_class(self, object_key: str, results: dict): + # Process detected objects + if object_key in results: + detected_objects = results[object_key] + if detected_objects: + print(f"Processing {len(detected_objects)} {object_key}") + all_decomposed_meshes = [] + + transform = self.get_transform("world", "camera_center_link") + for i in range(len(detected_objects)): + try: + if object_key == "misc_clusters": + points = np.asarray(detected_objects[i]["points"]) + elif "point_cloud_numpy" in detected_objects[i]: + points = detected_objects[i]["point_cloud_numpy"] + elif ( + "point_cloud" in detected_objects[i] + and detected_objects[i]["point_cloud"] + ): + # Handle serialized point cloud + points = np.array(detected_objects[i]["point_cloud"]["points"]) + else: + print(f"Warning: No point cloud data found for object {i}") + continue + + if len(points) < 10: # Need more points for mesh reconstruction + print( + 
f"Warning: Object {i} has too few points ({len(points)}) for mesh reconstruction" + ) + continue + + # Swap y-z axes since this is a common problem + points = np.column_stack((points[:, 0], points[:, 2], -points[:, 1])) + # Transform points to world frame + points = self.transform_point_cloud_with_open3d(points, transform) + + # Use fast DBSCAN clustering + convex hulls approach + clustered_hulls = self._create_clustered_convex_hulls(points, i) + all_decomposed_meshes.extend(clustered_hulls) + + print( + f"Created {len(clustered_hulls)} clustered convex hulls for object {i}" + ) + + except Exception as e: + print(f"Warning: Failed to process object {i}: {e}") + + if all_decomposed_meshes: + self.register_convex_hulls_as_collision(all_decomposed_meshes, object_key) + print(f"Registered {len(all_decomposed_meshes)} total clustered convex hulls") + else: + print("Warning: No valid clustered convex hulls created from detected objects") + else: + print("No detected objects found") + + def _create_clustered_convex_hulls( + self, points: np.ndarray, object_id: int + ) -> List[o3d.geometry.TriangleMesh]: + """ + Create convex hulls from DBSCAN clusters of point cloud data. + Fast approach: cluster points, then convex hull each cluster. 
+
+        Args:
+            points: Nx3 numpy array of 3D points
+            object_id: ID for debugging/logging
+
+        Returns:
+            List of Open3D triangle meshes (convex hulls of clusters)
+        """
+        try:
+            # Create Open3D point cloud
+            pcd = o3d.geometry.PointCloud()
+            pcd.points = o3d.utility.Vector3dVector(points)
+
+            # Quick outlier removal (optional, can skip for speed)
+            if len(points) > 50:  # Only for larger point clouds
+                pcd, _ = pcd.remove_statistical_outlier(nb_neighbors=10, std_ratio=2.0)
+                points = np.asarray(pcd.points)
+
+            if len(points) < 4:
+                print(f"Warning: Too few points after filtering for object {object_id}")
+                return []
+
+            # Try multiple DBSCAN parameter combinations to find clusters
+            clusters = []
+            labels = None
+
+            # Calculate some basic statistics for parameter estimation
+            if len(points) > 10:
+                # Compute nearest neighbor distances for better eps estimation
+                distances = pcd.compute_nearest_neighbor_distance()
+                avg_nn_distance = np.mean(distances)
+                std_nn_distance = np.std(distances)
+
+                print(
+                    f"Object {object_id}: {len(points)} points, avg_nn_dist={avg_nn_distance:.4f}"
+                )
+
+            for i in range(20):
+                try:
+                    eps = avg_nn_distance * (2.0 + (i * 0.1))
+                    min_samples = 20
+                    labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_samples))
+                    unique_labels = np.unique(labels)
+                    clusters = unique_labels[unique_labels >= 0]  # Remove noise label (-1)
+
+                    noise_points = np.sum(labels == -1)
+                    clustered_points = len(points) - noise_points
+
+                    print(
+                        f"  Try {i + 1}: eps={eps:.4f}, min_samples={min_samples} → {len(clusters)} clusters, {clustered_points}/{len(points)} points clustered"
+                    )
+
+                    # Accept if we found clusters and most points are clustered
+                    if (
+                        len(clusters) > 0 and clustered_points >= len(points) * 0.95
+                    ):  # At least 95% of points clustered
+                        print(f"  ✓ Accepted parameter set {i + 1}")
+                        break
+
+                except Exception as e:
+                    print(
+                        f"  Try {i + 1}: Failed with eps={eps:.4f}, min_samples={min_samples}: {e}"
+                    )
+                    continue
+
+            if len(clusters) == 0 or
labels is None: + print( + f"No clusters found for object {object_id} after all attempts, using entire point cloud" + ) + # Fallback: use entire point cloud as single convex hull + hull_mesh, _ = pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + return [hull_mesh] + + print( + f"Found {len(clusters)} clusters for object {object_id} (eps={eps:.3f}, min_samples={min_samples})" + ) + + # Create convex hull for each cluster + convex_hulls = [] + for cluster_id in clusters: + try: + # Get points for this cluster + cluster_mask = labels == cluster_id + cluster_points = points[cluster_mask] + + if len(cluster_points) < 4: + print( + f"Skipping cluster {cluster_id} with only {len(cluster_points)} points" + ) + continue + + # Create point cloud for this cluster + cluster_pcd = o3d.geometry.PointCloud() + cluster_pcd.points = o3d.utility.Vector3dVector(cluster_points) + + # Compute convex hull + hull_mesh, _ = cluster_pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + + # Validate hull + if ( + len(np.asarray(hull_mesh.vertices)) >= 4 + and len(np.asarray(hull_mesh.triangles)) >= 4 + ): + convex_hulls.append(hull_mesh) + print( + f" Cluster {cluster_id}: {len(cluster_points)} points → convex hull with {len(np.asarray(hull_mesh.vertices))} vertices" + ) + else: + print(f" Skipping degenerate hull for cluster {cluster_id}") + + except Exception as e: + print(f"Error processing cluster {cluster_id} for object {object_id}: {e}") + + if not convex_hulls: + print( + f"No valid convex hulls created for object {object_id}, using entire point cloud" + ) + # Fallback: use entire point cloud as single convex hull + hull_mesh, _ = pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + return [hull_mesh] + + return convex_hulls + + except Exception as e: + print(f"Error in DBSCAN clustering for object {object_id}: {e}") + # Final fallback: single convex hull + try: + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points) + 
hull_mesh, _ = pcd.compute_convex_hull() + hull_mesh.compute_vertex_normals() + return [hull_mesh] + except: + return [] + + def _set_initial_configuration(self): + """Set the robot to a reasonable initial joint configuration""" + # Set all joints to zero initially + if self.joint_indices: + q = np.zeros(len(self.joint_indices)) + + # You can customize these values for a better initial pose + # For example, if you know good default joint angles: + if len(q) >= 6: # Assuming at least 6 DOF arm + q[1] = 0.0 # joint1 + q[2] = 0.0 # joint2 + q[3] = 0.0 # joint3 + q[4] = 0.0 # joint4 + q[5] = 0.0 # joint5 + q[6] = 0.0 # joint6 + + # Set the joint positions in the plant context + positions = self.plant.GetPositions(self.plant_context) + for i, joint_idx in enumerate(self.joint_indices): + if joint_idx < len(positions): + positions[joint_idx] = q[i] + + self.plant.SetPositions(self.plant_context, positions) + print(f"Set initial joint configuration: {q}") + else: + print("Warning: No joint indices found, using default configuration") + + def _update_visualization(self): + """Force update the visualization""" + try: + # Get the visualizer's context from the diagram context + visualizer_context = self.visualizer.GetMyContextFromRoot(self.diagram_context) + self.visualizer.ForcedPublish(visualizer_context) + print("Visualization updated successfully") + except Exception as e: + print(f"Error updating visualization: {e}") + + def set_joint_positions(self, joint_positions): + """Set specific joint positions and update visualization""" + if len(joint_positions) != len(self.joint_indices): + raise ValueError( + f"Expected {len(self.joint_indices)} joint positions, got {len(joint_positions)}" + ) + + positions = self.plant.GetPositions(self.plant_context) + for i, joint_idx in enumerate(self.joint_indices): + if joint_idx < len(positions): + positions[joint_idx] = joint_positions[i] + + self.plant.SetPositions(self.plant_context, positions) + self._update_visualization() + 
print(f"Updated joint positions: {joint_positions}") + + def register_convex_hulls_as_collision( + self, meshes: List[o3d.geometry.TriangleMesh], hull_type: str + ): + """Register convex hulls as collision and visual geometry""" + if not meshes: + print("No meshes to register") + return + + world = self.plant.world_body() + proximity = ProximityProperties() + + for i, mesh in enumerate(meshes): + try: + # Convert Open3D → numpy arrays → trimesh.Trimesh + vertices = np.asarray(mesh.vertices) + faces = np.asarray(mesh.triangles) + + if len(vertices) == 0 or len(faces) == 0: + print(f"Warning: Mesh {i} is empty, skipping") + continue + + tmesh = trimesh.Trimesh(vertices=vertices, faces=faces) + + # Export to OBJ in memory + tmesh_obj_blob = tmesh.export(file_type="obj") + mem_file = MemoryFile( + contents=tmesh_obj_blob, extension=".obj", filename_hint=f"convex_hull_{i}.obj" + ) + in_memory_mesh = InMemoryMesh() + in_memory_mesh.mesh_file = mem_file + drake_mesh = Mesh(in_memory_mesh, scale=1.0) + + pos = np.array([0.0, 0.0, 0.0]) + rpy = RollPitchYaw(0.0, 0.0, 0.0) + X_WG = DrakeRigidTransform(RotationMatrix(rpy), pos) + + # Register collision and visual geometry + self.plant.RegisterCollisionGeometry( + body=world, + X_BG=X_WG, + shape=drake_mesh, + name=f"convex_hull_collision_{i}_{hull_type}", + properties=proximity, + ) + self.plant.RegisterVisualGeometry( + body=world, + X_BG=X_WG, + shape=drake_mesh, + name=f"convex_hull_visual_{i}_{hull_type}", + diffuse_color=np.array([0.7, 0.5, 0.3, 0.8]), # Orange-ish color + ) + + print( + f"Registered convex hull {i} with {len(vertices)} vertices and {len(faces)} faces" + ) + + except Exception as e: + print(f"Warning: Failed to register mesh {i}: {e}") + + # Add a simple table for reference + try: + table_shape = Box(1.0, 1.0, 0.1) # Thinner table + table_pose = RigidTransform(p=[0.5, 0.0, -0.05]) # In front of robot + self.plant.RegisterCollisionGeometry( + world, table_pose, table_shape, "table_collision", proximity + 
) + self.plant.RegisterVisualGeometry( + world, table_pose, table_shape, "table_visual", [0.8, 0.6, 0.4, 1.0] + ) + print("Added reference table") + except Exception as e: + print(f"Warning: Failed to add table: {e}") + + def get_seeded_random_rgba(self, id: int): + np.random.seed(id) + return np.random.rand(4) + + @contextmanager + def safe_lcm_instance(self): + """Context manager for safely managing LCM instance lifecycle""" + lcm_instance = tf_lcm_py.LCM() + try: + yield lcm_instance + finally: + pass + + def cleanup_resources(self): + """Clean up resources before exiting""" + # Only clean up once when exiting + print("Cleaning up resources...") + # Force cleanup of resources in reverse order (last created first) + for resource in reversed(self._resources_to_cleanup): + try: + # For objects like TransformListener that might have a close or shutdown method + if hasattr(resource, "close"): + resource.close() + elif hasattr(resource, "shutdown"): + resource.shutdown() + + # Explicitly delete the resource + del resource + except Exception as e: + print(f"Error during cleanup: {e}") + + # Clear the resources list + self._resources_to_cleanup = [] + + def get_transform(self, target_frame, source_frame): + print("Getting transform from", source_frame, "to", target_frame) + attempts = 0 + max_attempts = 20 # Reduced from 120 to avoid long blocking + + while attempts < max_attempts: + try: + # Process LCM messages with error handling + if not self.tf_lcm_instance.handle_timeout(100): # 100ms timeout + # If handle_timeout returns false, we might need to re-check if LCM is still good + if not self.tf_lcm_instance.good(): + print("WARNING: LCM instance is no longer in a good state") + + # Get the most recent timestamp from the buffer instead of using current time + try: + timestamp = self.buffer.get_most_recent_timestamp() + if attempts % 10 == 0: + print(f"Using timestamp from buffer: {timestamp}") + except Exception as e: + # Fall back to current time if 
get_most_recent_timestamp fails + timestamp = datetime.now() + if not hasattr(timestamp, "timestamp"): + timestamp.timestamp = ( + lambda: time.mktime(timestamp.timetuple()) + timestamp.microsecond / 1e6 + ) + if attempts % 10 == 0: + print(f"Falling back to current time: {timestamp}") + + # Check if we can find the transform + if self.buffer.can_transform(target_frame, source_frame, timestamp): + # print(f"Found transform between '{target_frame}' and '{source_frame}'!") + + # Look up the transform with the timestamp from the buffer + transform = self.buffer.lookup_transform( + target_frame, + source_frame, + timestamp, + timeout=10.0, + time_tolerance=0.1, + lcm_module=lcm_msgs, + ) + + return transform + + # Increment counter and report status every 10 attempts + attempts += 1 + if attempts % 10 == 0: + print(f"Still waiting... (attempt {attempts}/{max_attempts})") + frames = self.buffer.get_all_frame_names() + if frames: + print(f"Frames received so far ({len(frames)} total):") + for frame in sorted(frames): + print(f" {frame}") + else: + print("No frames received yet") + + # Brief pause + time.sleep(0.5) + + except Exception as e: + print(f"Error during transform lookup: {e}") + attempts += 1 + time.sleep(1) # Longer pause after an error + + print(f"\nERROR: No transform found after {max_attempts} attempts") + return None + + def transform_point_cloud_with_open3d(self, points_np: np.ndarray, transform) -> np.ndarray: + """ + Transforms a point cloud using Open3D given a transform. + + Args: + points_np (np.ndarray): Nx3 array of 3D points. + transform: Transform from tf_lcm_py. + + Returns: + np.ndarray: Nx3 array of transformed 3D points. 
+ """ + if points_np.shape[1] != 3: + print("Input point cloud must have shape Nx3.") + return points_np + + # Convert transform to 4x4 numpy matrix + tf_matrix = np.eye(4) + + # Extract rotation quaternion components + qw = transform.transform.rotation.w + qx = transform.transform.rotation.x + qy = transform.transform.rotation.y + qz = transform.transform.rotation.z + + # Convert quaternion to rotation matrix + # Formula from: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation#Quaternion-derived_rotation_matrix + tf_matrix[0, 0] = 1 - 2 * qy * qy - 2 * qz * qz + tf_matrix[0, 1] = 2 * qx * qy - 2 * qz * qw + tf_matrix[0, 2] = 2 * qx * qz + 2 * qy * qw + + tf_matrix[1, 0] = 2 * qx * qy + 2 * qz * qw + tf_matrix[1, 1] = 1 - 2 * qx * qx - 2 * qz * qz + tf_matrix[1, 2] = 2 * qy * qz - 2 * qx * qw + + tf_matrix[2, 0] = 2 * qx * qz - 2 * qy * qw + tf_matrix[2, 1] = 2 * qy * qz + 2 * qx * qw + tf_matrix[2, 2] = 1 - 2 * qx * qx - 2 * qy * qy + + # Set translation + tf_matrix[0, 3] = transform.transform.translation.x + tf_matrix[1, 3] = transform.transform.translation.y + tf_matrix[2, 3] = transform.transform.translation.z + + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points_np) + + # Apply transformation + pcd.transform(tf_matrix) + + # Return as NumPy array + return np.asarray(pcd.points) + + +# Updated main function +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Visualize manipulation results") + parser.add_argument("--visualize-only", action="store_true", help="Only visualize results") + args = parser.parse_args() + + if args.visualize_only: + visualize_results() + exit(0) + + try: + # Then set up Drake environment + kinematic_chain_joints = [ + "pillar_platform_joint", + "joint1", + "joint2", + "joint3", + "joint4", + "joint5", + "joint6", + ] + + links_to_ignore = [ + "devkit_base_link", + "pillar_platform", + "piper_angled_mount", + 
"pan_tilt_base", + "pan_tilt_head", + "pan_tilt_pan", + "base_link", + "link1", + "link2", + "link3", + "link4", + "link5", + "link6", + ] + + urdf_path = "./assets/devkit_base_descr.urdf" + urdf_path = os.path.abspath(urdf_path) + + print(f"Attempting to load URDF from: {urdf_path}") + + env = DrakeKinematicsEnv(urdf_path, kinematic_chain_joints, links_to_ignore) + env.set_joint_positions([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + transform = env.get_transform("world", "camera_center_link") + print( + transform.transform.translation.x, + transform.transform.translation.y, + transform.transform.translation.z, + ) + print( + transform.transform.rotation.w, + transform.transform.rotation.x, + transform.transform.rotation.y, + transform.transform.rotation.z, + ) + + # Keep the visualization alive + print("\nVisualization is running. Press Ctrl+C to exit.") + while True: + time.sleep(1) + + except KeyboardInterrupt: + print("\nExiting...") + except Exception as e: + print(f"Error: {e}") + import traceback + + traceback.print_exc() diff --git a/tests/zed_neural_depth_demo.py b/tests/zed_neural_depth_demo.py new file mode 100755 index 0000000000..5edce9633f --- /dev/null +++ b/tests/zed_neural_depth_demo.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +ZED Camera Neural Depth Demo - OpenCV Live Visualization with Data Saving + +This script demonstrates live visualization of ZED camera RGB and depth data using OpenCV. +Press SPACE to save RGB and depth images to rgbd_data2 folder. +Press ESC or 'q' to quit. +""" + +import os +import sys +import time +import argparse +import logging +from pathlib import Path +import numpy as np +import cv2 +import yaml +from datetime import datetime +import open3d as o3d + +# Add the project root to Python path +sys.path.append(str(Path(__file__).parent.parent)) + +try: + import pyzed.sl as sl +except ImportError: + print("ERROR: ZED SDK not found. Please install the ZED SDK and pyzed Python package.") + print("Download from: https://www.stereolabs.com/developers/release/") + sys.exit(1) + +from dimos.hardware.zed_camera import ZEDCamera +from dimos.perception.pointcloud.utils import visualize_pcd, visualize_clustered_point_clouds + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +class ZEDLiveVisualizer: + """Live OpenCV visualization for ZED camera data with saving functionality.""" + + def __init__(self, camera, max_depth=10.0, output_dir="assets/rgbd_data2"): + self.camera = camera + self.max_depth = max_depth + self.output_dir = Path(output_dir) + self.save_counter = 0 + + # Store captured pointclouds for later visualization + self.captured_pointclouds = [] + + # Display settings for 480p + self.display_width = 640 + self.display_height = 480 + + # Create output directory structure + self.setup_output_directory() + + # Get camera info for saving + self.camera_info = camera.get_camera_info() + + # Save camera info files once + self.save_camera_info() + + # OpenCV window name (single window) + self.window_name = "ZED Camera - RGB + Depth" + + # Create window + cv2.namedWindow(self.window_name, cv2.WINDOW_AUTOSIZE) + + def 
setup_output_directory(self): + """Create the output directory structure.""" + self.output_dir.mkdir(exist_ok=True) + (self.output_dir / "color").mkdir(exist_ok=True) + (self.output_dir / "depth").mkdir(exist_ok=True) + (self.output_dir / "pointclouds").mkdir(exist_ok=True) + logger.info(f"Created output directory: {self.output_dir}") + + def save_camera_info(self): + """Save camera info YAML files with ZED camera parameters.""" + # Get current timestamp + now = datetime.now() + timestamp_sec = int(now.timestamp()) + timestamp_nanosec = int((now.timestamp() % 1) * 1e9) + + # Get camera resolution + resolution = self.camera_info.get("resolution", {}) + width = int(resolution.get("width", 1280)) + height = int(resolution.get("height", 720)) + + # Extract left camera parameters (for RGB) from already available camera_info + left_cam = self.camera_info.get("left_cam", {}) + # Convert numpy values to Python floats + fx = float(left_cam.get("fx", 749.341552734375)) + fy = float(left_cam.get("fy", 748.5587768554688)) + cx = float(left_cam.get("cx", 639.4312744140625)) + cy = float(left_cam.get("cy", 357.2478942871094)) + + # Build distortion coefficients from ZED format + # ZED provides k1, k2, p1, p2, k3 - convert to rational_polynomial format + k1 = float(left_cam.get("k1", 0.0)) + k2 = float(left_cam.get("k2", 0.0)) + p1 = float(left_cam.get("p1", 0.0)) + p2 = float(left_cam.get("p2", 0.0)) + k3 = float(left_cam.get("k3", 0.0)) + distortion = [k1, k2, p1, p2, k3, 0.0, 0.0, 0.0] + + # Create camera info structure with plain Python types + camera_info = { + "D": distortion, + "K": [fx, 0.0, cx, 0.0, fy, cy, 0.0, 0.0, 1.0], + "P": [fx, 0.0, cx, 0.0, 0.0, fy, cy, 0.0, 0.0, 0.0, 1.0, 0.0], + "R": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0], + "binning_x": 0, + "binning_y": 0, + "distortion_model": "rational_polynomial", + "header": { + "frame_id": "camera_color_optical_frame", + "stamp": {"nanosec": timestamp_nanosec, "sec": timestamp_sec}, + }, + "height": height, + 
"roi": {"do_rectify": False, "height": 0, "width": 0, "x_offset": 0, "y_offset": 0}, + "width": width, + } + + # Save color camera info + color_info_path = self.output_dir / "color_camera_info.yaml" + with open(color_info_path, "w") as f: + yaml.dump(camera_info, f, default_flow_style=False) + + # Save depth camera info (same as color for ZED) + depth_info_path = self.output_dir / "depth_camera_info.yaml" + with open(depth_info_path, "w") as f: + yaml.dump(camera_info, f, default_flow_style=False) + + logger.info(f"Saved camera info files to {self.output_dir}") + + def normalize_depth_for_display(self, depth_map): + """Normalize depth map for OpenCV visualization.""" + # Handle invalid values + valid_mask = (depth_map > 0) & np.isfinite(depth_map) + + if not np.any(valid_mask): + return np.zeros_like(depth_map, dtype=np.uint8) + + # Normalize to 0-255 for display + depth_norm = np.zeros_like(depth_map, dtype=np.float32) + depth_clipped = np.clip(depth_map[valid_mask], 0, self.max_depth) + depth_norm[valid_mask] = depth_clipped / self.max_depth + + # Convert to 8-bit and apply colormap + depth_8bit = (depth_norm * 255).astype(np.uint8) + depth_colored = cv2.applyColorMap(depth_8bit, cv2.COLORMAP_JET) + + return depth_colored + + def save_frame(self, rgb_img, depth_map): + """Save RGB, depth images, and pointcloud with proper naming convention.""" + # Generate filename with 5-digit zero-padding + filename = f"{self.save_counter:05d}.png" + pcd_filename = f"{self.save_counter:05d}.ply" + + # Save RGB image + rgb_path = self.output_dir / "color" / filename + cv2.imwrite(str(rgb_path), rgb_img) + + # Save depth image (convert to 16-bit for proper depth storage) + depth_path = self.output_dir / "depth" / filename + # Convert meters to millimeters and save as 16-bit + depth_mm = (depth_map * 1000).astype(np.uint16) + cv2.imwrite(str(depth_path), depth_mm) + + # Capture and save pointcloud + pcd = self.camera.capture_pointcloud() + if pcd is not None and 
len(np.asarray(pcd.points)) > 0: + pcd_path = self.output_dir / "pointclouds" / pcd_filename + o3d.io.write_point_cloud(str(pcd_path), pcd) + + # Store pointcloud for later visualization + self.captured_pointclouds.append(pcd) + + logger.info( + f"Saved frame {self.save_counter}: {rgb_path}, {depth_path}, and {pcd_path}" + ) + else: + logger.warning(f"Failed to capture pointcloud for frame {self.save_counter}") + logger.info(f"Saved frame {self.save_counter}: {rgb_path} and {depth_path}") + + self.save_counter += 1 + + def visualize_captured_pointclouds(self): + """Visualize all captured pointclouds using Open3D, one by one.""" + if not self.captured_pointclouds: + logger.info("No pointclouds captured to visualize") + return + + logger.info( + f"Visualizing {len(self.captured_pointclouds)} captured pointclouds one by one..." + ) + logger.info("Close each pointcloud window to proceed to the next one") + + for i, pcd in enumerate(self.captured_pointclouds): + if len(np.asarray(pcd.points)) > 0: + logger.info(f"Displaying pointcloud {i + 1}/{len(self.captured_pointclouds)}") + visualize_pcd(pcd, window_name=f"ZED Pointcloud {i + 1:05d}", point_size=2.0) + else: + logger.warning(f"Pointcloud {i + 1} is empty, skipping...") + + logger.info("Finished displaying all pointclouds") + + def update_display(self): + """Update the live display with new frames.""" + # Capture frame + left_img, right_img, depth_map = self.camera.capture_frame() + + if left_img is None or depth_map is None: + return False, None, None + + # Resize RGB to 480p + rgb_resized = cv2.resize(left_img, (self.display_width, self.display_height)) + + # Create depth visualization + depth_colored = self.normalize_depth_for_display(depth_map) + + # Resize depth to 480p + depth_resized = cv2.resize(depth_colored, (self.display_width, self.display_height)) + + # Add text overlays + text_color = (255, 255, 255) + font = cv2.FONT_HERSHEY_SIMPLEX + font_scale = 0.6 + thickness = 2 + + # Add title and instructions 
to RGB
+        cv2.putText(
+            rgb_resized, "RGB Camera Feed", (10, 25), font, font_scale, text_color, thickness
+        )
+        cv2.putText(
+            rgb_resized,
+            "SPACE: Save | ESC/Q: Quit",
+            (10, 50),
+            font,
+            font_scale - 0.1,
+            text_color,
+            thickness,
+        )
+
+        # Add title and stats to depth
+        cv2.putText(
+            depth_resized,
+            f"Depth Map (0-{self.max_depth}m)",
+            (10, 25),
+            font,
+            font_scale,
+            text_color,
+            thickness,
+        )
+        cv2.putText(
+            depth_resized,
+            f"Saved: {self.save_counter} frames",
+            (10, 50),
+            font,
+            font_scale - 0.1,
+            text_color,
+            thickness,
+        )
+
+        # Stack images horizontally
+        combined_display = np.hstack((rgb_resized, depth_resized))
+
+        # Display combined image
+        cv2.imshow(self.window_name, combined_display)
+
+        return True, left_img, depth_map
+
+    def handle_key_events(self, rgb_img, depth_map):
+        """Handle keyboard input."""
+        key = cv2.waitKey(1) & 0xFF
+
+        if key == ord(" "):  # Space key - save frame
+            if rgb_img is not None and depth_map is not None:
+                self.save_frame(rgb_img, depth_map)
+            return "save"
+        elif key == 27 or key == ord("q"):  # ESC or 'q' - quit
+            return "quit"
+
+        return "continue"
+
+    def cleanup(self):
+        """Clean up OpenCV windows."""
+        cv2.destroyAllWindows()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="ZED Camera Neural Depth Demo - OpenCV with Data Saving"
+    )
+    parser.add_argument("--camera-id", type=int, default=0, help="ZED camera ID (default: 0)")
+    parser.add_argument(
+        "--resolution",
+        type=str,
+        default="HD1080",
+        choices=["HD2K", "HD1080", "HD720", "VGA"],
+        help="Camera resolution (default: HD1080)",
+    )
+    parser.add_argument(
+        "--max-depth",
+        type=float,
+        default=10.0,
+        help="Maximum depth for visualization in meters (default: 10.0)",
+    )
+    parser.add_argument(
+        "--camera-fps", type=int, default=15, help="Camera capture FPS (default: 15)"
+    )
+    parser.add_argument(
+        "--depth-mode",
+        type=str,
+        default="NEURAL",
+        choices=["NEURAL", "NEURAL_PLUS"],
+        help="Depth mode (NEURAL=faster, 
NEURAL_PLUS=more accurate)", + ) + parser.add_argument( + "--output-dir", + type=str, + default="assets/rgbd_data2", + help="Output directory for saved data (default: rgbd_data2)", + ) + + args = parser.parse_args() + + # Map resolution string to ZED enum + resolution_map = { + "HD2K": sl.RESOLUTION.HD2K, + "HD1080": sl.RESOLUTION.HD1080, + "HD720": sl.RESOLUTION.HD720, + "VGA": sl.RESOLUTION.VGA, + } + + depth_mode_map = {"NEURAL": sl.DEPTH_MODE.NEURAL, "NEURAL_PLUS": sl.DEPTH_MODE.NEURAL_PLUS} + + try: + # Initialize ZED camera with neural depth + logger.info( + f"Initializing ZED camera with {args.depth_mode} depth processing at {args.camera_fps} FPS..." + ) + camera = ZEDCamera( + camera_id=args.camera_id, + resolution=resolution_map[args.resolution], + depth_mode=depth_mode_map[args.depth_mode], + fps=args.camera_fps, + ) + + # Open camera + with camera: + # Get camera information + info = camera.get_camera_info() + logger.info(f"Camera Model: {info.get('model', 'Unknown')}") + logger.info(f"Serial Number: {info.get('serial_number', 'Unknown')}") + logger.info(f"Firmware: {info.get('firmware', 'Unknown')}") + logger.info(f"Resolution: {info.get('resolution', {})}") + logger.info(f"Baseline: {info.get('baseline', 0):.3f}m") + + # Initialize visualizer + visualizer = ZEDLiveVisualizer( + camera, max_depth=args.max_depth, output_dir=args.output_dir + ) + + logger.info("Starting live visualization...") + logger.info("Controls:") + logger.info(" SPACE - Save current RGB and depth frame") + logger.info(" ESC/Q - Quit") + + frame_count = 0 + start_time = time.time() + + try: + while True: + loop_start = time.time() + + # Update display + success, rgb_img, depth_map = visualizer.update_display() + + if success: + frame_count += 1 + + # Handle keyboard events + action = visualizer.handle_key_events(rgb_img, depth_map) + + if action == "quit": + break + elif action == "save": + # Frame was saved, no additional action needed + pass + + # Print performance stats every 60 
frames + if frame_count % 60 == 0: + elapsed = time.time() - start_time + fps = frame_count / elapsed + logger.info( + f"Frame {frame_count} | FPS: {fps:.1f} | Saved: {visualizer.save_counter}" + ) + + # Small delay to prevent CPU overload + elapsed = time.time() - loop_start + min_frame_time = 1.0 / 60.0 # Cap at 60 FPS + if elapsed < min_frame_time: + time.sleep(min_frame_time - elapsed) + + except KeyboardInterrupt: + logger.info("Stopped by user") + + # Final stats + total_time = time.time() - start_time + if total_time > 0: + avg_fps = frame_count / total_time + logger.info( + f"Final stats: {frame_count} frames in {total_time:.1f}s (avg {avg_fps:.1f} FPS)" + ) + logger.info(f"Total saved frames: {visualizer.save_counter}") + + # Visualize captured pointclouds + visualizer.visualize_captured_pointclouds() + + except Exception as e: + logger.error(f"Error during execution: {e}") + raise + finally: + if "visualizer" in locals(): + visualizer.cleanup() + logger.info("Demo completed") + + +if __name__ == "__main__": + main() From 6c2ec82fb508d1d1093430bf885fdffb035dee2f Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Wed, 16 Jul 2025 12:46:41 -0700 Subject: [PATCH 2/7] added pytest to pointcloud filtering --- .../pointcloud/pointcloud_filtering.py | 314 ------------------ .../pointcloud/test_pointcloud_filtering.py | 259 +++++++++++++++ ...test_manipulation_pipeline_single_frame.py | 9 +- ..._manipulation_pipeline_single_frame_lcm.py | 6 +- 4 files changed, 263 insertions(+), 325 deletions(-) create mode 100644 dimos/perception/pointcloud/test_pointcloud_filtering.py diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py index 64fd8953a6..ea50cd03d2 100644 --- a/dimos/perception/pointcloud/pointcloud_filtering.py +++ b/dimos/perception/pointcloud/pointcloud_filtering.py @@ -358,317 +358,3 @@ def cleanup(self): """Clean up resources.""" if torch.cuda.is_available(): torch.cuda.empty_cache() - - -def 
create_test_pipeline(data_dir: str) -> tuple: - """ - Create a test pipeline with default settings. - - Args: - data_dir: Directory containing camera info files - - Returns: - Tuple of (filter_pipeline, color_info_path, depth_info_path) - """ - color_info_path = os.path.join(data_dir, "color_camera_info.yaml") - depth_info_path = os.path.join(data_dir, "depth_camera_info.yaml") - - # Default pipeline with subsampling disabled by default - filter_pipeline = PointcloudFiltering( - color_intrinsics=color_info_path, - depth_intrinsics=depth_info_path, - ) - - return filter_pipeline, color_info_path, depth_info_path - - -def load_test_images(data_dir: str) -> tuple: - """ - Load the first available test images from data directory. - - Args: - data_dir: Directory containing color and depth subdirectories - - Returns: - Tuple of (color_img, depth_img) or raises FileNotFoundError - """ - - def find_first_image(directory): - """Find the first image file in the given directory.""" - if not os.path.exists(directory): - return None - - image_extensions = [".jpg", ".jpeg", ".png", ".bmp"] - for filename in sorted(os.listdir(directory)): - if any(filename.lower().endswith(ext) for ext in image_extensions): - return os.path.join(directory, filename) - return None - - color_dir = os.path.join(data_dir, "color") - depth_dir = os.path.join(data_dir, "depth") - - color_img_path = find_first_image(color_dir) - depth_img_path = find_first_image(depth_dir) - - if not color_img_path or not depth_img_path: - raise FileNotFoundError(f"Could not find color or depth images in {data_dir}") - - # Load color image - color_img = cv2.imread(color_img_path) - if color_img is None: - raise FileNotFoundError(f"Could not load color image from {color_img_path}") - color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) - - # Load depth image - depth_img = cv2.imread(depth_img_path, cv2.IMREAD_UNCHANGED) - if depth_img is None: - raise FileNotFoundError(f"Could not load depth image from 
{depth_img_path}") - - # Convert depth to meters if needed - if depth_img.dtype == np.uint16: - depth_img = depth_img.astype(np.float32) / 1000.0 - - return color_img, depth_img - - -def run_segmentation(color_img: np.ndarray, device: str = "auto") -> List[ObjectData]: - """ - Run segmentation on color image and return ObjectData objects. - - Args: - color_img: RGB color image - device: Device to use ('auto', 'cuda', or 'cpu') - - Returns: - List of ObjectData objects with segmentation results - """ - if device == "auto": - device = "cuda" if torch.cuda.is_available() else "cpu" - - # Import here to avoid circular imports - from dimos.perception.segmentation import Sam2DSegmenter - - segmenter = Sam2DSegmenter( - model_path="FastSAM-s.pt", device=device, use_tracker=False, use_analyzer=False - ) - - try: - masks, bboxes, target_ids, probs, names = segmenter.process_image(np.array(color_img)) - - # Create ObjectData objects - objects = [] - for i in range(len(bboxes)): - obj_data: ObjectData = { - "object_id": target_ids[i] if i < len(target_ids) else i, - "bbox": bboxes[i], - "depth": -1.0, # Will be populated by pointcloud filtering - "confidence": probs[i] if i < len(probs) else 1.0, - "class_id": i, - "label": names[i] if i < len(names) else "", - "segmentation_mask": masks[i].cpu().numpy() - if hasattr(masks[i], "cpu") - else masks[i], - "position": Vector(0, 0, 0), # Will be populated by pointcloud filtering - "rotation": Vector(0, 0, 0), # Will be populated by pointcloud filtering - "size": { - "width": 0.0, - "height": 0.0, - "depth": 0.0, - }, # Will be populated by pointcloud filtering - } - objects.append(obj_data) - - return objects - - finally: - segmenter.cleanup() - - -def visualize_results(objects: List[ObjectData]): - """ - Visualize point cloud filtering results with 3D bounding boxes. 
- - Args: - objects: List of ObjectData with point clouds - """ - all_pcds = [] - - for obj in objects: - if "point_cloud" in obj and obj["point_cloud"] is not None: - pcd = obj["point_cloud"] - all_pcds.append(pcd) - - # Draw 3D bounding box if position, rotation, and size are available - if ( - "position" in obj - and "rotation" in obj - and "size" in obj - and obj["position"] is not None - and obj["rotation"] is not None - and obj["size"] is not None - ): - try: - position = obj["position"] - rotation = obj["rotation"] - size = obj["size"] - - # Convert position to numpy array - if hasattr(position, "x"): # Vector object - center = np.array([position.x, position.y, position.z]) - else: # Dictionary - center = np.array([position["x"], position["y"], position["z"]]) - - # Convert rotation (euler angles) to rotation matrix - if hasattr(rotation, "x"): # Vector object (roll, pitch, yaw) - roll, pitch, yaw = rotation.x, rotation.y, rotation.z - else: # Dictionary - roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"] - - # Create rotation matrix from euler angles (ZYX order) - # Roll (X), Pitch (Y), Yaw (Z) - cos_r, sin_r = np.cos(roll), np.sin(roll) - cos_p, sin_p = np.cos(pitch), np.sin(pitch) - cos_y, sin_y = np.cos(yaw), np.sin(yaw) - - # Rotation matrix for ZYX euler angles - R = np.array( - [ - [ - cos_y * cos_p, - cos_y * sin_p * sin_r - sin_y * cos_r, - cos_y * sin_p * cos_r + sin_y * sin_r, - ], - [ - sin_y * cos_p, - sin_y * sin_p * sin_r + cos_y * cos_r, - sin_y * sin_p * cos_r - cos_y * sin_r, - ], - [-sin_p, cos_p * sin_r, cos_p * cos_r], - ] - ) - - # Get dimensions - width = size.get("width", 0.1) - height = size.get("height", 0.1) - depth = size.get("depth", 0.1) - extent = np.array([width, height, depth]) - - # Create oriented bounding box - obb = o3d.geometry.OrientedBoundingBox(center=center, R=R, extent=extent) - obb.color = [1, 0, 0] # Red bounding boxes - all_pcds.append(obb) - - except Exception as e: - print( - f"Warning: 
Failed to create bounding box for object {obj.get('object_id', 'unknown')}: {e}" - ) - - # Add coordinate frame - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - all_pcds.append(coordinate_frame) - - # Visualize - if all_pcds: - o3d.visualization.draw_geometries( - all_pcds, - window_name="Filtered Point Clouds with 3D Bounding Boxes", - width=1280, - height=720, - ) - - -def main(): - """Main function to demonstrate the PointcloudFiltering pipeline.""" - parser = argparse.ArgumentParser(description="Point cloud filtering pipeline demonstration") - parser.add_argument( - "--save-pickle", - type=str, - help="Save generated ObjectData to pickle file (provide filename)", - ) - parser.add_argument( - "--data-dir", type=str, help="Directory containing RGBD data (default: auto-detect)" - ) - args = parser.parse_args() - - try: - # Setup paths - if args.data_dir: - data_dir = args.data_dir - else: - script_dir = os.path.dirname(os.path.abspath(__file__)) - dimos_dir = os.path.abspath(os.path.join(script_dir, "../../../")) - data_dir = os.path.join(dimos_dir, "assets/rgbd_data") - - # Load test data - print("Loading test images...") - color_img, depth_img = load_test_images(data_dir) - print(f"Loaded images: color {color_img.shape}, depth {depth_img.shape}") - - # Run segmentation - print("Running segmentation...") - objects = run_segmentation(color_img) - print(f"Found {len(objects)} objects") - - # Create filtering pipeline - print("Creating filtering pipeline...") - filter_pipeline, _, _ = create_test_pipeline(data_dir) - - # Process images - print("Processing point clouds...") - updated_objects = filter_pipeline.process_images(color_img, depth_img, objects) - - # Print results - print(f"Processing complete:") - print(f" Objects processed: {len(updated_objects)}/{len(objects)}") - - # Print per-object stats - for i, obj in enumerate(updated_objects): - if "point_cloud" in obj and obj["point_cloud"] is not None: - num_points = 
len(np.asarray(obj["point_cloud"].points)) - position = obj.get("position", Vector(0, 0, 0)) - size = obj.get("size", {}) - print(f" Object {i + 1} (ID: {obj['object_id']}): {num_points} points") - print(f" Position: ({position.x:.2f}, {position.y:.2f}, {position.z:.2f})") - print( - f" Size: {size.get('width', 0):.3f} x {size.get('height', 0):.3f} x {size.get('depth', 0):.3f}" - ) - - # Save to pickle file if requested - if args.save_pickle: - pickle_path = args.save_pickle - if not pickle_path.endswith(".pkl"): - pickle_path += ".pkl" - - print(f"Saving ObjectData to {pickle_path}...") - - # Create serializable objects (exclude Open3D point clouds) - serializable_objects = [] - for obj in updated_objects: - obj_copy = obj.copy() - # Remove the Open3D point cloud object (can't be pickled) - if "point_cloud" in obj_copy: - del obj_copy["point_cloud"] - serializable_objects.append(obj_copy) - - with open(pickle_path, "wb") as f: - pickle.dump(serializable_objects, f) - - print(f"Successfully saved {len(serializable_objects)} objects to {pickle_path}") - print("To load: objects = pickle.load(open('filename.pkl', 'rb'))") - print( - "Note: Open3D point clouds excluded - use point_cloud_numpy and colors_numpy for processing" - ) - - # Visualize results - print("Visualizing results...") - visualize_results(updated_objects) - - except Exception as e: - print(f"Error: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/pointcloud/test_pointcloud_filtering.py b/dimos/perception/pointcloud/test_pointcloud_filtering.py new file mode 100644 index 0000000000..4b4e5c7c4f --- /dev/null +++ b/dimos/perception/pointcloud/test_pointcloud_filtering.py @@ -0,0 +1,259 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import cv2 +import numpy as np +import pytest +import open3d as o3d + +from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering +from dimos.perception.pointcloud.utils import load_camera_matrix_from_yaml +from dimos.types.manipulation import ObjectData + + +class TestPointcloudFiltering: + def test_pointcloud_filtering_initialization(self): + """Test PointcloudFiltering initializes correctly with default parameters.""" + try: + filtering = PointcloudFiltering() + assert filtering is not None + assert filtering.color_weight == 0.3 + assert filtering.enable_statistical_filtering == True + assert filtering.enable_radius_filtering == True + assert filtering.enable_subsampling == True + except Exception as e: + pytest.skip(f"Skipping test due to initialization error: {e}") + + def test_pointcloud_filtering_with_custom_params(self): + """Test PointcloudFiltering with custom parameters.""" + try: + filtering = PointcloudFiltering( + color_weight=0.5, + enable_statistical_filtering=False, + enable_radius_filtering=False, + voxel_size=0.01, + max_num_objects=5, + ) + assert filtering.color_weight == 0.5 + assert filtering.enable_statistical_filtering == False + assert filtering.enable_radius_filtering == False + assert filtering.voxel_size == 0.01 + assert filtering.max_num_objects == 5 + except Exception as e: + pytest.skip(f"Skipping test due to initialization error: {e}") + + def test_pointcloud_filtering_process_images(self): + """Test PointcloudFiltering can process RGB-D images and return filtered point clouds.""" + try: + 
# Import data inside method to avoid pytest fixture confusion + from dimos.utils.data import get_data + + # Load test RGB-D data + data_dir = get_data("rgbd_frames") + + # Load first frame + color_path = os.path.join(data_dir, "color", "00000.png") + depth_path = os.path.join(data_dir, "depth", "00000.png") + intrinsics_path = os.path.join(data_dir, "color_camera_info.yaml") + + assert os.path.exists(color_path), f"Color image not found: {color_path}" + assert os.path.exists(depth_path), f"Depth image not found: {depth_path}" + assert os.path.exists(intrinsics_path), f"Intrinsics file not found: {intrinsics_path}" + + # Load images + color_img = cv2.imread(color_path) + color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) + + depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + if depth_img.dtype == np.uint16: + depth_img = depth_img.astype(np.float32) / 1000.0 + + # Load camera intrinsics + camera_matrix = load_camera_matrix_from_yaml(intrinsics_path) + if camera_matrix is None: + pytest.skip("Failed to load camera intrinsics") + + # Create mock objects with segmentation masks + height, width = color_img.shape[:2] + + # Create simple rectangular masks for testing + mock_objects = [] + + # Object 1: Top-left quadrant + mask1 = np.zeros((height, width), dtype=bool) + mask1[height // 4 : height // 2, width // 4 : width // 2] = True + + obj1: ObjectData = { + "object_id": 1, + "confidence": 0.9, + "bbox": [width // 4, height // 4, width // 2, height // 2], + "segmentation_mask": mask1, + "name": "test_object_1", + } + mock_objects.append(obj1) + + # Object 2: Bottom-right quadrant + mask2 = np.zeros((height, width), dtype=bool) + mask2[height // 2 : 3 * height // 4, width // 2 : 3 * width // 4] = True + + obj2: ObjectData = { + "object_id": 2, + "confidence": 0.8, + "bbox": [width // 2, height // 2, 3 * width // 4, 3 * height // 4], + "segmentation_mask": mask2, + "name": "test_object_2", + } + mock_objects.append(obj2) + + # Initialize filtering with 
intrinsics + filtering = PointcloudFiltering( + color_intrinsics=camera_matrix, + depth_intrinsics=camera_matrix, + enable_statistical_filtering=False, # Disable for faster testing + enable_radius_filtering=False, # Disable for faster testing + voxel_size=0.01, # Larger voxel for faster processing + ) + + # Process images + results = filtering.process_images(color_img, depth_img, mock_objects) + + print( + f"Processing results - Input objects: {len(mock_objects)}, Output objects: {len(results)}" + ) + + # Verify results + assert isinstance(results, list), "Results should be a list" + assert len(results) <= len(mock_objects), "Should not return more objects than input" + + # Check each result object + for i, result in enumerate(results): + print(f"Object {i}: {result.get('name', 'unknown')}") + + # Verify required fields exist + assert "point_cloud" in result, "Result should contain point_cloud" + assert "color" in result, "Result should contain color" + assert "point_cloud_numpy" in result, "Result should contain point_cloud_numpy" + + # Verify point cloud is valid Open3D object + pcd = result["point_cloud"] + assert isinstance(pcd, o3d.geometry.PointCloud), ( + "point_cloud should be Open3D PointCloud" + ) + + # Verify numpy arrays + points_array = result["point_cloud_numpy"] + assert isinstance(points_array, np.ndarray), ( + "point_cloud_numpy should be numpy array" + ) + assert points_array.shape[1] == 3, "Point array should have 3 columns (x,y,z)" + assert points_array.dtype == np.float32, "Point array should be float32" + + # Verify color + color = result["color"] + assert isinstance(color, np.ndarray), "Color should be numpy array" + assert color.shape == (3,), "Color should be RGB triplet" + assert color.dtype == np.uint8, "Color should be uint8" + + # Check if 3D information was added (when enough points for cuboid fitting) + points = np.asarray(pcd.points) + if len(points) >= filtering.min_points_for_cuboid: + if "position" in result: + assert "rotation" 
in result, "Should have rotation if position exists" + assert "size" in result, "Should have size if position exists" + + # Verify position format + from dimos.types.vector import Vector + + position = result["position"] + assert isinstance(position, Vector), "Position should be Vector" + + # Verify size format + size = result["size"] + assert isinstance(size, dict), "Size should be dict" + assert "width" in size and "height" in size and "depth" in size + + print(f" - Points: {len(points)}") + print(f" - Color: {color}") + if "position" in result: + print(f" - Position: {result['position']}") + print(f" - Size: {result['size']}") + + # Test full point cloud access + full_pcd = filtering.get_full_point_cloud() + if full_pcd is not None: + assert isinstance(full_pcd, o3d.geometry.PointCloud), ( + "Full point cloud should be Open3D PointCloud" + ) + full_points = np.asarray(full_pcd.points) + print(f"Full point cloud points: {len(full_points)}") + + print("All pointcloud filtering tests passed!") + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + def test_pointcloud_filtering_empty_objects(self): + """Test PointcloudFiltering with empty object list.""" + try: + from dimos.utils.data import get_data + + # Load test data + data_dir = get_data("rgbd_frames") + color_path = os.path.join(data_dir, "color", "00000.png") + depth_path = os.path.join(data_dir, "depth", "00000.png") + + if not (os.path.exists(color_path) and os.path.exists(depth_path)): + pytest.skip("Test images not found") + + color_img = cv2.imread(color_path) + color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) + depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) + if depth_img.dtype == np.uint16: + depth_img = depth_img.astype(np.float32) / 1000.0 + + filtering = PointcloudFiltering() + + # Test with empty object list + results = filtering.process_images(color_img, depth_img, []) + + assert isinstance(results, list), "Results should be a list" + assert len(results) == 
0, "Should return empty list for empty input" + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + def test_color_generation_consistency(self): + """Test that color generation is consistent for the same object ID.""" + try: + filtering = PointcloudFiltering() + + # Test color generation consistency + color1 = filtering.generate_color_from_id(42) + color2 = filtering.generate_color_from_id(42) + color3 = filtering.generate_color_from_id(43) + + assert np.array_equal(color1, color2), "Same ID should generate same color" + assert not np.array_equal(color1, color3), ( + "Different IDs should generate different colors" + ) + assert color1.shape == (3,), "Color should be RGB triplet" + assert color1.dtype == np.uint8, "Color should be uint8" + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/test_manipulation_pipeline_single_frame.py b/tests/test_manipulation_pipeline_single_frame.py index 1fc42d0810..061eb9035e 100644 --- a/tests/test_manipulation_pipeline_single_frame.py +++ b/tests/test_manipulation_pipeline_single_frame.py @@ -15,12 +15,12 @@ """Test manipulation processor with direct visualization and grasp data output.""" import os -import sys import cv2 import numpy as np -import time import argparse import matplotlib +import tests.test_header +from dimos.utils.data import get_data # Try to use TkAgg backend for live display, fallback to Agg if not available try: @@ -32,9 +32,6 @@ matplotlib.use("Agg") # Fallback to non-interactive import matplotlib.pyplot as plt import open3d as o3d -from typing import Dict, List - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid from dimos.perception.manip_aio_processer import ManipulationProcessor @@ -114,7 +111,7 @@ def run_processor(color_img, depth_img, 
intrinsics, grasp_server_url=None): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--data-dir", default="assets/rgbd_data") + parser.add_argument("--data-dir", default=get_data("rgbd_frames")) parser.add_argument("--wait-time", type=float, default=5.0) parser.add_argument( "--grasp-server-url", diff --git a/tests/test_manipulation_pipeline_single_frame_lcm.py b/tests/test_manipulation_pipeline_single_frame_lcm.py index d8090575eb..635f82c9c9 100644 --- a/tests/test_manipulation_pipeline_single_frame_lcm.py +++ b/tests/test_manipulation_pipeline_single_frame_lcm.py @@ -18,13 +18,11 @@ import sys import cv2 import numpy as np -import time import argparse import threading import pickle import matplotlib -import json -import copy +import tests.test_header # Try to use TkAgg backend for live display, fallback to Agg if not available try: @@ -43,8 +41,6 @@ from lcm_msgs.sensor_msgs import Image as LCMImage from lcm_msgs.sensor_msgs import CameraInfo as LCMCameraInfo -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid from dimos.perception.manip_aio_processer import ManipulationProcessor from dimos.perception.grasp_generation.utils import visualize_grasps_3d From 05c1b4ff08daad2911ab7e854688089d336c75ed Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Wed, 16 Jul 2025 14:24:39 -0700 Subject: [PATCH 3/7] Added RGB-D to LFS --- data/.lfs/rgbd_frames.tar.gz | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 data/.lfs/rgbd_frames.tar.gz diff --git a/data/.lfs/rgbd_frames.tar.gz b/data/.lfs/rgbd_frames.tar.gz new file mode 100644 index 0000000000..8081c76961 --- /dev/null +++ b/data/.lfs/rgbd_frames.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381b9fd296a885f5211a668df16c68581d2aee458c8734c3256a7461f0decccd +size 948391033 From 37a5621db0510f45981ba86c1811393784d8e06b Mon Sep 17 00:00:00 
2001 From: alexlin2 Date: Wed, 16 Jul 2025 14:25:46 -0700 Subject: [PATCH 4/7] Don't gitignore /data/.lfs .tar.gz files --- .gitignore | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 6378d36468..f5c93cf65f 100644 --- a/.gitignore +++ b/.gitignore @@ -37,7 +37,8 @@ package-lock.json # Ignore build artifacts dist/ -# Ignore data and modelfiles -data/ +# Ignore data directory but keep .lfs subdirectory +data/* +!data/.lfs/ FastSAM-x.pt yolo11n.pt From d20e5f9695e08f79b0c85b46d8003ed6aa921d0c Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 17 Jul 2025 23:08:11 -0700 Subject: [PATCH 5/7] Removed unused pointcloud utilities --- .../pointcloud/pointcloud_filtering.py | 1 - dimos/perception/pointcloud/utils.py | 285 ------------------ 2 files changed, 286 deletions(-) diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py index ea50cd03d2..3de2f3ae6a 100644 --- a/dimos/perception/pointcloud/pointcloud_filtering.py +++ b/dimos/perception/pointcloud/pointcloud_filtering.py @@ -26,7 +26,6 @@ from dimos.perception.pointcloud.utils import ( load_camera_matrix_from_yaml, create_point_cloud_and_extract_masks, - o3d_point_cloud_to_numpy, ) from dimos.perception.pointcloud.cuboid_fit import fit_cuboid diff --git a/dimos/perception/pointcloud/utils.py b/dimos/perception/pointcloud/utils.py index 16cb0e0800..0813c2ca0e 100644 --- a/dimos/perception/pointcloud/utils.py +++ b/dimos/perception/pointcloud/utils.py @@ -28,62 +28,6 @@ from scipy.spatial import cKDTree -def depth_to_point_cloud(depth_image, camera_intrinsics, subsample_factor=4): - """ - Convert depth image to point cloud using camera intrinsics. - Subsamples points to reduce density. 
- - Args: - depth_image: HxW depth image in meters - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - subsample_factor: Factor to subsample points (higher = fewer points) - - Returns: - Nx3 array of 3D points - """ - # Filter out inf and nan values from depth image - depth_filtered = depth_image.copy() - - # Create mask for valid depth values (finite, positive, non-zero) - valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) - - # Set invalid values to 0 - depth_filtered[~valid_mask] = 0.0 - - # Extract camera parameters - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - else: - fx = camera_intrinsics[0, 0] - fy = camera_intrinsics[1, 1] - cx = camera_intrinsics[0, 2] - cy = camera_intrinsics[1, 2] - - # Create pixel coordinate grid - rows, cols = depth_filtered.shape - x_grid, y_grid = np.meshgrid( - np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor) - ) - - # Flatten grid and depth - x = x_grid.flatten() - y = y_grid.flatten() - z = depth_filtered[y_grid, x_grid].flatten() - - # Remove points with invalid depth (after filtering, this catches zeros) - valid = z > 0 - x = x[valid] - y = y[valid] - z = z[valid] - - # Convert to 3D points - X = (x - cx) * z / fx - Y = (y - cy) * z / fy - Z = z - - return np.column_stack([X, Y, Z]) - - def load_camera_matrix_from_yaml( camera_info: Optional[Union[str, List[float], np.ndarray, dict]], ) -> Optional[np.ndarray]: @@ -251,123 +195,6 @@ def create_o3d_point_cloud_from_rgbd( return pcd -def o3d_point_cloud_to_numpy(pcd: o3d.geometry.PointCloud) -> np.ndarray: - """ - Convert Open3D point cloud to numpy array of XYZRGB points. 
- - Args: - pcd: Open3D point cloud object - - Returns: - Nx6 array of XYZRGB points (empty array if no points) - """ - points = np.asarray(pcd.points) - if len(points) == 0: - return np.zeros((0, 6), dtype=np.float32) - - # Get colors if available - if pcd.has_colors(): - colors = np.asarray(pcd.colors) * 255.0 # Convert from [0,1] to [0,255] - return np.column_stack([points, colors]).astype(np.float32) - else: - # No colors available, return points with zero colors - zeros = np.zeros((len(points), 3), dtype=np.float32) - return np.column_stack([points, zeros]).astype(np.float32) - - -def numpy_to_o3d_point_cloud(points_rgb: np.ndarray) -> o3d.geometry.PointCloud: - """ - Convert numpy array of XYZRGB points to Open3D point cloud. - - Args: - points_rgb: Nx6 array of XYZRGB points or Nx3 array of XYZ points - - Returns: - Open3D point cloud object - - Raises: - ValueError: If array shape is invalid - """ - if len(points_rgb) == 0: - return o3d.geometry.PointCloud() - - if points_rgb.shape[1] < 3: - raise ValueError( - f"points_rgb must have at least 3 columns (XYZ), got {points_rgb.shape[1]}" - ) - - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points_rgb[:, :3]) - - # Add colors if available - if points_rgb.shape[1] >= 6: - colors = points_rgb[:, 3:6] / 255.0 # Convert from [0,255] to [0,1] - colors = np.clip(colors, 0.0, 1.0) # Ensure valid range - pcd.colors = o3d.utility.Vector3dVector(colors) - - return pcd - - -def create_masked_point_cloud(color_img, depth_img, mask, intrinsic, depth_scale=1.0): - """ - Create a point cloud for a masked region of RGBD data using Open3D. 
- - Args: - color_img: RGB image (H, W, 3) - depth_img: Depth image (H, W) - mask: Boolean mask of the same size as color_img and depth_img - intrinsic: Camera intrinsic matrix (3x3 numpy array) - depth_scale: Scale factor to convert depth to meters - - Returns: - Open3D point cloud object for the masked region - """ - # Filter out inf and nan values from depth image - depth_filtered = depth_img.copy() - - # Create mask for valid depth values (finite, positive, non-zero) - valid_depth_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) - - # Set invalid values to 0 - depth_filtered[~valid_depth_mask] = 0.0 - - # Create masked color and depth images - masked_color = color_img.copy() - masked_depth = depth_filtered.copy() - - # Apply mask - if not mask.shape[:2] == color_img.shape[:2]: - raise ValueError(f"Mask shape {mask.shape} doesn't match image shape {color_img.shape[:2]}") - - # Create a boolean mask that is properly expanded for the RGB channels - # For RGB image, we need to properly broadcast the mask to all 3 channels - if len(color_img.shape) == 3 and color_img.shape[2] == 3: - # Properly broadcast mask to match the RGB dimensions - mask_rgb = np.broadcast_to(mask[:, :, np.newaxis], color_img.shape) - masked_color[~mask_rgb] = 0 - else: - # For grayscale images - masked_color[~mask] = 0 - - # Apply mask to depth image - masked_depth[~mask] = 0 - - # Create point cloud - pcd = create_o3d_point_cloud_from_rgbd(masked_color, masked_depth, intrinsic, depth_scale) - - # Remove points with coordinates at origin (0,0,0) which are likely from masked out regions - points = np.asarray(pcd.points) - if len(points) > 0: - # Find points that are not at origin - dist_from_origin = np.sum(points**2, axis=1) - valid_indices = dist_from_origin > 1e-6 - - # Filter points and colors - pcd = pcd.select_by_index(np.where(valid_indices)[0]) - - return pcd - - def create_point_cloud_and_extract_masks( color_img: np.ndarray, depth_img: np.ndarray, @@ -437,89 +264,6 @@ def 
create_point_cloud_and_extract_masks( return full_pcd, masked_pcds -def extract_masked_point_cloud_efficient( - full_pcd: o3d.geometry.PointCloud, depth_img: np.ndarray, mask: np.ndarray -) -> o3d.geometry.PointCloud: - """ - Extract a masked region from an existing point cloud efficiently. - - This assumes the point cloud was created from the given depth image. - Use this when you have a pre-computed full point cloud and want to extract - individual masked regions. - - Args: - full_pcd: Complete Open3D point cloud - depth_img: Depth image used to create the point cloud (H, W) - mask: Boolean mask (H, W) - - Returns: - Open3D point cloud for the masked region - - Raises: - ValueError: If mask shape doesn't match depth image - """ - if mask.shape != depth_img.shape: - raise ValueError( - f"Mask shape {mask.shape} doesn't match depth image shape {depth_img.shape}" - ) - - # Early return if no points in full point cloud - if len(np.asarray(full_pcd.points)) == 0: - return o3d.geometry.PointCloud() - - # Get valid depth mask - valid_depth = depth_img.flatten() > 0 - mask_flat = mask.flatten() - - # Find pixels that are both valid and in the mask - valid_mask_indices = mask_flat & valid_depth - - # Get indices of valid points - point_indices = np.where(valid_mask_indices[valid_depth])[0] - - # Extract the masked point cloud - if len(point_indices) > 0: - return full_pcd.select_by_index(point_indices) - else: - return o3d.geometry.PointCloud() - - -def segment_and_remove_plane(pcd, distance_threshold=0.02, ransac_n=3, num_iterations=1000): - """ - Segment the dominant plane from a point cloud using RANSAC and remove it. - Often used to remove table tops, floors, walls, or other planar surfaces. 
- - Args: - pcd: Open3D point cloud object - distance_threshold: Maximum distance a point can be from the plane to be considered an inlier (in meters) - ransac_n: Number of points to sample for each RANSAC iteration - num_iterations: Number of RANSAC iterations - - Returns: - Open3D point cloud with the dominant plane removed - """ - # Make a copy of the input point cloud to avoid modifying the original - pcd_filtered = o3d.geometry.PointCloud() - pcd_filtered.points = o3d.utility.Vector3dVector(np.asarray(pcd.points)) - if pcd.has_colors(): - pcd_filtered.colors = o3d.utility.Vector3dVector(np.asarray(pcd.colors)) - if pcd.has_normals(): - pcd_filtered.normals = o3d.utility.Vector3dVector(np.asarray(pcd.normals)) - - # Check if point cloud has enough points - if len(pcd_filtered.points) < ransac_n: - return pcd_filtered - - # Run RANSAC to find the largest plane - _, inliers = pcd_filtered.segment_plane( - distance_threshold=distance_threshold, ransac_n=ransac_n, num_iterations=num_iterations - ) - - # Remove the dominant plane (regardless of orientation) - pcd_without_dominant_plane = pcd_filtered.select_by_index(inliers, invert=True) - return pcd_without_dominant_plane - - def filter_point_cloud_statistical( pcd: o3d.geometry.PointCloud, nb_neighbors: int = 20, std_ratio: float = 2.0 ) -> Tuple[o3d.geometry.PointCloud, np.ndarray]: @@ -560,35 +304,6 @@ def filter_point_cloud_radius( return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius) -def compute_point_cloud_bounds(pcd: o3d.geometry.PointCloud) -> dict: - """ - Compute bounding box information for a point cloud. 
- - Args: - pcd: Input point cloud - - Returns: - Dictionary with bounds information - """ - points = np.asarray(pcd.points) - if len(points) == 0: - return { - "min": np.array([0, 0, 0]), - "max": np.array([0, 0, 0]), - "center": np.array([0, 0, 0]), - "size": np.array([0, 0, 0]), - "volume": 0.0, - } - - min_bound = points.min(axis=0) - max_bound = points.max(axis=0) - center = (min_bound + max_bound) / 2 - size = max_bound - min_bound - volume = np.prod(size) - - return {"min": min_bound, "max": max_bound, "center": center, "size": size, "volume": volume} - - def project_3d_points_to_2d( points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] ) -> np.ndarray: From ffa68d33f075879bcb21e6f96bb80d4402ff2597 Mon Sep 17 00:00:00 2001 From: stash Date: Fri, 18 Jul 2025 00:34:59 -0700 Subject: [PATCH 6/7] Removed unused grasp utils --- dimos/perception/grasp_generation/utils.py | 51 ---------------------- 1 file changed, 51 deletions(-) diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py index 5a79a7d3bc..94377363f2 100644 --- a/dimos/perception/grasp_generation/utils.py +++ b/dimos/perception/grasp_generation/utils.py @@ -63,32 +63,6 @@ def project_3d_points_to_2d( return points_2d -def euler_to_rotation_matrix(roll: float, pitch: float, yaw: float) -> np.ndarray: - """ - Convert Euler angles to rotation matrix. 
- - Args: - roll: Roll angle in radians - pitch: Pitch angle in radians - yaw: Yaw angle in radians - - Returns: - 3x3 rotation matrix - """ - Rx = np.array([[1, 0, 0], [0, np.cos(roll), -np.sin(roll)], [0, np.sin(roll), np.cos(roll)]]) - - Ry = np.array( - [[np.cos(pitch), 0, np.sin(pitch)], [0, 1, 0], [-np.sin(pitch), 0, np.cos(pitch)]] - ) - - Rz = np.array([[np.cos(yaw), -np.sin(yaw), 0], [np.sin(yaw), np.cos(yaw), 0], [0, 0, 1]]) - - # Combined rotation matrix - R = Rz @ Ry @ Rx - - return R - - def create_gripper_geometry( grasp_data: dict, finger_length: float = 0.08, @@ -527,31 +501,6 @@ def visualize_grasps_3d( o3d.visualization.draw_geometries(geometries, window_name="3D Grasp Visualization") -def rotation_matrix_to_euler(rotation_matrix: np.ndarray) -> Tuple[float, float, float]: - """ - Convert 3x3 rotation matrix to Euler angles (roll, pitch, yaw). - - Args: - rotation_matrix: 3x3 rotation matrix - - Returns: - Tuple of (roll, pitch, yaw) in radians - """ - sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) - singular = sy < 1e-6 - - if not singular: - x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) # roll - y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch - z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) # yaw - else: - x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) # roll - y = np.arctan2(-rotation_matrix[2, 0], sy) # pitch - z = 0 # yaw - - return x, y, z - - def parse_grasp_results(grasps: List[Dict]) -> List[Dict]: """ Parse grasp results into visualization format. 
From 256d9f9697bd50e2d906db3751e0419d0ccce079 Mon Sep 17 00:00:00 2001 From: stash Date: Fri, 18 Jul 2025 00:40:32 -0700 Subject: [PATCH 7/7] Fixed file rename --- ...on_pipeline.py.py => test_manipulation_perception_pipeline.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_manipulation_perception_pipeline.py.py => test_manipulation_perception_pipeline.py} (100%) diff --git a/tests/test_manipulation_perception_pipeline.py.py b/tests/test_manipulation_perception_pipeline.py similarity index 100% rename from tests/test_manipulation_perception_pipeline.py.py rename to tests/test_manipulation_perception_pipeline.py