diff --git a/data/.lfs/models_edgetam.tar.gz b/data/.lfs/models_edgetam.tar.gz new file mode 100644 index 0000000000..64baa5d139 --- /dev/null +++ b/data/.lfs/models_edgetam.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd452096f91415ce7ca90548a06a87354ccdb19a66925c0242413c80b08f5c57 +size 51988780 diff --git a/data/.lfs/models_yoloe.tar.gz b/data/.lfs/models_yoloe.tar.gz new file mode 100644 index 0000000000..a0870d71d2 --- /dev/null +++ b/data/.lfs/models_yoloe.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a78e39477667b25c9454f846cd66dc044dd05981b2f7ebb0d331ef3626de9bc +size 184892540 diff --git a/dimos/hardware/sensors/camera/realsense/README.md b/dimos/hardware/sensors/camera/realsense/README.md new file mode 100644 index 0000000000..665833047e --- /dev/null +++ b/dimos/hardware/sensors/camera/realsense/README.md @@ -0,0 +1,9 @@ +# RealSense SDK Install + +1) Install the Intel RealSense SDK: + - https://github.com/IntelRealSense/librealsense + +2) Install the Python bindings: + ```bash + pip install pyrealsense2 + ``` diff --git a/dimos/manipulation/manip_aio_pipeline.py b/dimos/manipulation/manip_aio_pipeline.py deleted file mode 100644 index fe3598ab1e..0000000000 --- a/dimos/manipulation/manip_aio_pipeline.py +++ /dev/null @@ -1,592 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Asynchronous, reactive manipulation pipeline for realtime detection, filtering, and grasp generation. -""" - -import asyncio -import json -import threading -import time - -import cv2 -import numpy as np -import reactivex as rx -import reactivex.operators as ops -import websockets - -from dimos.perception.common.utils import colorize_depth -from dimos.perception.detection2d.detic_2d_det import ( # type: ignore[import-not-found, import-untyped] - Detic2DDetector, -) -from dimos.perception.grasp_generation.utils import draw_grasps_on_image -from dimos.perception.object_detection_stream import ObjectDetectionStream -from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering -from dimos.perception.pointcloud.utils import create_point_cloud_overlay_visualization -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class ManipulationPipeline: - """ - Clean separated stream pipeline with frame buffering. - - - Object detection runs independently on RGB stream - - Point cloud processing subscribes to both detection and ZED streams separately - - Simple frame buffering to match RGB+depth+objects - """ - - def __init__( - self, - camera_intrinsics: list[float], # [fx, fy, cx, cy] - min_confidence: float = 0.6, - max_objects: int = 10, - vocabulary: str | None = None, - grasp_server_url: str | None = None, - enable_grasp_generation: bool = False, - ) -> None: - """ - Initialize the manipulation pipeline. 
- - Args: - camera_intrinsics: [fx, fy, cx, cy] camera parameters - min_confidence: Minimum detection confidence threshold - max_objects: Maximum number of objects to process - vocabulary: Optional vocabulary for Detic detector - grasp_server_url: Optional WebSocket URL for Dimensional Grasp server - enable_grasp_generation: Whether to enable async grasp generation - """ - self.camera_intrinsics = camera_intrinsics - self.min_confidence = min_confidence - - # Grasp generation settings - self.grasp_server_url = grasp_server_url - self.enable_grasp_generation = enable_grasp_generation - - # Asyncio event loop for WebSocket communication - self.grasp_loop = None - self.grasp_loop_thread = None - - # Storage for grasp results and filtered objects - self.latest_grasps: list[dict] = [] # type: ignore[type-arg] # Simplified: just a list of grasps - self.grasps_consumed = False - self.latest_filtered_objects = [] # type: ignore[var-annotated] - self.latest_rgb_for_grasps = None # Store RGB image for grasp overlay - self.grasp_lock = threading.Lock() - - # Track pending requests - simplified to single task - self.grasp_task: asyncio.Task | None = None # type: ignore[type-arg] - - # Reactive subjects for streaming filtered objects and grasps - self.filtered_objects_subject = rx.subject.Subject() # type: ignore[var-annotated] - self.grasps_subject = rx.subject.Subject() # type: ignore[var-annotated] - self.grasp_overlay_subject = rx.subject.Subject() # type: ignore[var-annotated] # Add grasp overlay subject - - # Initialize grasp client if enabled - if self.enable_grasp_generation and self.grasp_server_url: - self._start_grasp_loop() - - # Initialize object detector - self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) - - # Initialize point cloud processor - self.pointcloud_filter = PointcloudFiltering( - color_intrinsics=camera_intrinsics, - depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics - max_num_objects=max_objects, - ) - - 
logger.info(f"Initialized ManipulationPipeline with confidence={min_confidence}") - - def create_streams(self, zed_stream: rx.Observable) -> dict[str, rx.Observable]: # type: ignore[type-arg] - """ - Create streams using exact old main logic. - """ - # Create ZED streams (from old main) - zed_frame_stream = zed_stream.pipe(ops.share()) - - # RGB stream for object detection (from old main) - video_stream = zed_frame_stream.pipe( - ops.map(lambda x: x.get("rgb") if x is not None else None), # type: ignore[attr-defined] - ops.filter(lambda x: x is not None), - ops.share(), - ) - object_detector = ObjectDetectionStream( - camera_intrinsics=self.camera_intrinsics, - min_confidence=self.min_confidence, - class_filter=None, - detector=self.detector, - video_stream=video_stream, - disable_depth=True, - ) - - # Store latest frames for point cloud processing (from old main) - latest_rgb = None - latest_depth = None - latest_point_cloud_overlay = None - frame_lock = threading.Lock() - - # Subscribe to combined ZED frames (from old main) - def on_zed_frame(zed_data) -> None: # type: ignore[no-untyped-def] - nonlocal latest_rgb, latest_depth - if zed_data is not None: - with frame_lock: - latest_rgb = zed_data.get("rgb") - latest_depth = zed_data.get("depth") - - # Depth stream for point cloud filtering (from old main) - def get_depth_or_overlay(zed_data): # type: ignore[no-untyped-def] - if zed_data is None: - return None - - # Check if we have a point cloud overlay available - with frame_lock: - overlay = latest_point_cloud_overlay - - if overlay is not None: - return overlay - else: - # Return regular colorized depth - return colorize_depth(zed_data.get("depth"), max_depth=10.0) - - depth_stream = zed_frame_stream.pipe( - ops.map(get_depth_or_overlay), ops.filter(lambda x: x is not None), ops.share() - ) - - # Process object detection results with point cloud filtering (from old main) - def on_detection_next(result) -> None: # type: ignore[no-untyped-def] - nonlocal 
latest_point_cloud_overlay - if result.get("objects"): - # Get latest RGB and depth frames - with frame_lock: - rgb = latest_rgb - depth = latest_depth - - if rgb is not None and depth is not None: - try: - filtered_objects = self.pointcloud_filter.process_images( - rgb, depth, result["objects"] - ) - - if filtered_objects: - # Store filtered objects - with self.grasp_lock: - self.latest_filtered_objects = filtered_objects - self.filtered_objects_subject.on_next(filtered_objects) - - # Create base image (colorized depth) - base_image = colorize_depth(depth, max_depth=10.0) - - # Create point cloud overlay visualization - overlay_viz = create_point_cloud_overlay_visualization( - base_image=base_image, # type: ignore[arg-type] - objects=filtered_objects, # type: ignore[arg-type] - intrinsics=self.camera_intrinsics, # type: ignore[arg-type] - ) - - # Store the overlay for the stream - with frame_lock: - latest_point_cloud_overlay = overlay_viz - - # Request grasps if enabled - if self.enable_grasp_generation and len(filtered_objects) > 0: - # Save RGB image for later grasp overlay - with frame_lock: - self.latest_rgb_for_grasps = rgb.copy() - - task = self.request_scene_grasps(filtered_objects) # type: ignore[arg-type] - if task: - # Check for results after a delay - def check_grasps_later() -> None: - time.sleep(2.0) # Wait for grasp processing - # Wait for task to complete - if hasattr(self, "grasp_task") and self.grasp_task: - try: - self.grasp_task.result( # type: ignore[call-arg] - timeout=3.0 - ) # Get result with timeout - except Exception as e: - logger.warning(f"Grasp task failed or timeout: {e}") - - # Try to get latest grasps and create overlay - with self.grasp_lock: - grasps = self.latest_grasps - - if grasps and hasattr(self, "latest_rgb_for_grasps"): - # Create grasp overlay on the saved RGB image - try: - bgr_image = cv2.cvtColor( # type: ignore[call-overload] - self.latest_rgb_for_grasps, cv2.COLOR_RGB2BGR - ) - result_bgr = draw_grasps_on_image( - 
bgr_image, - grasps, - self.camera_intrinsics, - max_grasps=-1, # Show all grasps - ) - result_rgb = cv2.cvtColor( - result_bgr, cv2.COLOR_BGR2RGB - ) - - # Emit grasp overlay immediately - self.grasp_overlay_subject.on_next(result_rgb) - - except Exception as e: - logger.error(f"Error creating grasp overlay: {e}") - - # Emit grasps to stream - self.grasps_subject.on_next(grasps) - - threading.Thread(target=check_grasps_later, daemon=True).start() - else: - logger.warning("Failed to create grasp task") - except Exception as e: - logger.error(f"Error in point cloud filtering: {e}") - with frame_lock: - latest_point_cloud_overlay = None - - def on_error(error) -> None: # type: ignore[no-untyped-def] - logger.error(f"Error in stream: {error}") - - def on_completed() -> None: - logger.info("Stream completed") - - def start_subscriptions() -> None: - """Start subscriptions in background thread (from old main)""" - # Subscribe to combined ZED frames - zed_frame_stream.subscribe(on_next=on_zed_frame) - - # Start subscriptions in background thread (from old main) - subscription_thread = threading.Thread(target=start_subscriptions, daemon=True) - subscription_thread.start() - time.sleep(2) # Give subscriptions time to start - - # Subscribe to object detection stream (from old main) - object_detector.get_stream().subscribe( # type: ignore[no-untyped-call] - on_next=on_detection_next, on_error=on_error, on_completed=on_completed - ) - - # Create visualization stream for web interface (from old main) - viz_stream = object_detector.get_stream().pipe( # type: ignore[no-untyped-call] - ops.map(lambda x: x["viz_frame"] if x is not None else None), # type: ignore[index] - ops.filter(lambda x: x is not None), - ) - - # Create filtered objects stream - filtered_objects_stream = self.filtered_objects_subject - - # Create grasps stream - grasps_stream = self.grasps_subject - - # Create grasp overlay subject for immediate emission - grasp_overlay_stream = self.grasp_overlay_subject - - 
return { - "detection_viz": viz_stream, - "pointcloud_viz": depth_stream, - "objects": object_detector.get_stream().pipe(ops.map(lambda x: x.get("objects", []))), # type: ignore[attr-defined, no-untyped-call] - "filtered_objects": filtered_objects_stream, - "grasps": grasps_stream, - "grasp_overlay": grasp_overlay_stream, - } - - def _start_grasp_loop(self) -> None: - """Start asyncio event loop in a background thread for WebSocket communication.""" - - def run_loop() -> None: - self.grasp_loop = asyncio.new_event_loop() # type: ignore[assignment] - asyncio.set_event_loop(self.grasp_loop) - self.grasp_loop.run_forever() # type: ignore[attr-defined] - - self.grasp_loop_thread = threading.Thread(target=run_loop, daemon=True) # type: ignore[assignment] - self.grasp_loop_thread.start() # type: ignore[attr-defined] - - # Wait for loop to start - while self.grasp_loop is None: - time.sleep(0.01) - - async def _send_grasp_request( - self, - points: np.ndarray, # type: ignore[type-arg] - colors: np.ndarray | None, # type: ignore[type-arg] - ) -> list[dict] | None: # type: ignore[type-arg] - """Send grasp request to Dimensional Grasp server.""" - try: - # Comprehensive client-side validation to prevent server errors - - # Validate points array - if points is None: - logger.error("Points array is None") - return None - if not isinstance(points, np.ndarray): - logger.error(f"Points is not numpy array: {type(points)}") - return None - if points.size == 0: - logger.error("Points array is empty") - return None - if len(points.shape) != 2 or points.shape[1] != 3: - logger.error(f"Points has invalid shape {points.shape}, expected (N, 3)") - return None - if points.shape[0] < 100: # Minimum points for stable grasp detection - logger.error(f"Insufficient points for grasp detection: {points.shape[0]} < 100") - return None - - # Validate and prepare colors - if colors is not None: - if not isinstance(colors, np.ndarray): - colors = None - elif colors.size == 0: - colors = None - elif 
len(colors.shape) != 2 or colors.shape[1] != 3: - colors = None - elif colors.shape[0] != points.shape[0]: - colors = None - - # If no valid colors, create default colors (required by server) - if colors is None: - # Create default white colors for all points - colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 - - # Ensure data types are correct (server expects float32) - points = points.astype(np.float32) - colors = colors.astype(np.float32) - - # Validate ranges (basic sanity checks) - if np.any(np.isnan(points)) or np.any(np.isinf(points)): - logger.error("Points contain NaN or Inf values") - return None - if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): - logger.error("Colors contain NaN or Inf values") - return None - - # Clamp color values to valid range [0, 1] - colors = np.clip(colors, 0.0, 1.0) - - async with websockets.connect(self.grasp_server_url) as websocket: # type: ignore[arg-type] - request = { - "points": points.tolist(), - "colors": colors.tolist(), # Always send colors array - "lims": [-0.19, 0.12, 0.02, 0.15, 0.0, 1.0], # Default workspace limits - } - - await websocket.send(json.dumps(request)) - - response = await websocket.recv() - grasps = json.loads(response) - - # Handle server response validation - if isinstance(grasps, dict) and "error" in grasps: - logger.error(f"Server returned error: {grasps['error']}") - return None - elif isinstance(grasps, int | float) and grasps == 0: - return None - elif not isinstance(grasps, list): - logger.error( - f"Server returned unexpected response type: {type(grasps)}, value: {grasps}" - ) - return None - elif len(grasps) == 0: - return None - - converted_grasps = self._convert_grasp_format(grasps) - with self.grasp_lock: - self.latest_grasps = converted_grasps - self.grasps_consumed = False # Reset consumed flag - - # Emit to reactive stream - self.grasps_subject.on_next(self.latest_grasps) - - return converted_grasps - except websockets.exceptions.ConnectionClosed as e: - 
logger.error(f"WebSocket connection closed: {e}") - except websockets.exceptions.WebSocketException as e: - logger.error(f"WebSocket error: {e}") - except json.JSONDecodeError as e: - logger.error(f"Failed to parse server response as JSON: {e}") - except Exception as e: - logger.error(f"Error requesting grasps: {e}") - - return None - - def request_scene_grasps(self, objects: list[dict]) -> asyncio.Task | None: # type: ignore[type-arg] - """Request grasps for entire scene by combining all object point clouds.""" - if not self.grasp_loop or not objects: - return None - - all_points = [] - all_colors = [] - valid_objects = 0 - - for _i, obj in enumerate(objects): - # Validate point cloud data - if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: - continue - - points = obj["point_cloud_numpy"] - if not isinstance(points, np.ndarray) or points.size == 0: - continue - - # Ensure points have correct shape (N, 3) - if len(points.shape) != 2 or points.shape[1] != 3: - continue - - # Validate colors if present - colors = None - if "colors_numpy" in obj and obj["colors_numpy"] is not None: - colors = obj["colors_numpy"] - if isinstance(colors, np.ndarray) and colors.size > 0: - # Ensure colors match points count and have correct shape - if colors.shape[0] != points.shape[0]: - colors = None # Ignore colors for this object - elif len(colors.shape) != 2 or colors.shape[1] != 3: - colors = None # Ignore colors for this object - - all_points.append(points) - if colors is not None: - all_colors.append(colors) - valid_objects += 1 - - if not all_points: - return None - - try: - combined_points = np.vstack(all_points) - - # Only combine colors if ALL objects have valid colors - combined_colors = None - if len(all_colors) == valid_objects and len(all_colors) > 0: - combined_colors = np.vstack(all_colors) - - # Validate final combined data - if combined_points.size == 0: - logger.warning("Combined point cloud is empty") - return None - - if combined_colors is not 
None and combined_colors.shape[0] != combined_points.shape[0]: - logger.warning( - f"Color/point count mismatch: {combined_colors.shape[0]} colors vs {combined_points.shape[0]} points, dropping colors" - ) - combined_colors = None - - except Exception as e: - logger.error(f"Failed to combine point clouds: {e}") - return None - - try: - # Check if there's already a grasp task running - if hasattr(self, "grasp_task") and self.grasp_task and not self.grasp_task.done(): - return self.grasp_task - - task = asyncio.run_coroutine_threadsafe( - self._send_grasp_request(combined_points, combined_colors), self.grasp_loop - ) - - self.grasp_task = task - return task - except Exception: - logger.warning("Failed to create grasp task") - return None - - def get_latest_grasps(self, timeout: float = 5.0) -> list[dict] | None: # type: ignore[type-arg] - """Get latest grasp results, waiting for new ones if current ones have been consumed.""" - # Mark current grasps as consumed and get a reference - with self.grasp_lock: - current_grasps = self.latest_grasps - self.grasps_consumed = True - - # If we already have grasps and they haven't been consumed, return them - if current_grasps is not None and not getattr(self, "grasps_consumed", False): - return current_grasps - - # Wait for new grasps - start_time = time.time() - while time.time() - start_time < timeout: - with self.grasp_lock: - # Check if we have new grasps (different from what we marked as consumed) - if self.latest_grasps is not None and not getattr(self, "grasps_consumed", False): - return self.latest_grasps - time.sleep(0.1) # Check every 100ms - - return None # Timeout reached - - def clear_grasps(self) -> None: - """Clear all stored grasp results.""" - with self.grasp_lock: - self.latest_grasps = [] - - def _prepare_colors(self, colors: np.ndarray | None) -> np.ndarray | None: # type: ignore[type-arg] - """Prepare colors array, converting from various formats if needed.""" - if colors is None: - return None - - if 
colors.max() > 1.0: - colors = colors / 255.0 - - return colors - - def _convert_grasp_format(self, grasps: list[dict]) -> list[dict]: # type: ignore[type-arg] - """Convert Grasp format to our visualization format.""" - converted = [] - - for i, grasp in enumerate(grasps): - rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) - euler_angles = self._rotation_matrix_to_euler(rotation_matrix) - - converted_grasp = { - "id": f"grasp_{i}", - "score": grasp.get("score", 0.0), - "width": grasp.get("width", 0.0), - "height": grasp.get("height", 0.0), - "depth": grasp.get("depth", 0.0), - "translation": grasp.get("translation", [0, 0, 0]), - "rotation_matrix": rotation_matrix.tolist(), - "euler_angles": euler_angles, - } - converted.append(converted_grasp) - - converted.sort(key=lambda x: x["score"], reverse=True) - - return converted - - def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> dict[str, float]: # type: ignore[type-arg] - """Convert rotation matrix to Euler angles (in radians).""" - sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) - - singular = sy < 1e-6 - - if not singular: - x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) - else: - x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = 0 - - return {"roll": x, "pitch": y, "yaw": z} - - def cleanup(self) -> None: - """Clean up resources.""" - if hasattr(self.detector, "cleanup"): - self.detector.cleanup() - - if self.grasp_loop and self.grasp_loop_thread: - self.grasp_loop.call_soon_threadsafe(self.grasp_loop.stop) - self.grasp_loop_thread.join(timeout=1.0) - - if hasattr(self.pointcloud_filter, "cleanup"): - self.pointcloud_filter.cleanup() - logger.info("ManipulationPipeline cleaned up") diff --git a/dimos/manipulation/manip_aio_processer.py 
b/dimos/manipulation/manip_aio_processer.py deleted file mode 100644 index 71ed42bff3..0000000000 --- a/dimos/manipulation/manip_aio_processer.py +++ /dev/null @@ -1,422 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Sequential manipulation processor for single-frame processing without reactive streams. -""" - -import time -from typing import Any - -import cv2 -import numpy as np - -from dimos.perception.common.utils import ( - colorize_depth, - combine_object_data, - detection_results_to_object_data, -) -from dimos.perception.detection2d.detic_2d_det import ( # type: ignore[import-not-found, import-untyped] - Detic2DDetector, -) -from dimos.perception.grasp_generation.grasp_generation import HostedGraspGenerator -from dimos.perception.grasp_generation.utils import create_grasp_overlay -from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering -from dimos.perception.pointcloud.utils import ( - create_point_cloud_overlay_visualization, - extract_and_cluster_misc_points, - overlay_point_clouds_on_image, -) -from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class ManipulationProcessor: - """ - Sequential manipulation processor for single-frame processing. 
- - Processes RGB-D frames through object detection, point cloud filtering, - and grasp generation in a single thread without reactive streams. - """ - - def __init__( - self, - camera_intrinsics: list[float], # [fx, fy, cx, cy] - min_confidence: float = 0.6, - max_objects: int = 20, - vocabulary: str | None = None, - enable_grasp_generation: bool = False, - grasp_server_url: str | None = None, # Required when enable_grasp_generation=True - enable_segmentation: bool = True, - ) -> None: - """ - Initialize the manipulation processor. - - Args: - camera_intrinsics: [fx, fy, cx, cy] camera parameters - min_confidence: Minimum detection confidence threshold - max_objects: Maximum number of objects to process - vocabulary: Optional vocabulary for Detic detector - enable_grasp_generation: Whether to enable grasp generation - grasp_server_url: WebSocket URL for Dimensional Grasp server (required when enable_grasp_generation=True) - enable_segmentation: Whether to enable semantic segmentation - segmentation_model: Segmentation model to use (SAM 2 or FastSAM) - """ - self.camera_intrinsics = camera_intrinsics - self.min_confidence = min_confidence - self.max_objects = max_objects - self.enable_grasp_generation = enable_grasp_generation - self.grasp_server_url = grasp_server_url - self.enable_segmentation = enable_segmentation - - # Validate grasp generation requirements - if enable_grasp_generation and not grasp_server_url: - raise ValueError("grasp_server_url is required when enable_grasp_generation=True") - - # Initialize object detector - self.detector = Detic2DDetector(vocabulary=vocabulary, threshold=min_confidence) - - # Initialize point cloud processor - self.pointcloud_filter = PointcloudFiltering( - color_intrinsics=camera_intrinsics, - depth_intrinsics=camera_intrinsics, # ZED uses same intrinsics - max_num_objects=max_objects, - ) - - # Initialize semantic segmentation - self.segmenter = None - if self.enable_segmentation: - self.segmenter = Sam2DSegmenter( - 
use_tracker=False, # Disable tracker for simple segmentation - use_analyzer=False, # Disable analyzer for simple segmentation - ) - - # Initialize grasp generator if enabled - self.grasp_generator = None - if self.enable_grasp_generation: - try: - self.grasp_generator = HostedGraspGenerator(server_url=grasp_server_url) # type: ignore[arg-type] - logger.info("Hosted grasp generator initialized successfully") - except Exception as e: - logger.error(f"Failed to initialize hosted grasp generator: {e}") - self.grasp_generator = None - self.enable_grasp_generation = False - - logger.info( - f"Initialized ManipulationProcessor with confidence={min_confidence}, " - f"grasp_generation={enable_grasp_generation}" - ) - - def process_frame( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - generate_grasps: bool | None = None, - ) -> dict[str, Any]: - """ - Process a single RGB-D frame through the complete pipeline. - - Args: - rgb_image: RGB image (H, W, 3) - depth_image: Depth image (H, W) in meters - generate_grasps: Override grasp generation setting for this frame - - Returns: - Dictionary containing: - - detection_viz: Visualization of object detection - - pointcloud_viz: Visualization of point cloud overlay - - segmentation_viz: Visualization of semantic segmentation (if enabled) - - detection2d_objects: Raw detection results as ObjectData - - segmentation2d_objects: Raw segmentation results as ObjectData (if enabled) - - detected_objects: Detection (Object Detection) objects with point clouds filtered - - all_objects: Combined objects with intelligent duplicate removal - - full_pointcloud: Complete scene point cloud (if point cloud processing enabled) - - misc_clusters: List of clustered background/miscellaneous point clouds (DBSCAN) - - misc_voxel_grid: Open3D voxel grid approximating all misc/background points - - misc_pointcloud_viz: Visualization of misc/background cluster overlay - - grasps: Grasp results 
(list of dictionaries, if enabled) - - grasp_overlay: Grasp visualization overlay (if enabled) - - processing_time: Total processing time - """ - start_time = time.time() - results = {} - - try: - # Step 1: Object Detection - step_start = time.time() - detection_results = self.run_object_detection(rgb_image) - results["detection2d_objects"] = detection_results.get("objects", []) - results["detection_viz"] = detection_results.get("viz_frame") - detection_time = time.time() - step_start - - # Step 2: Semantic Segmentation (if enabled) - segmentation_time = 0 - if self.enable_segmentation: - step_start = time.time() - segmentation_results = self.run_segmentation(rgb_image) - results["segmentation2d_objects"] = segmentation_results.get("objects", []) - results["segmentation_viz"] = segmentation_results.get("viz_frame") - segmentation_time = time.time() - step_start # type: ignore[assignment] - - # Step 3: Point Cloud Processing - pointcloud_time = 0 - detection2d_objects = results.get("detection2d_objects", []) - segmentation2d_objects = results.get("segmentation2d_objects", []) - - # Process detection objects if available - detected_objects = [] - if detection2d_objects: - step_start = time.time() - detected_objects = self.run_pointcloud_filtering( - rgb_image, depth_image, detection2d_objects - ) - pointcloud_time += time.time() - step_start # type: ignore[assignment] - - # Process segmentation objects if available - segmentation_filtered_objects = [] - if segmentation2d_objects: - step_start = time.time() - segmentation_filtered_objects = self.run_pointcloud_filtering( - rgb_image, depth_image, segmentation2d_objects - ) - pointcloud_time += time.time() - step_start # type: ignore[assignment] - - # Combine all objects using intelligent duplicate removal - all_objects = combine_object_data( - detected_objects, # type: ignore[arg-type] - segmentation_filtered_objects, # type: ignore[arg-type] - overlap_threshold=0.8, - ) - - # Get full point cloud - full_pcd = 
self.pointcloud_filter.get_full_point_cloud() - - # Extract misc/background points and create voxel grid - misc_start = time.time() - misc_clusters, misc_voxel_grid = extract_and_cluster_misc_points( - full_pcd, - all_objects, # type: ignore[arg-type] - eps=0.03, - min_points=100, - enable_filtering=True, - voxel_size=0.02, - ) - misc_time = time.time() - misc_start - - # Store results - results.update( - { - "detected_objects": detected_objects, - "all_objects": all_objects, - "full_pointcloud": full_pcd, - "misc_clusters": misc_clusters, - "misc_voxel_grid": misc_voxel_grid, - } - ) - - # Create point cloud visualizations - base_image = colorize_depth(depth_image, max_depth=10.0) - - # Create visualizations - results["pointcloud_viz"] = ( - create_point_cloud_overlay_visualization( - base_image=base_image, # type: ignore[arg-type] - objects=all_objects, # type: ignore[arg-type] - intrinsics=self.camera_intrinsics, # type: ignore[arg-type] - ) - if all_objects - else base_image - ) - - results["detected_pointcloud_viz"] = ( - create_point_cloud_overlay_visualization( - base_image=base_image, # type: ignore[arg-type] - objects=detected_objects, - intrinsics=self.camera_intrinsics, # type: ignore[arg-type] - ) - if detected_objects - else base_image - ) - - if misc_clusters: - # Generate consistent colors for clusters - cluster_colors = [ - tuple((np.random.RandomState(i + 100).rand(3) * 255).astype(int)) - for i in range(len(misc_clusters)) - ] - results["misc_pointcloud_viz"] = overlay_point_clouds_on_image( - base_image=base_image, # type: ignore[arg-type] - point_clouds=misc_clusters, - camera_intrinsics=self.camera_intrinsics, - colors=cluster_colors, - point_size=2, - alpha=0.6, - ) - else: - results["misc_pointcloud_viz"] = base_image - - # Step 4: Grasp Generation (if enabled) - should_generate_grasps = ( - generate_grasps if generate_grasps is not None else self.enable_grasp_generation - ) - - if should_generate_grasps and all_objects and full_pcd: - grasps 
= self.run_grasp_generation(all_objects, full_pcd) # type: ignore[arg-type] - results["grasps"] = grasps - if grasps: - results["grasp_overlay"] = create_grasp_overlay( - rgb_image, grasps, self.camera_intrinsics - ) - - except Exception as e: - logger.error(f"Error processing frame: {e}") - results["error"] = str(e) - - # Add timing information - total_time = time.time() - start_time - results.update( - { - "processing_time": total_time, - "timing_breakdown": { - "detection": detection_time if "detection_time" in locals() else 0, - "segmentation": segmentation_time if "segmentation_time" in locals() else 0, - "pointcloud": pointcloud_time if "pointcloud_time" in locals() else 0, - "misc_extraction": misc_time if "misc_time" in locals() else 0, - "total": total_time, - }, - } - ) - - return results - - def run_object_detection(self, rgb_image: np.ndarray) -> dict[str, Any]: # type: ignore[type-arg] - """Run object detection on RGB image.""" - try: - # Convert RGB to BGR for Detic detector - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Use process_image method from Detic detector - bboxes, track_ids, class_ids, confidences, names, masks = self.detector.process_image( - bgr_image - ) - - # Convert to ObjectData format using utility function - objects = detection_results_to_object_data( - bboxes=bboxes, - track_ids=track_ids, - class_ids=class_ids, - confidences=confidences, - names=names, - masks=masks, - source="detection", - ) - - # Create visualization using detector's built-in method - viz_frame = self.detector.visualize_results( - rgb_image, bboxes, track_ids, class_ids, confidences, names - ) - - return {"objects": objects, "viz_frame": viz_frame} - - except Exception as e: - logger.error(f"Object detection failed: {e}") - return {"objects": [], "viz_frame": rgb_image.copy()} - - def run_pointcloud_filtering( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - objects: list[dict], # 
type: ignore[type-arg] - ) -> list[dict]: # type: ignore[type-arg] - """Run point cloud filtering on detected objects.""" - try: - filtered_objects = self.pointcloud_filter.process_images( - rgb_image, - depth_image, - objects, # type: ignore[arg-type] - ) - return filtered_objects if filtered_objects else [] # type: ignore[return-value] - except Exception as e: - logger.error(f"Point cloud filtering failed: {e}") - return [] - - def run_segmentation(self, rgb_image: np.ndarray) -> dict[str, Any]: # type: ignore[type-arg] - """Run semantic segmentation on RGB image.""" - if not self.segmenter: - return {"objects": [], "viz_frame": rgb_image.copy()} - - try: - # Convert RGB to BGR for segmenter - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Get segmentation results - masks, bboxes, track_ids, probs, names = self.segmenter.process_image(bgr_image) # type: ignore[no-untyped-call] - - # Convert to ObjectData format using utility function - objects = detection_results_to_object_data( - bboxes=bboxes, - track_ids=track_ids, - class_ids=list(range(len(bboxes))), # Use indices as class IDs for segmentation - confidences=probs, - names=names, - masks=masks, - source="segmentation", - ) - - # Create visualization - if masks: - viz_bgr = self.segmenter.visualize_results( - bgr_image, masks, bboxes, track_ids, probs, names - ) - # Convert back to RGB - viz_frame = cv2.cvtColor(viz_bgr, cv2.COLOR_BGR2RGB) - else: - viz_frame = rgb_image.copy() - - return {"objects": objects, "viz_frame": viz_frame} - - except Exception as e: - logger.error(f"Segmentation failed: {e}") - return {"objects": [], "viz_frame": rgb_image.copy()} - - def run_grasp_generation(self, filtered_objects: list[dict], full_pcd) -> list[dict] | None: # type: ignore[no-untyped-def, type-arg] - """Run grasp generation using the configured generator.""" - if not self.grasp_generator: - logger.warning("Grasp generation requested but no generator available") - return None - - try: - # Generate 
grasps using the configured generator - grasps = self.grasp_generator.generate_grasps_from_objects(filtered_objects, full_pcd) # type: ignore[arg-type] - - # Return parsed results directly (list of grasp dictionaries) - return grasps - - except Exception as e: - logger.error(f"Grasp generation failed: {e}") - return None - - def cleanup(self) -> None: - """Clean up resources.""" - if hasattr(self.detector, "cleanup"): - self.detector.cleanup() - if hasattr(self.pointcloud_filter, "cleanup"): - self.pointcloud_filter.cleanup() - if self.segmenter and hasattr(self.segmenter, "cleanup"): - self.segmenter.cleanup() - if self.grasp_generator and hasattr(self.grasp_generator, "cleanup"): - self.grasp_generator.cleanup() - logger.info("ManipulationProcessor cleaned up") diff --git a/dimos/manipulation/manipulation_interface.py b/dimos/manipulation/manipulation_interface.py index edeb99c0f0..10e71fbc66 100644 --- a/dimos/manipulation/manipulation_interface.py +++ b/dimos/manipulation/manipulation_interface.py @@ -26,7 +26,6 @@ from dimos.manipulation.manipulation_history import ( ManipulationHistory, ) -from dimos.perception.object_detection_stream import ObjectDetectionStream from dimos.types.manipulation import ( AbstractConstraint, ManipulationTask, @@ -53,7 +52,7 @@ def __init__( self, output_dir: str, new_memory: bool = False, - perception_stream: ObjectDetectionStream = None, # type: ignore[assignment] + perception_stream: Any = None, ) -> None: """ Initialize a new ManipulationInterface instance. diff --git a/dimos/manipulation/visual_servoing/detection3d.py b/dimos/manipulation/visual_servoing/detection3d.py deleted file mode 100644 index fca085df8c..0000000000 --- a/dimos/manipulation/visual_servoing/detection3d.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Real-time 3D object detection processor that extracts object poses from RGB-D data. -""" - -import cv2 -from dimos_lcm.vision_msgs import ( - BoundingBox2D, - BoundingBox3D, - Detection2D, - Detection3D, - ObjectHypothesis, - ObjectHypothesisWithPose, - Point2D, - Pose2D, -) -import numpy as np - -from dimos.manipulation.visual_servoing.utils import ( - estimate_object_depth, - transform_pose, - visualize_detections_3d, -) -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.msgs.std_msgs import Header -from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray -from dimos.perception.common.utils import bbox2d_to_corners -from dimos.perception.detection2d.utils import calculate_object_size_from_bbox -from dimos.perception.pointcloud.utils import extract_centroids_from_masks -from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class Detection3DProcessor: - """ - Real-time 3D detection processor optimized for speed. - - Uses Sam (FastSAM) for segmentation and mask generation, then extracts - 3D centroids from depth data. - """ - - def __init__( - self, - camera_intrinsics: list[float], # [fx, fy, cx, cy] - min_confidence: float = 0.6, - min_points: int = 30, - max_depth: float = 1.0, - max_object_size: float = 0.15, - ) -> None: - """ - Initialize the real-time 3D detection processor. 
- - Args: - camera_intrinsics: [fx, fy, cx, cy] camera parameters - min_confidence: Minimum detection confidence threshold - min_points: Minimum 3D points required for valid detection - max_depth: Maximum valid depth in meters - """ - self.camera_intrinsics = camera_intrinsics - self.min_points = min_points - self.max_depth = max_depth - self.max_object_size = max_object_size - - # Initialize Sam segmenter with tracking enabled but analysis disabled - self.detector = Sam2DSegmenter( - use_tracker=False, - use_analyzer=False, - use_filtering=True, - ) - - self.min_confidence = min_confidence - - logger.info( - f"Initialized Detection3DProcessor with Sam segmenter, confidence={min_confidence}, " - f"min_points={min_points}, max_depth={max_depth}m, max_object_size={max_object_size}m" - ) - - def process_frame( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - transform: np.ndarray | None = None, # type: ignore[type-arg] - ) -> tuple[Detection3DArray, Detection2DArray]: - """ - Process a single RGB-D frame to extract 3D object detections. 
- - Args: - rgb_image: RGB image (H, W, 3) - depth_image: Depth image (H, W) in meters - transform: Optional 4x4 transformation matrix to transform objects from camera frame to desired frame - - Returns: - Tuple of (Detection3DArray, Detection2DArray) with 3D and 2D information - """ - - # Convert RGB to BGR for Sam (OpenCV format) - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Run Sam segmentation with tracking - masks, bboxes, track_ids, probs, names = self.detector.process_image(bgr_image) # type: ignore[no-untyped-call] - - if not masks or len(masks) == 0: - return Detection3DArray( - detections_length=0, header=Header(), detections=[] - ), Detection2DArray(detections_length=0, header=Header(), detections=[]) - - # Convert CUDA tensors to numpy arrays if needed - numpy_masks = [] - for mask in masks: - if hasattr(mask, "cpu"): # PyTorch tensor - numpy_masks.append(mask.cpu().numpy()) - else: # Already numpy array - numpy_masks.append(mask) - - # Extract 3D centroids from masks - poses = extract_centroids_from_masks( - rgb_image=rgb_image, - depth_image=depth_image, - masks=numpy_masks, - camera_intrinsics=self.camera_intrinsics, - ) - - detections_3d = [] - detections_2d = [] - pose_dict = {p["mask_idx"]: p for p in poses if p["centroid"][2] < self.max_depth} - - for i, (bbox, name, prob, track_id) in enumerate( - zip(bboxes, names, probs, track_ids, strict=False) - ): - if i not in pose_dict: - continue - - pose = pose_dict[i] - obj_cam_pos = pose["centroid"] - - if obj_cam_pos[2] > self.max_depth: - continue - - # Calculate object size from bbox and depth - width_m, height_m = calculate_object_size_from_bbox( - bbox, obj_cam_pos[2], self.camera_intrinsics - ) - - # Calculate depth dimension using segmentation mask - depth_m = estimate_object_depth( - depth_image, numpy_masks[i] if i < len(numpy_masks) else None, bbox - ) - - size_x = max(width_m, 0.01) # Minimum 1cm width - size_y = max(height_m, 0.01) # Minimum 1cm height - size_z = 
max(depth_m, 0.01) # Minimum 1cm depth - - if min(size_x, size_y, size_z) > self.max_object_size: - continue - - # Transform to desired frame if transform matrix is provided - if transform is not None: - # Get orientation as euler angles, default to no rotation if not available - obj_cam_orientation = pose.get( - "rotation", np.array([0.0, 0.0, 0.0]) - ) # Default to no rotation - transformed_pose = transform_pose( - obj_cam_pos, obj_cam_orientation, transform, to_robot=True - ) - center_pose = transformed_pose - else: - # If no transform, use camera coordinates - center_pose = Pose( - position=Vector3(obj_cam_pos[0], obj_cam_pos[1], obj_cam_pos[2]), - orientation=Quaternion(0.0, 0.0, 0.0, 1.0), # Default orientation - ) - - # Create Detection3D object - detection = Detection3D( - results_length=1, - header=Header(), # Empty header - results=[ - ObjectHypothesisWithPose( - hypothesis=ObjectHypothesis(class_id=name, score=float(prob)) - ) - ], - bbox=BoundingBox3D(center=center_pose, size=Vector3(size_x, size_y, size_z)), - id=str(track_id), - ) - - detections_3d.append(detection) - - # Create corresponding Detection2D - x1, y1, x2, y2 = bbox - center_x = (x1 + x2) / 2.0 - center_y = (y1 + y2) / 2.0 - width = x2 - x1 - height = y2 - y1 - - detection_2d = Detection2D( - results_length=1, - header=Header(), - results=[ - ObjectHypothesisWithPose( - hypothesis=ObjectHypothesis(class_id=name, score=float(prob)) - ) - ], - bbox=BoundingBox2D( - center=Pose2D(position=Point2D(center_x, center_y), theta=0.0), - size_x=float(width), - size_y=float(height), - ), - id=str(track_id), - ) - detections_2d.append(detection_2d) - - # Create and return both arrays - return ( - Detection3DArray( - detections_length=len(detections_3d), header=Header(), detections=detections_3d - ), - Detection2DArray( - detections_length=len(detections_2d), header=Header(), detections=detections_2d - ), - ) - - def visualize_detections( - self, - rgb_image: np.ndarray, # type: ignore[type-arg] - 
detections_3d: list[Detection3D], - detections_2d: list[Detection2D], - show_coordinates: bool = True, - ) -> np.ndarray: # type: ignore[type-arg] - """ - Visualize detections with 3D position overlay next to bounding boxes. - - Args: - rgb_image: Original RGB image - detections_3d: List of Detection3D objects - detections_2d: List of Detection2D objects (must be 1:1 correspondence) - show_coordinates: Whether to show 3D coordinates - - Returns: - Visualization image - """ - # Extract 2D bboxes from Detection2D objects - - bboxes_2d = [] - for det_2d in detections_2d: - if det_2d.bbox: - x1, y1, x2, y2 = bbox2d_to_corners(det_2d.bbox) - bboxes_2d.append([x1, y1, x2, y2]) - - return visualize_detections_3d(rgb_image, detections_3d, show_coordinates, bboxes_2d) - - def get_closest_detection( - self, detections: list[Detection3D], class_filter: str | None = None - ) -> Detection3D | None: - """ - Get the closest detection with valid 3D data. - - Args: - detections: List of Detection3D objects - class_filter: Optional class name to filter by - - Returns: - Closest Detection3D or None - """ - valid_detections = [] - for d in detections: - # Check if has valid bbox center position - if d.bbox and d.bbox.center and d.bbox.center.position: - # Check class filter if specified - if class_filter is None or ( - d.results_length > 0 and d.results[0].hypothesis.class_id == class_filter - ): - valid_detections.append(d) - - if not valid_detections: - return None - - # Sort by depth (Z coordinate) - def get_z_coord(d): # type: ignore[no-untyped-def] - return abs(d.bbox.center.position.z) - - return min(valid_detections, key=get_z_coord) - - def cleanup(self) -> None: - """Clean up resources.""" - if hasattr(self.detector, "cleanup"): - self.detector.cleanup() - logger.info("Detection3DProcessor cleaned up") diff --git a/dimos/manipulation/visual_servoing/manipulation_module.py b/dimos/manipulation/visual_servoing/manipulation_module.py deleted file mode 100644 index 
088db9eb26..0000000000 --- a/dimos/manipulation/visual_servoing/manipulation_module.py +++ /dev/null @@ -1,951 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Manipulation module for robotic grasping with visual servoing. -Handles grasping logic, state machine, and hardware coordination as a Dimos module. -""" - -from collections import deque -from enum import Enum -import threading -import time -from typing import Any - -import cv2 -from dimos_lcm.sensor_msgs import CameraInfo -import numpy as np -from reactivex.disposable import Disposable - -from dimos.core import In, Module, Out, rpc -from dimos.hardware.manipulators.piper.piper_arm import ( # type: ignore[import-not-found, import-untyped] - PiperArm, -) -from dimos.manipulation.visual_servoing.detection3d import Detection3DProcessor -from dimos.manipulation.visual_servoing.pbvs import PBVS -from dimos.manipulation.visual_servoing.utils import ( - create_manipulation_visualization, - is_target_reached, - select_points_from_depth, - transform_points_3d, - update_target_grasp_pose, -) -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.msgs.sensor_msgs import Image -from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray -from dimos.perception.common.utils import find_clicked_detection -from dimos.utils.logging_config import setup_logger -from dimos.utils.transform_utils import ( - compose_transforms, - 
create_transform_from_6dof, - matrix_to_pose, - pose_to_matrix, -) - -logger = setup_logger() - - -class GraspStage(Enum): - """Enum for different grasp stages.""" - - IDLE = "idle" - PRE_GRASP = "pre_grasp" - GRASP = "grasp" - CLOSE_AND_RETRACT = "close_and_retract" - PLACE = "place" - RETRACT = "retract" - - -class Feedback: - """Feedback data containing state information about the manipulation process.""" - - def __init__( - self, - grasp_stage: GraspStage, - target_tracked: bool, - current_executed_pose: Pose | None = None, - current_ee_pose: Pose | None = None, - current_camera_pose: Pose | None = None, - target_pose: Pose | None = None, - waiting_for_reach: bool = False, - success: bool | None = None, - ) -> None: - self.grasp_stage = grasp_stage - self.target_tracked = target_tracked - self.current_executed_pose = current_executed_pose - self.current_ee_pose = current_ee_pose - self.current_camera_pose = current_camera_pose - self.target_pose = target_pose - self.waiting_for_reach = waiting_for_reach - self.success = success - - -class ManipulationModule(Module): - """ - Manipulation module for visual servoing and grasping. - - Subscribes to: - - ZED RGB images - - ZED depth images - - ZED camera info - - Publishes: - - Visualization images - - RPC methods: - - handle_keyboard_command: Process keyboard input - - pick_and_place: Execute pick and place task - """ - - # LCM inputs - rgb_image: In[Image] - depth_image: In[Image] - camera_info: In[CameraInfo] - - # LCM outputs - viz_image: Out[Image] - - def __init__( # type: ignore[no-untyped-def] - self, - ee_to_camera_6dof: list | None = None, # type: ignore[type-arg] - **kwargs, - ) -> None: - """ - Initialize manipulation module. 
- - Args: - ee_to_camera_6dof: EE to camera transform [x, y, z, rx, ry, rz] in meters and radians - workspace_min_radius: Minimum workspace radius in meters - workspace_max_radius: Maximum workspace radius in meters - min_grasp_pitch_degrees: Minimum grasp pitch angle (at max radius) - max_grasp_pitch_degrees: Maximum grasp pitch angle (at min radius) - """ - super().__init__(**kwargs) - - self.arm = PiperArm() - - if ee_to_camera_6dof is None: - ee_to_camera_6dof = [-0.065, 0.03, -0.095, 0.0, -1.57, 0.0] - pos = Vector3(ee_to_camera_6dof[0], ee_to_camera_6dof[1], ee_to_camera_6dof[2]) - rot = Vector3(ee_to_camera_6dof[3], ee_to_camera_6dof[4], ee_to_camera_6dof[5]) - self.T_ee_to_camera = create_transform_from_6dof(pos, rot) - - self.camera_intrinsics = None - self.detector = None - self.pbvs = None - - # Control state - self.last_valid_target = None - self.waiting_for_reach = False - self.current_executed_pose = None # Track the actual pose sent to arm - self.target_updated = False - self.waiting_start_time = None - self.reach_pose_timeout = 20.0 - - # Grasp parameters - self.grasp_width_offset = 0.03 - self.pregrasp_distance = 0.25 - self.grasp_distance_range = 0.03 - self.grasp_close_delay = 2.0 - self.grasp_reached_time = None - self.gripper_max_opening = 0.07 - - # Workspace limits and dynamic pitch parameters - self.workspace_min_radius = 0.2 - self.workspace_max_radius = 0.75 - self.min_grasp_pitch_degrees = 5.0 - self.max_grasp_pitch_degrees = 60.0 - - # Grasp stage tracking - self.grasp_stage = GraspStage.IDLE - - # Pose stabilization tracking - self.pose_history_size = 4 - self.pose_stabilization_threshold = 0.01 - self.stabilization_timeout = 25.0 - self.stabilization_start_time = None - self.reached_poses = deque(maxlen=self.pose_history_size) # type: ignore[var-annotated] - self.adjustment_count = 0 - - # Pose reachability tracking - self.ee_pose_history = deque(maxlen=20) # type: ignore[var-annotated] # Keep history of EE poses - 
self.stuck_pose_threshold = 0.001 # 1mm movement threshold - self.stuck_pose_adjustment_degrees = 5.0 - self.stuck_count = 0 - self.max_stuck_reattempts = 7 - - # State for visualization - self.current_visualization = None - self.last_detection_3d_array = None - self.last_detection_2d_array = None - - # Grasp result and task tracking - self.pick_success = None - self.final_pregrasp_pose = None - self.task_failed = False - self.overall_success = None - - # Task control - self.task_running = False - self.task_thread = None - self.stop_event = threading.Event() - - # Latest sensor data - self.latest_rgb = None - self.latest_depth = None - self.latest_camera_info = None - - # Target selection - self.target_click = None - - # Place target position and object info - self.home_pose = Pose( - position=Vector3(0.0, 0.0, 0.0), orientation=Quaternion(0.0, 0.0, 0.0, 1.0) - ) - self.place_target_position = None - self.target_object_height = None - self.retract_distance = 0.12 - self.place_pose = None - self.retract_pose = None - self.arm.gotoObserve() - - @rpc - def start(self) -> None: - """Start the manipulation module.""" - - unsub = self.rgb_image.subscribe(self._on_rgb_image) - self._disposables.add(Disposable(unsub)) - - unsub = self.depth_image.subscribe(self._on_depth_image) - self._disposables.add(Disposable(unsub)) - - unsub = self.camera_info.subscribe(self._on_camera_info) - self._disposables.add(Disposable(unsub)) - - logger.info("Manipulation module started") - - @rpc - def stop(self) -> None: - """Stop the manipulation module.""" - # Stop any running task - self.stop_event.set() - if self.task_thread and self.task_thread.is_alive(): - self.task_thread.join(timeout=5.0) - - self.reset_to_idle() - - if self.detector and hasattr(self.detector, "cleanup"): - self.detector.cleanup() - self.arm.disable() - - logger.info("Manipulation module stopped") - - def _on_rgb_image(self, msg: Image) -> None: - """Handle RGB image messages.""" - try: - self.latest_rgb = msg.data 
- except Exception as e: - logger.error(f"Error processing RGB image: {e}") - - def _on_depth_image(self, msg: Image) -> None: - """Handle depth image messages.""" - try: - self.latest_depth = msg.data - except Exception as e: - logger.error(f"Error processing depth image: {e}") - - def _on_camera_info(self, msg: CameraInfo) -> None: - """Handle camera info messages.""" - try: - self.camera_intrinsics = [msg.K[0], msg.K[4], msg.K[2], msg.K[5]] # type: ignore[assignment] - - if self.detector is None: - self.detector = Detection3DProcessor(self.camera_intrinsics) # type: ignore[arg-type, assignment] - self.pbvs = PBVS() # type: ignore[assignment] - logger.info("Initialized detection and PBVS processors") - - self.latest_camera_info = msg - except Exception as e: - logger.error(f"Error processing camera info: {e}") - - @rpc - def get_single_rgb_frame(self) -> np.ndarray | None: # type: ignore[type-arg] - """ - get the latest rgb frame from the camera - """ - return self.latest_rgb - - @rpc - def handle_keyboard_command(self, key: str) -> str: - """ - Handle keyboard commands for robot control. 
- - Args: - key: Keyboard key as string - - Returns: - Action taken as string, or empty string if no action - """ - key_code = ord(key) if len(key) == 1 else int(key) - - if key_code == ord("r"): - self.stop_event.set() - self.task_running = False - self.reset_to_idle() - return "reset" - elif key_code == ord("s"): - logger.info("SOFT STOP - Emergency stopping robot!") - self.arm.softStop() - self.stop_event.set() - self.task_running = False - return "stop" - elif key_code == ord(" ") and self.pbvs and self.pbvs.target_grasp_pose: - if self.grasp_stage == GraspStage.PRE_GRASP: - self.set_grasp_stage(GraspStage.GRASP) - logger.info("Executing target pose") - return "execute" - elif key_code == ord("g"): - logger.info("Opening gripper") - self.arm.release_gripper() - return "release" - - return "" - - @rpc - def pick_and_place( - self, - target_x: int | None = None, - target_y: int | None = None, - place_x: int | None = None, - place_y: int | None = None, - ) -> dict[str, Any]: - """ - Start a pick and place task. 
- - Args: - target_x: Optional X coordinate of target object - target_y: Optional Y coordinate of target object - place_x: Optional X coordinate of place location - place_y: Optional Y coordinate of place location - - Returns: - Dict with status and message - """ - if self.task_running: - return {"status": "error", "message": "Task already running"} - - if self.camera_intrinsics is None: - return {"status": "error", "message": "Camera not initialized"} - - if target_x is not None and target_y is not None: - self.target_click = (target_x, target_y) - if place_x is not None and self.latest_depth is not None: - points_3d_camera = select_points_from_depth( - self.latest_depth, - (place_x, place_y), - self.camera_intrinsics, - radius=10, - ) - - if points_3d_camera.size > 0: - ee_pose = self.arm.get_ee_pose() - ee_transform = pose_to_matrix(ee_pose) - camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera) - - points_3d_world = transform_points_3d( - points_3d_camera, - camera_transform, - to_robot=True, - ) - - place_position = np.mean(points_3d_world, axis=0) - self.place_target_position = place_position - logger.info( - f"Place target set at position: ({place_position[0]:.3f}, {place_position[1]:.3f}, {place_position[2]:.3f})" - ) - else: - logger.warning("No valid depth points found at place location") - self.place_target_position = None - else: - self.place_target_position = None - - self.task_failed = False - self.stop_event.clear() - - if self.task_thread and self.task_thread.is_alive(): - self.stop_event.set() - self.task_thread.join(timeout=1.0) - self.task_thread = threading.Thread(target=self._run_pick_and_place, daemon=True) - self.task_thread.start() - - return {"status": "started", "message": "Pick and place task started"} - - def _run_pick_and_place(self) -> None: - """Run the pick and place task loop.""" - self.task_running = True - logger.info("Starting pick and place task") - - try: - while not self.stop_event.is_set(): - if 
self.task_failed: - logger.error("Task failed, terminating pick and place") - self.stop_event.set() - break - - feedback = self.update() - if feedback is None: - time.sleep(0.01) - continue - - if feedback.success is not None: # type: ignore[attr-defined] - if feedback.success: # type: ignore[attr-defined] - logger.info("Pick and place completed successfully!") - else: - logger.warning("Pick and place failed") - self.reset_to_idle() - self.stop_event.set() - break - - time.sleep(0.01) - - except Exception as e: - logger.error(f"Error in pick and place task: {e}") - self.task_failed = True - finally: - self.task_running = False - logger.info("Pick and place task ended") - - def set_grasp_stage(self, stage: GraspStage) -> None: - """Set the grasp stage.""" - self.grasp_stage = stage - logger.info(f"Grasp stage: {stage.value}") - - def calculate_dynamic_grasp_pitch(self, target_pose: Pose) -> float: - """ - Calculate grasp pitch dynamically based on distance from robot base. - Maps workspace radius to grasp pitch angle. 
- - Args: - target_pose: Target pose - - Returns: - Grasp pitch angle in degrees - """ - # Calculate 3D distance from robot base (assumes robot at origin) - position = target_pose.position - distance = np.sqrt(position.x**2 + position.y**2 + position.z**2) - - # Clamp distance to workspace limits - distance = np.clip(distance, self.workspace_min_radius, self.workspace_max_radius) - - # Linear interpolation: min_radius -> max_pitch, max_radius -> min_pitch - # Normalized distance (0 to 1) - normalized_dist = (distance - self.workspace_min_radius) / ( - self.workspace_max_radius - self.workspace_min_radius - ) - - # Inverse mapping: closer objects need higher pitch - pitch_degrees = self.max_grasp_pitch_degrees - ( - normalized_dist * (self.max_grasp_pitch_degrees - self.min_grasp_pitch_degrees) - ) - - return pitch_degrees # type: ignore[no-any-return] - - def check_within_workspace(self, target_pose: Pose) -> bool: - """ - Check if pose is within workspace limits and log error if not. - - Args: - target_pose: Target pose to validate - - Returns: - True if within workspace, False otherwise - """ - # Calculate 3D distance from robot base - position = target_pose.position - distance = np.sqrt(position.x**2 + position.y**2 + position.z**2) - - if not (self.workspace_min_radius <= distance <= self.workspace_max_radius): - logger.error( - f"Target outside workspace limits: distance {distance:.3f}m not in [{self.workspace_min_radius:.2f}, {self.workspace_max_radius:.2f}]" - ) - return False - - return True - - def _check_reach_timeout(self) -> tuple[bool, float]: - """Check if robot has exceeded timeout while reaching pose. 
- - Returns: - Tuple of (timed_out, time_elapsed) - """ - if self.waiting_start_time: - time_elapsed = time.time() - self.waiting_start_time - if time_elapsed > self.reach_pose_timeout: - logger.warning( - f"Robot failed to reach pose within {self.reach_pose_timeout}s timeout" - ) - self.task_failed = True - self.reset_to_idle() - return True, time_elapsed - return False, time_elapsed - return False, 0.0 - - def _check_if_stuck(self) -> bool: - """ - Check if robot is stuck by analyzing pose history. - - Returns: - Tuple of (is_stuck, max_std_dev_mm) - """ - if len(self.ee_pose_history) < self.ee_pose_history.maxlen: # type: ignore[operator] - return False - - # Extract positions from pose history - positions = np.array( - [[p.position.x, p.position.y, p.position.z] for p in self.ee_pose_history] - ) - - # Calculate standard deviation of positions - std_devs = np.std(positions, axis=0) - # Check if all standard deviations are below stuck threshold - is_stuck = np.all(std_devs < self.stuck_pose_threshold) - - return is_stuck # type: ignore[return-value] - - def check_reach_and_adjust(self) -> bool: - """ - Check if robot has reached the current executed pose while waiting. - Handles timeout internally by failing the task. - Also detects if the robot is stuck (not moving towards target). 
- - Returns: - True if reached, False if still waiting or not in waiting state - """ - if not self.waiting_for_reach or not self.current_executed_pose: - return False - - # Get current end-effector pose - ee_pose = self.arm.get_ee_pose() - target_pose = self.current_executed_pose - - # Check for timeout - this will fail task and reset if timeout occurred - timed_out, _time_elapsed = self._check_reach_timeout() - if timed_out: - return False - - self.ee_pose_history.append(ee_pose) - - # Check if robot is stuck - is_stuck = self._check_if_stuck() - if is_stuck: - if self.grasp_stage == GraspStage.RETRACT or self.grasp_stage == GraspStage.PLACE: - self.waiting_for_reach = False - self.waiting_start_time = None - self.stuck_count = 0 - self.ee_pose_history.clear() - return True - self.stuck_count += 1 - pitch_degrees = self.calculate_dynamic_grasp_pitch(target_pose) - if self.stuck_count % 2 == 0: - pitch_degrees += self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2) - else: - pitch_degrees -= self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2) - - pitch_degrees = max( - self.min_grasp_pitch_degrees, min(self.max_grasp_pitch_degrees, pitch_degrees) - ) - updated_target_pose = update_target_grasp_pose(target_pose, ee_pose, 0.0, pitch_degrees) - self.arm.cmd_ee_pose(updated_target_pose) - self.current_executed_pose = updated_target_pose - self.ee_pose_history.clear() - self.waiting_for_reach = True - self.waiting_start_time = time.time() - return False - - if self.stuck_count >= self.max_stuck_reattempts: - self.task_failed = True - self.reset_to_idle() - return False - - if is_target_reached(target_pose, ee_pose, self.pbvs.target_tolerance): - self.waiting_for_reach = False - self.waiting_start_time = None - self.stuck_count = 0 - self.ee_pose_history.clear() - return True - return False - - def _update_tracking(self, detection_3d_array: Detection3DArray | None) -> bool: - """Update tracking with new detections.""" - if not detection_3d_array 
or not self.pbvs: - return False - - target_tracked = self.pbvs.update_tracking(detection_3d_array) - if target_tracked: - self.target_updated = True - self.last_valid_target = self.pbvs.get_current_target() - return target_tracked - - def reset_to_idle(self) -> None: - """Reset the manipulation system to IDLE state.""" - if self.pbvs: - self.pbvs.clear_target() - self.grasp_stage = GraspStage.IDLE - self.reached_poses.clear() - self.ee_pose_history.clear() - self.adjustment_count = 0 - self.waiting_for_reach = False - self.current_executed_pose = None - self.target_updated = False - self.stabilization_start_time = None - self.grasp_reached_time = None - self.waiting_start_time = None - self.pick_success = None - self.final_pregrasp_pose = None - self.overall_success = None - self.place_pose = None - self.retract_pose = None - self.stuck_count = 0 - - self.arm.gotoObserve() - - def execute_idle(self) -> None: - """Execute idle stage.""" - pass - - def execute_pre_grasp(self) -> None: - """Execute pre-grasp stage: visual servoing to pre-grasp position.""" - if self.waiting_for_reach: - if self.check_reach_and_adjust(): - self.reached_poses.append(self.current_executed_pose) - self.target_updated = False - time.sleep(0.2) - return - if ( - self.stabilization_start_time - and (time.time() - self.stabilization_start_time) > self.stabilization_timeout - ): - logger.warning( - f"Failed to get stable grasp after {self.stabilization_timeout} seconds, resetting" - ) - self.task_failed = True - self.reset_to_idle() - return - - ee_pose = self.arm.get_ee_pose() - dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.pbvs.current_target.bbox.center) # type: ignore[attr-defined] - - _, _, _, has_target, target_pose = self.pbvs.compute_control( # type: ignore[attr-defined] - ee_pose, self.pregrasp_distance, dynamic_pitch - ) - if target_pose and has_target: - # Validate target pose is within workspace - if not self.check_within_workspace(target_pose): - self.task_failed = True 
- self.reset_to_idle() - return - - if self.check_target_stabilized(): - logger.info("Target stabilized, transitioning to GRASP") - self.final_pregrasp_pose = self.current_executed_pose - self.grasp_stage = GraspStage.GRASP - self.adjustment_count = 0 - self.waiting_for_reach = False - elif not self.waiting_for_reach and self.target_updated: - self.arm.cmd_ee_pose(target_pose) - self.current_executed_pose = target_pose - self.waiting_for_reach = True - self.waiting_start_time = time.time() # type: ignore[assignment] - self.target_updated = False - self.adjustment_count += 1 - time.sleep(0.2) - - def execute_grasp(self) -> None: - """Execute grasp stage: move to final grasp position.""" - if self.waiting_for_reach: - if self.check_reach_and_adjust() and not self.grasp_reached_time: - self.grasp_reached_time = time.time() # type: ignore[assignment] - return - - if self.grasp_reached_time: - if (time.time() - self.grasp_reached_time) >= self.grasp_close_delay: - logger.info("Grasp delay completed, closing gripper") - self.grasp_stage = GraspStage.CLOSE_AND_RETRACT - return - - if self.last_valid_target: - # Calculate dynamic pitch for current target - dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.last_valid_target.bbox.center) - normalized_pitch = dynamic_pitch / 90.0 - grasp_distance = -self.grasp_distance_range + ( - 2 * self.grasp_distance_range * normalized_pitch - ) - - ee_pose = self.arm.get_ee_pose() - _, _, _, has_target, target_pose = self.pbvs.compute_control( - ee_pose, grasp_distance, dynamic_pitch - ) - - if target_pose and has_target: - # Validate grasp pose is within workspace - if not self.check_within_workspace(target_pose): - self.task_failed = True - self.reset_to_idle() - return - - object_width = self.last_valid_target.bbox.size.x - gripper_opening = max( - 0.005, min(object_width + self.grasp_width_offset, self.gripper_max_opening) - ) - - logger.info(f"Executing grasp: gripper={gripper_opening * 1000:.1f}mm") - 
self.arm.cmd_gripper_ctrl(gripper_opening) - self.arm.cmd_ee_pose(target_pose, line_mode=True) - self.current_executed_pose = target_pose - self.waiting_for_reach = True - self.waiting_start_time = time.time() - - def execute_close_and_retract(self) -> None: - """Execute the retraction sequence after gripper has been closed.""" - if self.waiting_for_reach and self.final_pregrasp_pose: - if self.check_reach_and_adjust(): - logger.info("Reached pre-grasp retraction position") - self.pick_success = self.arm.gripper_object_detected() - if self.pick_success: - logger.info("Object successfully grasped!") - if self.place_target_position is not None: - logger.info("Transitioning to PLACE stage") - self.grasp_stage = GraspStage.PLACE - else: - self.overall_success = True - else: - logger.warning("No object detected in gripper") - self.task_failed = True - self.overall_success = False - return - if not self.waiting_for_reach: - logger.info("Retracting to pre-grasp position") - self.arm.cmd_ee_pose(self.final_pregrasp_pose, line_mode=True) - self.current_executed_pose = self.final_pregrasp_pose - self.arm.close_gripper() - self.waiting_for_reach = True - self.waiting_start_time = time.time() # type: ignore[assignment] - - def execute_place(self) -> None: - """Execute place stage: move to place position and release object.""" - if self.waiting_for_reach: - # Use the already executed pose instead of recalculating - if self.check_reach_and_adjust(): - logger.info("Reached place position, releasing gripper") - self.arm.release_gripper() - time.sleep(1.0) - self.place_pose = self.current_executed_pose - logger.info("Transitioning to RETRACT stage") - self.grasp_stage = GraspStage.RETRACT - return - - if not self.waiting_for_reach: - place_pose = self.get_place_target_pose() - if place_pose: - logger.info("Moving to place position") - self.arm.cmd_ee_pose(place_pose, line_mode=True) - self.current_executed_pose = place_pose # type: ignore[assignment] - self.waiting_for_reach = True 
- self.waiting_start_time = time.time() # type: ignore[assignment] - else: - logger.error("Failed to get place target pose") - self.task_failed = True - self.overall_success = False # type: ignore[assignment] - - def execute_retract(self) -> None: - """Execute retract stage: retract from place position.""" - if self.waiting_for_reach and self.retract_pose: - if self.check_reach_and_adjust(): - logger.info("Reached retract position") - logger.info("Returning to observe position") - self.arm.gotoObserve() - self.arm.close_gripper() - self.overall_success = True - logger.info("Pick and place completed successfully!") - return - - if not self.waiting_for_reach: - if self.place_pose: - pose_pitch = self.calculate_dynamic_grasp_pitch(self.place_pose) - self.retract_pose = update_target_grasp_pose( - self.place_pose, self.home_pose, self.retract_distance, pose_pitch - ) - logger.info("Retracting from place position") - self.arm.cmd_ee_pose(self.retract_pose, line_mode=True) - self.current_executed_pose = self.retract_pose - self.waiting_for_reach = True - self.waiting_start_time = time.time() - else: - logger.error("No place pose stored for retraction") - self.task_failed = True - self.overall_success = False # type: ignore[assignment] - - def capture_and_process( - self, - ) -> tuple[np.ndarray | None, Detection3DArray | None, Detection2DArray | None, Pose | None]: # type: ignore[type-arg] - """Capture frame from camera data and process detections.""" - if self.latest_rgb is None or self.latest_depth is None or self.detector is None: - return None, None, None, None - - ee_pose = self.arm.get_ee_pose() - ee_transform = pose_to_matrix(ee_pose) - camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera) - camera_pose = matrix_to_pose(camera_transform) - detection_3d_array, detection_2d_array = self.detector.process_frame( - self.latest_rgb, self.latest_depth, camera_transform - ) - - return self.latest_rgb, detection_3d_array, detection_2d_array, camera_pose 
- - def pick_target(self, x: int, y: int) -> bool: - """Select a target object at the given pixel coordinates.""" - if not self.last_detection_2d_array or not self.last_detection_3d_array: - logger.warning("No detections available for target selection") - return False - - clicked_3d = find_clicked_detection( - (x, y), self.last_detection_2d_array.detections, self.last_detection_3d_array.detections - ) - if clicked_3d and self.pbvs: - # Validate workspace - if not self.check_within_workspace(clicked_3d.bbox.center): - self.task_failed = True - return False - - self.pbvs.set_target(clicked_3d) - - if clicked_3d.bbox and clicked_3d.bbox.size: - self.target_object_height = clicked_3d.bbox.size.z - logger.info(f"Target object height: {self.target_object_height:.3f}m") - - position = clicked_3d.bbox.center.position - logger.info( - f"Target selected: ID={clicked_3d.id}, pos=({position.x:.3f}, {position.y:.3f}, {position.z:.3f})" - ) - self.grasp_stage = GraspStage.PRE_GRASP - self.reached_poses.clear() - self.adjustment_count = 0 - self.waiting_for_reach = False - self.current_executed_pose = None - self.stabilization_start_time = time.time() - return True - return False - - def update(self) -> dict[str, Any] | None: - """Main update function that handles capture, processing, control, and visualization.""" - rgb, detection_3d_array, detection_2d_array, camera_pose = self.capture_and_process() - if rgb is None: - return None - - self.last_detection_3d_array = detection_3d_array # type: ignore[assignment] - self.last_detection_2d_array = detection_2d_array # type: ignore[assignment] - if self.target_click: - x, y = self.target_click - if self.pick_target(x, y): - self.target_click = None - - if ( - detection_3d_array - and self.grasp_stage in [GraspStage.PRE_GRASP, GraspStage.GRASP] - and not self.waiting_for_reach - ): - self._update_tracking(detection_3d_array) - stage_handlers = { - GraspStage.IDLE: self.execute_idle, - GraspStage.PRE_GRASP: self.execute_pre_grasp, - 
GraspStage.GRASP: self.execute_grasp, - GraspStage.CLOSE_AND_RETRACT: self.execute_close_and_retract, - GraspStage.PLACE: self.execute_place, - GraspStage.RETRACT: self.execute_retract, - } - if self.grasp_stage in stage_handlers: - stage_handlers[self.grasp_stage]() - - target_tracked = self.pbvs.get_current_target() is not None if self.pbvs else False - ee_pose = self.arm.get_ee_pose() - feedback = Feedback( - grasp_stage=self.grasp_stage, - target_tracked=target_tracked, - current_executed_pose=self.current_executed_pose, - current_ee_pose=ee_pose, - current_camera_pose=camera_pose, - target_pose=self.pbvs.target_grasp_pose if self.pbvs else None, - waiting_for_reach=self.waiting_for_reach, - success=self.overall_success, - ) - - if self.task_running: - self.current_visualization = create_manipulation_visualization( # type: ignore[assignment] - rgb, feedback, detection_3d_array, detection_2d_array - ) - - if self.current_visualization is not None: - self._publish_visualization(self.current_visualization) - - return feedback # type: ignore[return-value] - - def _publish_visualization(self, viz_image: np.ndarray) -> None: # type: ignore[type-arg] - """Publish visualization image to LCM.""" - try: - viz_rgb = cv2.cvtColor(viz_image, cv2.COLOR_BGR2RGB) - msg = Image.from_numpy(viz_rgb) - self.viz_image.publish(msg) - except Exception as e: - logger.error(f"Error publishing visualization: {e}") - - def check_target_stabilized(self) -> bool: - """Check if the commanded poses have stabilized.""" - if len(self.reached_poses) < self.reached_poses.maxlen: # type: ignore[operator] - return False - - positions = np.array( - [[p.position.x, p.position.y, p.position.z] for p in self.reached_poses] - ) - std_devs = np.std(positions, axis=0) - return np.all(std_devs < self.pose_stabilization_threshold) # type: ignore[return-value] - - def get_place_target_pose(self) -> Pose | None: - """Get the place target pose with z-offset applied based on object height.""" - if 
self.place_target_position is None: - return None - - place_pos = self.place_target_position.copy() - if self.target_object_height is not None: - z_offset = self.target_object_height / 2.0 - place_pos[2] += z_offset + 0.1 - - place_center_pose = Pose( - position=Vector3(place_pos[0], place_pos[1], place_pos[2]), - orientation=Quaternion(0.0, 0.0, 0.0, 1.0), - ) - - ee_pose = self.arm.get_ee_pose() - - # Calculate dynamic pitch for place position - dynamic_pitch = self.calculate_dynamic_grasp_pitch(place_center_pose) - - place_pose = update_target_grasp_pose( - place_center_pose, - ee_pose, - grasp_distance=0.0, - grasp_pitch_degrees=dynamic_pitch, - ) - - return place_pose diff --git a/dimos/manipulation/visual_servoing/pbvs.py b/dimos/manipulation/visual_servoing/pbvs.py deleted file mode 100644 index f94c233834..0000000000 --- a/dimos/manipulation/visual_servoing/pbvs.py +++ /dev/null @@ -1,488 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Position-Based Visual Servoing (PBVS) system for robotic manipulation. -Supports both eye-in-hand and eye-to-hand configurations. 
-""" - -from collections import deque - -from dimos_lcm.vision_msgs import Detection3D -import numpy as np -from scipy.spatial.transform import Rotation as R # type: ignore[import-untyped] - -from dimos.manipulation.visual_servoing.utils import ( - create_pbvs_visualization, - find_best_object_match, - is_target_reached, - update_target_grasp_pose, -) -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.msgs.vision_msgs import Detection3DArray -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class PBVS: - """ - High-level Position-Based Visual Servoing orchestrator. - - Handles: - - Object tracking and target management - - Pregrasp distance computation - - Grasp pose generation - - Coordination with low-level controller - - Note: This class is agnostic to camera mounting (eye-in-hand vs eye-to-hand). - The caller is responsible for providing appropriate camera and EE poses. - """ - - def __init__( - self, - position_gain: float = 0.5, - rotation_gain: float = 0.3, - max_velocity: float = 0.1, # m/s - max_angular_velocity: float = 0.5, # rad/s - target_tolerance: float = 0.01, # 1cm - max_tracking_distance_threshold: float = 0.12, # Max distance for target tracking (m) - min_size_similarity: float = 0.6, # Min size similarity threshold (0.0-1.0) - direct_ee_control: bool = True, # If True, output target poses instead of velocities - ) -> None: - """ - Initialize PBVS system. 
- - Args: - position_gain: Proportional gain for position control - rotation_gain: Proportional gain for rotation control - max_velocity: Maximum linear velocity command magnitude (m/s) - max_angular_velocity: Maximum angular velocity command magnitude (rad/s) - target_tolerance: Distance threshold for considering target reached (m) - max_tracking_distance: Maximum distance for valid target tracking (m) - min_size_similarity: Minimum size similarity for valid target tracking (0.0-1.0) - direct_ee_control: If True, output target poses instead of velocity commands - """ - # Initialize low-level controller only if not in direct control mode - if not direct_ee_control: - self.controller = PBVSController( - position_gain=position_gain, - rotation_gain=rotation_gain, - max_velocity=max_velocity, - max_angular_velocity=max_angular_velocity, - target_tolerance=target_tolerance, - ) - else: - self.controller = None # type: ignore[assignment] - - # Store parameters for direct mode error computation - self.target_tolerance = target_tolerance - - # Target tracking parameters - self.max_tracking_distance_threshold = max_tracking_distance_threshold - self.min_size_similarity = min_size_similarity - self.direct_ee_control = direct_ee_control - - # Target state - self.current_target = None - self.target_grasp_pose = None - - # Detection history for robust tracking - self.detection_history_size = 3 - self.detection_history = deque(maxlen=self.detection_history_size) # type: ignore[var-annotated] - - # For direct control mode visualization - self.last_position_error = None - self.last_target_reached = False - - logger.info( - f"Initialized PBVS system with controller gains: pos={position_gain}, rot={rotation_gain}, " - f"tracking_thresholds: distance={max_tracking_distance_threshold}m, size={min_size_similarity:.2f}" - ) - - def set_target(self, target_object: Detection3D) -> bool: - """ - Set a new target object for servoing. 
- - Args: - target_object: Detection3D object - - Returns: - True if target was set successfully - """ - if target_object and target_object.bbox and target_object.bbox.center: - self.current_target = target_object - self.target_grasp_pose = None # Will be computed when needed - logger.info(f"New target set: ID {target_object.id}") - return True - return False - - def clear_target(self) -> None: - """Clear the current target.""" - self.current_target = None - self.target_grasp_pose = None - self.last_position_error = None - self.last_target_reached = False - self.detection_history.clear() - if self.controller: - self.controller.clear_state() - logger.info("Target cleared") - - def get_current_target(self) -> Detection3D | None: - """ - Get the current target object. - - Returns: - Current target Detection3D or None if no target selected - """ - return self.current_target - - def update_tracking(self, new_detections: Detection3DArray | None = None) -> bool: - """ - Update target tracking with new detections using a rolling window. - If tracking is lost, keeps the old target pose. 
- - Args: - new_detections: Optional new detections for target tracking - - Returns: - True if target was successfully tracked, False if lost (but target is kept) - """ - # Check if we have a current target - if not self.current_target: - return False - - # Add new detections to history if provided - if new_detections is not None and new_detections.detections_length > 0: - self.detection_history.append(new_detections) - - # If no detection history, can't track - if not self.detection_history: - logger.debug("No detection history for target tracking - using last known pose") - return False - - # Collect all candidates from detection history - all_candidates = [] - for detection_array in self.detection_history: - all_candidates.extend(detection_array.detections) - - if not all_candidates: - logger.debug("No candidates in detection history") - return False - - # Use stage-dependent distance threshold - max_distance = self.max_tracking_distance_threshold - - # Find best match across all recent detections - match_result = find_best_object_match( - target_obj=self.current_target, - candidates=all_candidates, - max_distance=max_distance, - min_size_similarity=self.min_size_similarity, - ) - - if match_result.is_valid_match: - self.current_target = match_result.matched_object - self.target_grasp_pose = None # Recompute grasp pose - logger.debug( - f"Target tracking successful: distance={match_result.distance:.3f}m, " - f"size_similarity={match_result.size_similarity:.2f}, " - f"confidence={match_result.confidence:.2f}" - ) - return True - - logger.debug( - f"Target tracking lost across {len(self.detection_history)} frames: " - f"distance={match_result.distance:.3f}m, " - f"size_similarity={match_result.size_similarity:.2f}, " - f"thresholds: distance={max_distance:.3f}m, size={self.min_size_similarity:.2f}" - ) - return False - - def compute_control( - self, - ee_pose: Pose, - grasp_distance: float = 0.15, - grasp_pitch_degrees: float = 45.0, - ) -> tuple[Vector3 | None, 
Vector3 | None, bool, bool, Pose | None]: - """ - Compute PBVS control with position and orientation servoing. - - Args: - ee_pose: Current end-effector pose - grasp_distance: Distance to maintain from target (meters) - - Returns: - Tuple of (velocity_command, angular_velocity_command, target_reached, has_target, target_pose) - - velocity_command: Linear velocity vector or None if no target (None in direct_ee_control mode) - - angular_velocity_command: Angular velocity vector or None if no target (None in direct_ee_control mode) - - target_reached: True if within target tolerance - - has_target: True if currently tracking a target - - target_pose: Target EE pose (only in direct_ee_control mode, otherwise None) - """ - # Check if we have a target - if not self.current_target: - return None, None, False, False, None - - # Update target grasp pose with provided distance and pitch - self.target_grasp_pose = update_target_grasp_pose( - self.current_target.bbox.center, ee_pose, grasp_distance, grasp_pitch_degrees - ) - - if self.target_grasp_pose is None: - logger.warning("Failed to compute grasp pose") - return None, None, False, False, None - - # Compute errors for visualization before checking if reached (in case pose gets cleared) - if self.direct_ee_control and self.target_grasp_pose: - self.last_position_error = Vector3( - self.target_grasp_pose.position.x - ee_pose.position.x, - self.target_grasp_pose.position.y - ee_pose.position.y, - self.target_grasp_pose.position.z - ee_pose.position.z, - ) - - # Check if target reached using our separate function - target_reached = is_target_reached(self.target_grasp_pose, ee_pose, self.target_tolerance) - - # Return appropriate values based on control mode - if self.direct_ee_control: - # Direct control mode - if self.target_grasp_pose: - self.last_target_reached = target_reached - # Return has_target=True since we have a target - return None, None, target_reached, True, self.target_grasp_pose - else: - return None, None, 
False, True, None - else: - # Velocity control mode - use controller - velocity_cmd, angular_velocity_cmd, _controller_reached = ( - self.controller.compute_control(ee_pose, self.target_grasp_pose) - ) - # Return has_target=True since we have a target, regardless of tracking status - return velocity_cmd, angular_velocity_cmd, target_reached, True, None - - def create_status_overlay( # type: ignore[no-untyped-def] - self, - image: np.ndarray, # type: ignore[type-arg] - grasp_stage=None, - ) -> np.ndarray: # type: ignore[type-arg] - """ - Create PBVS status overlay on image. - - Args: - image: Input image - grasp_stage: Current grasp stage (optional) - - Returns: - Image with PBVS status overlay - """ - stage_value = grasp_stage.value if grasp_stage else "idle" - return create_pbvs_visualization( - image, - self.current_target, - self.last_position_error, - self.last_target_reached, - stage_value, - ) - - -class PBVSController: - """ - Low-level Position-Based Visual Servoing controller. - Pure control logic that computes velocity commands from poses. - - Handles: - - Position and orientation error computation - - Velocity command generation with gain control - - Target reached detection - """ - - def __init__( - self, - position_gain: float = 0.5, - rotation_gain: float = 0.3, - max_velocity: float = 0.1, # m/s - max_angular_velocity: float = 0.5, # rad/s - target_tolerance: float = 0.01, # 1cm - ) -> None: - """ - Initialize PBVS controller. 
- - Args: - position_gain: Proportional gain for position control - rotation_gain: Proportional gain for rotation control - max_velocity: Maximum linear velocity command magnitude (m/s) - max_angular_velocity: Maximum angular velocity command magnitude (rad/s) - target_tolerance: Distance threshold for considering target reached (m) - """ - self.position_gain = position_gain - self.rotation_gain = rotation_gain - self.max_velocity = max_velocity - self.max_angular_velocity = max_angular_velocity - self.target_tolerance = target_tolerance - - self.last_position_error = None - self.last_rotation_error = None - self.last_velocity_cmd = None - self.last_angular_velocity_cmd = None - self.last_target_reached = False - - logger.info( - f"Initialized PBVS controller: pos_gain={position_gain}, rot_gain={rotation_gain}, " - f"max_vel={max_velocity}m/s, max_ang_vel={max_angular_velocity}rad/s, " - f"target_tolerance={target_tolerance}m" - ) - - def clear_state(self) -> None: - """Clear controller state.""" - self.last_position_error = None - self.last_rotation_error = None - self.last_velocity_cmd = None - self.last_angular_velocity_cmd = None - self.last_target_reached = False - - def compute_control( - self, ee_pose: Pose, grasp_pose: Pose - ) -> tuple[Vector3 | None, Vector3 | None, bool]: - """ - Compute PBVS control with position and orientation servoing. 
- - Args: - ee_pose: Current end-effector pose - grasp_pose: Target grasp pose - - Returns: - Tuple of (velocity_command, angular_velocity_command, target_reached) - - velocity_command: Linear velocity vector - - angular_velocity_command: Angular velocity vector - - target_reached: True if within target tolerance - """ - # Calculate position error (target - EE position) - error = Vector3( - grasp_pose.position.x - ee_pose.position.x, - grasp_pose.position.y - ee_pose.position.y, - grasp_pose.position.z - ee_pose.position.z, - ) - self.last_position_error = error # type: ignore[assignment] - - # Compute velocity command with proportional control - velocity_cmd = Vector3( - error.x * self.position_gain, - error.y * self.position_gain, - error.z * self.position_gain, - ) - - # Limit velocity magnitude - vel_magnitude = np.linalg.norm([velocity_cmd.x, velocity_cmd.y, velocity_cmd.z]) - if vel_magnitude > self.max_velocity: - scale = self.max_velocity / vel_magnitude - velocity_cmd = Vector3( - float(velocity_cmd.x * scale), - float(velocity_cmd.y * scale), - float(velocity_cmd.z * scale), - ) - - self.last_velocity_cmd = velocity_cmd # type: ignore[assignment] - - # Compute angular velocity for orientation control - angular_velocity_cmd = self._compute_angular_velocity(grasp_pose.orientation, ee_pose) - - # Check if target reached - error_magnitude = np.linalg.norm([error.x, error.y, error.z]) - target_reached = bool(error_magnitude < self.target_tolerance) - self.last_target_reached = target_reached - - return velocity_cmd, angular_velocity_cmd, target_reached - - def _compute_angular_velocity(self, target_rot: Quaternion, current_pose: Pose) -> Vector3: - """ - Compute angular velocity commands for orientation control. - Uses quaternion error computation for better numerical stability. 
- - Args: - target_rot: Target orientation (quaternion) - current_pose: Current EE pose - - Returns: - Angular velocity command as Vector3 - """ - # Use quaternion error for better numerical stability - - # Convert to scipy Rotation objects - target_rot_scipy = R.from_quat([target_rot.x, target_rot.y, target_rot.z, target_rot.w]) - current_rot_scipy = R.from_quat( - [ - current_pose.orientation.x, - current_pose.orientation.y, - current_pose.orientation.z, - current_pose.orientation.w, - ] - ) - - # Compute rotation error: error = target * current^(-1) - error_rot = target_rot_scipy * current_rot_scipy.inv() - - # Convert to axis-angle representation for control - error_axis_angle = error_rot.as_rotvec() - - # Use axis-angle directly as angular velocity error (small angle approximation) - roll_error = error_axis_angle[0] - pitch_error = error_axis_angle[1] - yaw_error = error_axis_angle[2] - - self.last_rotation_error = Vector3(roll_error, pitch_error, yaw_error) # type: ignore[assignment] - - # Apply proportional control - angular_velocity = Vector3( - roll_error * self.rotation_gain, - pitch_error * self.rotation_gain, - yaw_error * self.rotation_gain, - ) - - # Limit angular velocity magnitude - ang_vel_magnitude = np.sqrt( - angular_velocity.x**2 + angular_velocity.y**2 + angular_velocity.z**2 - ) - if ang_vel_magnitude > self.max_angular_velocity: - scale = self.max_angular_velocity / ang_vel_magnitude - angular_velocity = Vector3( - angular_velocity.x * scale, angular_velocity.y * scale, angular_velocity.z * scale - ) - - self.last_angular_velocity_cmd = angular_velocity # type: ignore[assignment] - - return angular_velocity - - def create_status_overlay( - self, - image: np.ndarray, # type: ignore[type-arg] - current_target: Detection3D | None = None, - ) -> np.ndarray: # type: ignore[type-arg] - """ - Create PBVS status overlay on image. 
- - Args: - image: Input image - current_target: Current target object Detection3D (for display) - - Returns: - Image with PBVS status overlay - """ - return create_pbvs_visualization( - image, - current_target, - self.last_position_error, - self.last_target_reached, - "velocity_control", - ) diff --git a/dimos/manipulation/visual_servoing/utils.py b/dimos/manipulation/visual_servoing/utils.py deleted file mode 100644 index 5922739429..0000000000 --- a/dimos/manipulation/visual_servoing/utils.py +++ /dev/null @@ -1,801 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Any - -import cv2 -from dimos_lcm.vision_msgs import Detection2D, Detection3D -import numpy as np - -from dimos.msgs.geometry_msgs import Pose, Quaternion, Vector3 -from dimos.perception.common.utils import project_2d_points_to_3d -from dimos.perception.detection2d.utils import plot_results -from dimos.utils.transform_utils import ( - compose_transforms, - euler_to_quaternion, - get_distance, - matrix_to_pose, - offset_distance, - optical_to_robot_frame, - pose_to_matrix, - robot_to_optical_frame, - yaw_towards_point, -) - - -def match_detection_by_id( - detection_3d: Detection3D, detections_3d: list[Detection3D], detections_2d: list[Detection2D] -) -> Detection2D | None: - """ - Find the corresponding Detection2D for a given Detection3D. 
- - Args: - detection_3d: The Detection3D to match - detections_3d: List of all Detection3D objects - detections_2d: List of all Detection2D objects (must be 1:1 correspondence) - - Returns: - Corresponding Detection2D if found, None otherwise - """ - for i, det_3d in enumerate(detections_3d): - if det_3d.id == detection_3d.id and i < len(detections_2d): - return detections_2d[i] - return None - - -def transform_pose( - obj_pos: np.ndarray, # type: ignore[type-arg] - obj_orientation: np.ndarray, # type: ignore[type-arg] - transform_matrix: np.ndarray, # type: ignore[type-arg] - to_optical: bool = False, - to_robot: bool = False, -) -> Pose: - """ - Transform object pose with optional frame convention conversion. - - Args: - obj_pos: Object position [x, y, z] - obj_orientation: Object orientation [roll, pitch, yaw] in radians - transform_matrix: 4x4 transformation matrix from camera frame to desired frame - to_optical: If True, input is in robot frame → convert result to optical frame - to_robot: If True, input is in optical frame → convert to robot frame first - - Returns: - Object pose in desired frame as Pose - """ - # Convert euler angles to quaternion using utility function - euler_vector = Vector3(obj_orientation[0], obj_orientation[1], obj_orientation[2]) - obj_orientation_quat = euler_to_quaternion(euler_vector) - - input_pose = Pose( - position=Vector3(obj_pos[0], obj_pos[1], obj_pos[2]), orientation=obj_orientation_quat - ) - - # Apply input frame conversion based on flags - if to_robot: - # Input is in optical frame → convert to robot frame first - pose_for_transform = optical_to_robot_frame(input_pose) - else: - # Default or to_optical: use input pose as-is - pose_for_transform = input_pose - - # Create transformation matrix from pose (relative to camera) - T_camera_object = pose_to_matrix(pose_for_transform) - - # Use compose_transforms to combine transformations - T_desired_object = compose_transforms(transform_matrix, T_camera_object) - - # Convert 
back to pose - result_pose = matrix_to_pose(T_desired_object) - - # Apply output frame conversion based on flags - if to_optical: - # Input was robot frame → convert result to optical frame - desired_pose = robot_to_optical_frame(result_pose) - else: - # Default or to_robot: use result as-is - desired_pose = result_pose - - return desired_pose - - -def transform_points_3d( - points_3d: np.ndarray, # type: ignore[type-arg] - transform_matrix: np.ndarray, # type: ignore[type-arg] - to_optical: bool = False, - to_robot: bool = False, -) -> np.ndarray: # type: ignore[type-arg] - """ - Transform 3D points with optional frame convention conversion. - Applies the same transformation pipeline as transform_pose but for multiple points. - - Args: - points_3d: Nx3 array of 3D points [x, y, z] - transform_matrix: 4x4 transformation matrix from camera frame to desired frame - to_optical: If True, input is in robot frame → convert result to optical frame - to_robot: If True, input is in optical frame → convert to robot frame first - - Returns: - Nx3 array of transformed 3D points in desired frame - """ - if points_3d.size == 0: - return np.zeros((0, 3), dtype=np.float32) - - points_3d = np.asarray(points_3d) - if points_3d.ndim == 1: - points_3d = points_3d.reshape(1, -1) - - transformed_points = [] - - for point in points_3d: - input_point_pose = Pose( - position=Vector3(point[0], point[1], point[2]), - orientation=Quaternion(0.0, 0.0, 0.0, 1.0), # Identity quaternion - ) - - # Apply input frame conversion based on flags - if to_robot: - # Input is in optical frame → convert to robot frame first - pose_for_transform = optical_to_robot_frame(input_point_pose) - else: - # Default or to_optical: use input pose as-is - pose_for_transform = input_point_pose - - # Create transformation matrix from point pose (relative to camera) - T_camera_point = pose_to_matrix(pose_for_transform) - - # Use compose_transforms to combine transformations - T_desired_point = 
compose_transforms(transform_matrix, T_camera_point) - - # Convert back to pose - result_pose = matrix_to_pose(T_desired_point) - - # Apply output frame conversion based on flags - if to_optical: - # Input was robot frame → convert result to optical frame - desired_pose = robot_to_optical_frame(result_pose) - else: - # Default or to_robot: use result as-is - desired_pose = result_pose - - transformed_point = [ - desired_pose.position.x, - desired_pose.position.y, - desired_pose.position.z, - ] - transformed_points.append(transformed_point) - - return np.array(transformed_points, dtype=np.float32) - - -def select_points_from_depth( - depth_image: np.ndarray, # type: ignore[type-arg] - target_point: tuple[int, int], - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] - radius: int = 5, -) -> np.ndarray: # type: ignore[type-arg] - """ - Select points around a target point within a bounding box and project them to 3D. - - Args: - depth_image: Depth image in meters (H, W) - target_point: (x, y) target point coordinates - radius: Half-width of the bounding box (so bbox size is radius*2 x radius*2) - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - - Returns: - Nx3 array of 3D points (X, Y, Z) in camera frame - """ - x_target, y_target = target_point - height, width = depth_image.shape - - x_min = max(0, x_target - radius) - x_max = min(width, x_target + radius) - y_min = max(0, y_target - radius) - y_max = min(height, y_target + radius) - - # Create coordinate grids for the bounding box (vectorized) - y_coords, x_coords = np.meshgrid(range(y_min, y_max), range(x_min, x_max), indexing="ij") - - # Flatten to get all coordinate pairs - x_flat = x_coords.flatten() - y_flat = y_coords.flatten() - - # Extract corresponding depth values using advanced indexing - depth_flat = depth_image[y_flat, x_flat] - - valid_mask = (depth_flat > 0) & np.isfinite(depth_flat) - - if not np.any(valid_mask): - return np.zeros((0, 3), 
dtype=np.float32) - - points_2d = np.column_stack([x_flat[valid_mask], y_flat[valid_mask]]).astype(np.float32) - depth_values = depth_flat[valid_mask].astype(np.float32) - - points_3d = project_2d_points_to_3d(points_2d, depth_values, camera_intrinsics) - - return points_3d - - -def update_target_grasp_pose( - target_pose: Pose, ee_pose: Pose, grasp_distance: float = 0.0, grasp_pitch_degrees: float = 45.0 -) -> Pose | None: - """ - Update target grasp pose based on current target pose and EE pose. - - Args: - target_pose: Target pose to grasp - ee_pose: Current end-effector pose - grasp_distance: Distance to maintain from target (pregrasp or grasp distance) - grasp_pitch_degrees: Grasp pitch angle in degrees (default 90° for top-down) - - Returns: - Target grasp pose or None if target is invalid - """ - - target_pos = target_pose.position - - # Calculate orientation pointing from target towards EE - yaw_to_ee = yaw_towards_point(target_pos, ee_pose.position) - - # Create target pose with proper orientation - # Convert grasp pitch from degrees to radians with mapping: - # 0° (level) -> π/2 (1.57 rad), 90° (top-down) -> π (3.14 rad) - pitch_radians = 1.57 + np.radians(grasp_pitch_degrees) - - # Convert euler angles to quaternion using utility function - euler = Vector3(0.0, pitch_radians, yaw_to_ee) # roll=0, pitch=mapped, yaw=calculated - target_orientation = euler_to_quaternion(euler) - - updated_pose = Pose(target_pos, target_orientation) - - if grasp_distance > 0.0: - return offset_distance(updated_pose, grasp_distance) - else: - return updated_pose - - -def is_target_reached(target_pose: Pose, current_pose: Pose, tolerance: float = 0.01) -> bool: - """ - Check if the target pose has been reached within tolerance. 
- - Args: - target_pose: Target pose to reach - current_pose: Current pose (e.g., end-effector pose) - tolerance: Distance threshold for considering target reached (meters, default 0.01 = 1cm) - - Returns: - True if target is reached within tolerance, False otherwise - """ - # Calculate position error using distance utility - error_magnitude = get_distance(target_pose, current_pose) - return error_magnitude < tolerance - - -@dataclass -class ObjectMatchResult: - """Result of object matching with confidence metrics.""" - - matched_object: Detection3D | None - confidence: float - distance: float - size_similarity: float - is_valid_match: bool - - -def calculate_object_similarity( - target_obj: Detection3D, - candidate_obj: Detection3D, - distance_weight: float = 0.6, - size_weight: float = 0.4, -) -> tuple[float, float, float]: - """ - Calculate comprehensive similarity between two objects. - - Args: - target_obj: Target Detection3D object - candidate_obj: Candidate Detection3D object - distance_weight: Weight for distance component (0-1) - size_weight: Weight for size component (0-1) - - Returns: - Tuple of (total_similarity, distance_m, size_similarity) - """ - # Extract positions - target_pos = target_obj.bbox.center.position - candidate_pos = candidate_obj.bbox.center.position - - target_xyz = np.array([target_pos.x, target_pos.y, target_pos.z]) - candidate_xyz = np.array([candidate_pos.x, candidate_pos.y, candidate_pos.z]) - - # Calculate Euclidean distance - distance = np.linalg.norm(target_xyz - candidate_xyz) - distance_similarity = 1.0 / (1.0 + distance) # Exponential decay - - # Calculate size similarity by comparing each dimension individually - size_similarity = 1.0 # Default if no size info - target_size = target_obj.bbox.size - candidate_size = candidate_obj.bbox.size - - if target_size and candidate_size: - # Extract dimensions - target_dims = [target_size.x, target_size.y, target_size.z] - candidate_dims = [candidate_size.x, candidate_size.y, 
candidate_size.z] - - # Calculate similarity for each dimension pair - dim_similarities = [] - for target_dim, candidate_dim in zip(target_dims, candidate_dims, strict=False): - if target_dim == 0.0 and candidate_dim == 0.0: - dim_similarities.append(1.0) # Both dimensions are zero - elif target_dim == 0.0 or candidate_dim == 0.0: - dim_similarities.append(0.0) # One dimension is zero, other is not - else: - # Calculate similarity as min/max ratio - max_dim = max(target_dim, candidate_dim) - min_dim = min(target_dim, candidate_dim) - dim_similarity = min_dim / max_dim if max_dim > 0 else 0.0 - dim_similarities.append(dim_similarity) - - # Return average similarity across all dimensions - size_similarity = np.mean(dim_similarities) if dim_similarities else 0.0 # type: ignore[assignment] - - # Weighted combination - total_similarity = distance_weight * distance_similarity + size_weight * size_similarity - - return total_similarity, distance, size_similarity # type: ignore[return-value] - - -def find_best_object_match( - target_obj: Detection3D, - candidates: list[Detection3D], - max_distance: float = 0.1, - min_size_similarity: float = 0.4, - distance_weight: float = 0.7, - size_weight: float = 0.3, -) -> ObjectMatchResult: - """ - Find the best matching object from candidates using distance and size criteria. 
- - Args: - target_obj: Target Detection3D to match against - candidates: List of candidate Detection3D objects - max_distance: Maximum allowed distance for valid match (meters) - min_size_similarity: Minimum size similarity for valid match (0-1) - distance_weight: Weight for distance in similarity calculation - size_weight: Weight for size in similarity calculation - - Returns: - ObjectMatchResult with best match and confidence metrics - """ - if not candidates or not target_obj.bbox or not target_obj.bbox.center: - return ObjectMatchResult(None, 0.0, float("inf"), 0.0, False) - - best_match = None - best_confidence = 0.0 - best_distance = float("inf") - best_size_sim = 0.0 - - for candidate in candidates: - if not candidate.bbox or not candidate.bbox.center: - continue - - similarity, distance, size_sim = calculate_object_similarity( - target_obj, candidate, distance_weight, size_weight - ) - - # Check validity constraints - is_valid = distance <= max_distance and size_sim >= min_size_similarity - - if is_valid and similarity > best_confidence: - best_match = candidate - best_confidence = similarity - best_distance = distance - best_size_sim = size_sim - - return ObjectMatchResult( - matched_object=best_match, - confidence=best_confidence, - distance=best_distance, - size_similarity=best_size_sim, - is_valid_match=best_match is not None, - ) - - -def parse_zed_pose(zed_pose_data: dict[str, Any]) -> Pose | None: - """ - Parse ZED pose data dictionary into a Pose object. 
- - Args: - zed_pose_data: Dictionary from ZEDCamera.get_pose() containing: - - position: [x, y, z] in meters - - rotation: [x, y, z, w] quaternion - - euler_angles: [roll, pitch, yaw] in radians - - valid: Whether pose is valid - - Returns: - Pose object with position and orientation, or None if invalid - """ - if not zed_pose_data or not zed_pose_data.get("valid", False): - return None - - # Extract position - position = zed_pose_data.get("position", [0, 0, 0]) - pos_vector = Vector3(position[0], position[1], position[2]) - - quat = zed_pose_data["rotation"] - orientation = Quaternion(quat[0], quat[1], quat[2], quat[3]) - return Pose(position=pos_vector, orientation=orientation) - - -def estimate_object_depth( - depth_image: np.ndarray, # type: ignore[type-arg] - segmentation_mask: np.ndarray | None, # type: ignore[type-arg] - bbox: list[float], -) -> float: - """ - Estimate object depth dimension using segmentation mask and depth data. - Optimized for real-time performance. - - Args: - depth_image: Depth image in meters - segmentation_mask: Binary segmentation mask for the object - bbox: Bounding box [x1, y1, x2, y2] - - Returns: - Estimated object depth in meters - """ - x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) - - # Extract depth ROI once - roi_depth = depth_image[y1:y2, x1:x2] - - if segmentation_mask is not None and segmentation_mask.size > 0: - # Extract mask ROI efficiently - mask_roi = ( - segmentation_mask[y1:y2, x1:x2] - if segmentation_mask.shape != roi_depth.shape - else segmentation_mask - ) - - # Fast mask application using boolean indexing - valid_mask = mask_roi > 0 - if np.sum(valid_mask) > 10: # Early exit if not enough points - masked_depths = roi_depth[valid_mask] - - # Fast percentile calculation using numpy's optimized functions - depth_90 = np.percentile(masked_depths, 90) - depth_10 = np.percentile(masked_depths, 10) - depth_range = depth_90 - depth_10 - - # Clamp to reasonable bounds with single operation - 
return np.clip(depth_range, 0.02, 0.5) # type: ignore[no-any-return] - - # Fast fallback using area calculation - bbox_area = (x2 - x1) * (y2 - y1) - - # Vectorized area-based estimation - if bbox_area > 10000: - return 0.15 - elif bbox_area > 5000: - return 0.10 - else: - return 0.05 - - -# ============= Visualization Functions ============= - - -def create_manipulation_visualization( # type: ignore[no-untyped-def] - rgb_image: np.ndarray, # type: ignore[type-arg] - feedback, - detection_3d_array=None, - detection_2d_array=None, -) -> np.ndarray: # type: ignore[type-arg] - """ - Create simple visualization for manipulation class using feedback. - - Args: - rgb_image: RGB image array - feedback: Feedback object containing all state information - detection_3d_array: Optional 3D detections for object visualization - detection_2d_array: Optional 2D detections for object visualization - - Returns: - BGR image with visualization overlays - """ - # Convert to BGR for OpenCV - viz = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - # Draw detections if available - if detection_3d_array and detection_2d_array: - # Extract 2D bboxes - bboxes_2d = [] - for det_2d in detection_2d_array.detections: - if det_2d.bbox: - x1 = det_2d.bbox.center.position.x - det_2d.bbox.size_x / 2 - y1 = det_2d.bbox.center.position.y - det_2d.bbox.size_y / 2 - x2 = det_2d.bbox.center.position.x + det_2d.bbox.size_x / 2 - y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2 - bboxes_2d.append([x1, y1, x2, y2]) - - # Draw basic detections - rgb_with_detections = visualize_detections_3d( - rgb_image, detection_3d_array.detections, show_coordinates=True, bboxes_2d=bboxes_2d - ) - viz = cv2.cvtColor(rgb_with_detections, cv2.COLOR_RGB2BGR) - - # Add manipulation status overlay - status_y = 30 - cv2.putText( - viz, - "Eye-in-Hand Visual Servoing", - (10, status_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 255, 255), - 2, - ) - - # Stage information - stage_text = f"Stage: 
{feedback.grasp_stage.value.upper()}" - stage_color = { - "idle": (100, 100, 100), - "pre_grasp": (0, 255, 255), - "grasp": (0, 255, 0), - "close_and_retract": (255, 0, 255), - "place": (0, 150, 255), - "retract": (255, 150, 0), - }.get(feedback.grasp_stage.value, (255, 255, 255)) - - cv2.putText( - viz, - stage_text, - (10, status_y + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - stage_color, - 1, - ) - - # Target tracking status - if feedback.target_tracked: - cv2.putText( - viz, - "Target: TRACKED", - (10, status_y + 45), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (0, 255, 0), - 1, - ) - elif feedback.grasp_stage.value != "idle": - cv2.putText( - viz, - "Target: LOST", - (10, status_y + 45), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (0, 0, 255), - 1, - ) - - # Waiting status - if feedback.waiting_for_reach: - cv2.putText( - viz, - "Status: WAITING FOR ROBOT", - (10, status_y + 65), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 255, 0), - 1, - ) - - # Overall result - if feedback.success is not None: - result_text = "Pick & Place: SUCCESS" if feedback.success else "Pick & Place: FAILED" - result_color = (0, 255, 0) if feedback.success else (0, 0, 255) - cv2.putText( - viz, - result_text, - (10, status_y + 85), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - result_color, - 2, - ) - - # Control hints (bottom of image) - hint_text = "Click object to grasp | s=STOP | r=RESET | g=RELEASE" - cv2.putText( - viz, - hint_text, - (10, viz.shape[0] - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (200, 200, 200), - 1, - ) - - return viz - - -def create_pbvs_visualization( # type: ignore[no-untyped-def] - image: np.ndarray, # type: ignore[type-arg] - current_target=None, - position_error=None, - target_reached: bool = False, - grasp_stage: str = "idle", -) -> np.ndarray: # type: ignore[type-arg] - """ - Create simple PBVS visualization overlay. 
- - Args: - image: Input image (RGB or BGR) - current_target: Current target Detection3D - position_error: Position error Vector3 - target_reached: Whether target is reached - grasp_stage: Current grasp stage string - - Returns: - Image with PBVS overlay - """ - viz = image.copy() - - # Only show PBVS info if we have a target - if current_target is None: - return viz - - # Create status panel at bottom - height, width = viz.shape[:2] - panel_height = 100 - panel_y = height - panel_height - - # Semi-transparent overlay - overlay = viz.copy() - cv2.rectangle(overlay, (0, panel_y), (width, height), (0, 0, 0), -1) - viz = cv2.addWeighted(viz, 0.7, overlay, 0.3, 0) - - # PBVS Status - y_offset = panel_y + 20 - cv2.putText( - viz, - "PBVS Control", - (10, y_offset), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 255, 255), - 2, - ) - - # Position error - if position_error: - error_mag = np.linalg.norm([position_error.x, position_error.y, position_error.z]) - error_text = f"Error: {error_mag * 100:.1f}cm" - error_color = (0, 255, 0) if target_reached else (0, 255, 255) - cv2.putText( - viz, - error_text, - (10, y_offset + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - error_color, - 1, - ) - - # Stage - cv2.putText( - viz, - f"Stage: {grasp_stage}", - (10, y_offset + 45), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 150, 255), - 1, - ) - - # Target reached indicator - if target_reached: - cv2.putText( - viz, - "TARGET REACHED", - (width - 150, y_offset + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.6, - (0, 255, 0), - 2, - ) - - return viz - - -def visualize_detections_3d( - rgb_image: np.ndarray, # type: ignore[type-arg] - detections: list[Detection3D], - show_coordinates: bool = True, - bboxes_2d: list[list[float]] | None = None, -) -> np.ndarray: # type: ignore[type-arg] - """ - Visualize detections with 3D position overlay next to bounding boxes. 
- - Args: - rgb_image: Original RGB image - detections: List of Detection3D objects - show_coordinates: Whether to show 3D coordinates next to bounding boxes - bboxes_2d: Optional list of 2D bounding boxes corresponding to detections - - Returns: - Visualization image - """ - if not detections: - return rgb_image.copy() - - # If no 2D bboxes provided, skip visualization - if bboxes_2d is None: - return rgb_image.copy() - - # Extract data for plot_results function - bboxes = bboxes_2d - track_ids = [int(det.id) if det.id.isdigit() else i for i, det in enumerate(detections)] - class_ids = [i for i in range(len(detections))] - confidences = [ - det.results[0].hypothesis.score if det.results_length > 0 else 0.0 for det in detections - ] - names = [ - det.results[0].hypothesis.class_id if det.results_length > 0 else "unknown" - for det in detections - ] - - # Use plot_results for basic visualization - viz = plot_results(rgb_image, bboxes, track_ids, class_ids, confidences, names) - - # Add 3D position coordinates if requested - if show_coordinates and bboxes_2d is not None: - for i, det in enumerate(detections): - if det.bbox and det.bbox.center and i < len(bboxes_2d): - position = det.bbox.center.position - bbox = bboxes_2d[i] - - pos_xyz = np.array([position.x, position.y, position.z]) - - # Get bounding box coordinates - _x1, y1, x2, _y2 = map(int, bbox) - - # Add position text next to bounding box (top-right corner) - pos_text = f"({pos_xyz[0]:.2f}, {pos_xyz[1]:.2f}, {pos_xyz[2]:.2f})" - text_x = x2 + 5 # Right edge of bbox + small offset - text_y = y1 + 15 # Top edge of bbox + small offset - - # Add background rectangle for better readability - text_size = cv2.getTextSize(pos_text, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)[0] - cv2.rectangle( - viz, - (text_x - 2, text_y - text_size[1] - 2), - (text_x + text_size[0] + 2, text_y + 2), - (0, 0, 0), - -1, - ) - - cv2.putText( - viz, - pos_text, - (text_x, text_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (255, 255, 255), - 1, - ) 
- - return viz # type: ignore[no-any-return] diff --git a/dimos/msgs/geometry_msgs/Quaternion.py b/dimos/msgs/geometry_msgs/Quaternion.py index d19436d441..02c9592ea6 100644 --- a/dimos/msgs/geometry_msgs/Quaternion.py +++ b/dimos/msgs/geometry_msgs/Quaternion.py @@ -138,6 +138,20 @@ def from_euler(cls, vector: Vector3) -> Quaternion: return cls(x, y, z, w) + @classmethod + def from_rotation_matrix(cls, matrix: np.ndarray) -> Quaternion: # type: ignore[type-arg] + """Convert a 3x3 rotation matrix to quaternion. + + Args: + matrix: 3x3 rotation matrix (numpy array) + + Returns: + Quaternion representation + """ + rotation = R.from_matrix(matrix) + quat = rotation.as_quat() # Returns [x, y, z, w] + return cls(quat[0], quat[1], quat[2], quat[3]) + def to_euler(self) -> Vector3: """Convert quaternion to Euler angles (roll, pitch, yaw) in radians. diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py index 26ed141867..de3e7abeca 100644 --- a/dimos/msgs/sensor_msgs/Image.py +++ b/dimos/msgs/sensor_msgs/Image.py @@ -310,6 +310,10 @@ def to_cupy(self) -> Image: def to_opencv(self) -> np.ndarray: # type: ignore[type-arg] return self._impl.to_opencv() + def as_numpy(self) -> np.ndarray: # type: ignore[type-arg] + """Get image data as numpy array in RGB format.""" + return np.asarray(self.data) + def to_rgb(self) -> Image: return Image(self._impl.to_rgb()) @@ -352,6 +356,20 @@ def sharpness(self) -> float: """Return sharpness score.""" return self._impl.sharpness() + def to_depth_meters(self) -> Image: + """Return a depth image normalized to meters as float32.""" + depth_cv = self.to_opencv() + fmt = self.format + + if fmt == ImageFormat.DEPTH16: + depth_cv = depth_cv.astype(np.float32) / 1000.0 + fmt = ImageFormat.DEPTH + elif depth_cv.dtype != np.float32: + depth_cv = depth_cv.astype(np.float32) + fmt = ImageFormat.DEPTH if fmt == ImageFormat.DEPTH else fmt + + return Image.from_numpy(depth_cv, format=fmt, frame_id=self.frame_id, ts=self.ts) + def 
save(self, filepath: str) -> bool: return self._impl.save(filepath) diff --git a/dimos/msgs/sensor_msgs/PointCloud2.py b/dimos/msgs/sensor_msgs/PointCloud2.py index 772ddad7ab..d68c62f51d 100644 --- a/dimos/msgs/sensor_msgs/PointCloud2.py +++ b/dimos/msgs/sensor_msgs/PointCloud2.py @@ -354,6 +354,8 @@ def transform(self, tf: Transform) -> PointCloud2: def voxel_downsample(self, voxel_size: float = 0.025) -> PointCloud2: """Downsample the pointcloud with a voxel grid.""" + if voxel_size <= 0: + return self if len(self.pointcloud.points) < 20: return self downsampled = self._pcd_tensor.voxel_down_sample(voxel_size) diff --git a/dimos/msgs/vision_msgs/Detection2D.py b/dimos/msgs/vision_msgs/Detection2D.py new file mode 100644 index 0000000000..aa957f8061 --- /dev/null +++ b/dimos/msgs/vision_msgs/Detection2D.py @@ -0,0 +1,27 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from dimos_lcm.vision_msgs.Detection2D import Detection2D as LCMDetection2D + +from dimos.types.timestamped import to_timestamp + + +class Detection2D(LCMDetection2D): # type: ignore[misc] + msg_name = "vision_msgs.Detection2D" + + # for _get_field_type() to work when decoding in _decode_one() + __annotations__ = LCMDetection2D.__annotations__ + + @property + def ts(self) -> float: + return to_timestamp(self.header.stamp) diff --git a/dimos/msgs/vision_msgs/Detection3D.py b/dimos/msgs/vision_msgs/Detection3D.py new file mode 100644 index 0000000000..e074ecb0b1 --- /dev/null +++ b/dimos/msgs/vision_msgs/Detection3D.py @@ -0,0 +1,27 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dimos_lcm.vision_msgs.Detection3D import Detection3D as LCMDetection3D + +from dimos.types.timestamped import to_timestamp + + +class Detection3D(LCMDetection3D): # type: ignore[misc] + msg_name = "vision_msgs.Detection3D" + + # for _get_field_type() to work when decoding in _decode_one() + __annotations__ = LCMDetection3D.__annotations__ + + @property + def ts(self) -> float: + return to_timestamp(self.header.stamp) diff --git a/dimos/msgs/vision_msgs/Detection3DArray.py b/dimos/msgs/vision_msgs/Detection3DArray.py index 59905cad4c..2eba82204d 100644 --- a/dimos/msgs/vision_msgs/Detection3DArray.py +++ b/dimos/msgs/vision_msgs/Detection3DArray.py @@ -11,11 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from dimos_lcm.vision_msgs.Detection3DArray import Detection3DArray as LCMDetection3DArray -from dimos_lcm.vision_msgs.Detection3DArray import ( - Detection3DArray as LCMDetection3DArray, -) +from dimos.types.timestamped import to_timestamp class Detection3DArray(LCMDetection3DArray): # type: ignore[misc] msg_name = "vision_msgs.Detection3DArray" + + # for _get_field_type() to work when decoding in _decode_one() + __annotations__ = LCMDetection3DArray.__annotations__ + + @property + def ts(self) -> float: + return to_timestamp(self.header.stamp) diff --git a/dimos/msgs/vision_msgs/__init__.py b/dimos/msgs/vision_msgs/__init__.py index af170cbfab..0f1c9c8dc1 100644 --- a/dimos/msgs/vision_msgs/__init__.py +++ b/dimos/msgs/vision_msgs/__init__.py @@ -1,6 +1,15 @@ from .BoundingBox2DArray import BoundingBox2DArray from .BoundingBox3DArray import BoundingBox3DArray +from .Detection2D import Detection2D from .Detection2DArray import Detection2DArray +from .Detection3D import Detection3D from .Detection3DArray import Detection3DArray -__all__ = ["BoundingBox2DArray", "BoundingBox3DArray", "Detection2DArray", "Detection3DArray"] +__all__ = [ + "BoundingBox2DArray", + "BoundingBox3DArray", + "Detection2D", + "Detection2DArray", + "Detection3D", + "Detection3DArray", +] diff --git a/dimos/perception/common/__init__.py b/dimos/perception/common/__init__.py index 67481bc449..16281fe0b6 100644 --- a/dimos/perception/common/__init__.py +++ b/dimos/perception/common/__init__.py @@ -1,3 +1 @@ -from .detection2d_tracker import get_tracked_results, target2dTracker -from .ibvs import * from .utils import * diff --git a/dimos/perception/common/detection2d_tracker.py b/dimos/perception/common/detection2d_tracker.py deleted file mode 100644 index 9ff36be8a1..0000000000 --- a/dimos/perception/common/detection2d_tracker.py +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright 2025-2026 
Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import deque -from collections.abc import Sequence - -import numpy as np - - -def compute_iou(bbox1, bbox2): # type: ignore[no-untyped-def] - """ - Compute Intersection over Union (IoU) of two bounding boxes. - Each bbox is [x1, y1, x2, y2]. - """ - x1 = max(bbox1[0], bbox2[0]) - y1 = max(bbox1[1], bbox2[1]) - x2 = min(bbox1[2], bbox2[2]) - y2 = min(bbox1[3], bbox2[3]) - - inter_area = max(0, x2 - x1) * max(0, y2 - y1) - area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) - area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) - - union_area = area1 + area2 - inter_area - if union_area == 0: - return 0 - return inter_area / union_area - - -def get_tracked_results(tracked_targets): # type: ignore[no-untyped-def] - """ - Extract tracked results from a list of target2d objects. - - Args: - tracked_targets (list[target2d]): List of target2d objects (published targets) - returned by the tracker's update() function. - - Returns: - tuple: (tracked_masks, tracked_bboxes, tracked_track_ids, tracked_probs, tracked_names) - where each is a list of the corresponding attribute from each target. - """ - tracked_masks = [] - tracked_bboxes = [] - tracked_track_ids = [] - tracked_probs = [] - tracked_names = [] - - for target in tracked_targets: - # Extract the latest values stored in each target. 
- tracked_masks.append(target.latest_mask) - tracked_bboxes.append(target.latest_bbox) - # Here we use the most recent detection's track ID. - tracked_track_ids.append(target.target_id) - # Use the latest probability from the history. - tracked_probs.append(target.score) - # Use the stored name (if any). If not available, you can use a default value. - tracked_names.append(target.name) - - return tracked_masks, tracked_bboxes, tracked_track_ids, tracked_probs, tracked_names - - -class target2d: - """ - Represents a tracked 2D target. - Stores the latest bounding box and mask along with a short history of track IDs, - detection probabilities, and computed texture values. - """ - - def __init__( # type: ignore[no-untyped-def] - self, - initial_mask, - initial_bbox, - track_id, - prob: float, - name: str, - texture_value, - target_id, - history_size: int = 10, - ) -> None: - """ - Args: - initial_mask (torch.Tensor): Latest segmentation mask. - initial_bbox (list): Bounding box in [x1, y1, x2, y2] format. - track_id (int): Detection’s track ID (may be -1 if not provided). - prob (float): Detection probability. - name (str): Object class name. - texture_value (float): Computed average texture value for this detection. - target_id (int): Unique identifier assigned by the tracker. - history_size (int): Maximum number of frames to keep in the history. - """ - self.target_id = target_id - self.latest_mask = initial_mask - self.latest_bbox = initial_bbox - self.name = name - self.score = 1.0 - - self.track_id = track_id - self.probs_history = deque(maxlen=history_size) # type: ignore[var-annotated] - self.texture_history = deque(maxlen=history_size) # type: ignore[var-annotated] - - self.frame_count = deque(maxlen=history_size) # type: ignore[var-annotated] # Total frames this target has been seen. - self.missed_frames = 0 # Consecutive frames when no detection was assigned. 
- self.history_size = history_size - - def update(self, mask, bbox, track_id, prob: float, name: str, texture_value) -> None: # type: ignore[no-untyped-def] - """ - Update the target with a new detection. - """ - self.latest_mask = mask - self.latest_bbox = bbox - self.name = name - - self.track_id = track_id - self.probs_history.append(prob) - self.texture_history.append(texture_value) - - self.frame_count.append(1) - self.missed_frames = 0 - - def mark_missed(self) -> None: - """ - Increment the count of consecutive frames where this target was not updated. - """ - self.missed_frames += 1 - self.frame_count.append(0) - - def compute_score( # type: ignore[no-untyped-def] - self, - frame_shape, - min_area_ratio, - max_area_ratio, - texture_range=(0.0, 1.0), - border_safe_distance: int = 50, - weights=None, - ): - """ - Compute a combined score for the target based on several factors. - - Factors: - - **Detection probability:** Average over recent frames. - - **Temporal stability:** How consistently the target has appeared. - - **Texture quality:** Normalized using the provided min and max values. - - **Border proximity:** Computed from the minimum distance from the bbox to the frame edges. - - **Size:** How the object's area (relative to the frame) compares to acceptable bounds. - - Args: - frame_shape (tuple): (height, width) of the frame. - min_area_ratio (float): Minimum acceptable ratio (bbox area / frame area). - max_area_ratio (float): Maximum acceptable ratio. - texture_range (tuple): (min_texture, max_texture) expected values. - border_safe_distance (float): Distance (in pixels) considered safe from the border. - weights (dict): Weights for each component. Expected keys: - 'prob', 'temporal', 'texture', 'border', and 'size'. - - Returns: - float: The combined (normalized) score in the range [0, 1]. - """ - # Default weights if none provided. 
- if weights is None: - weights = {"prob": 1.0, "temporal": 1.0, "texture": 1.0, "border": 1.0, "size": 1.0} - - h, w = frame_shape - x1, y1, x2, y2 = self.latest_bbox - bbox_area = (x2 - x1) * (y2 - y1) - frame_area = w * h - area_ratio = bbox_area / frame_area - - # Detection probability factor. - avg_prob = np.mean(self.probs_history) - # Temporal stability factor: normalized by history size. - temporal_stability = np.mean(self.frame_count) - # Texture factor: normalize average texture using the provided range. - avg_texture = np.mean(self.texture_history) if self.texture_history else 0.0 - min_texture, max_texture = texture_range - if max_texture == min_texture: - normalized_texture = avg_texture - else: - normalized_texture = (avg_texture - min_texture) / (max_texture - min_texture) - normalized_texture = max(0.0, min(normalized_texture, 1.0)) - - # Border factor: compute the minimum distance from the bbox to any frame edge. - left_dist = x1 - top_dist = y1 - right_dist = w - x2 - min_border_dist = min(left_dist, top_dist, right_dist) - # Normalize the border distance: full score (1.0) if at least border_safe_distance away. - border_factor = min(1.0, min_border_dist / border_safe_distance) - - # Size factor: penalize objects that are too small or too big. - if area_ratio < min_area_ratio: - size_factor = area_ratio / min_area_ratio - elif area_ratio > max_area_ratio: - # Here we compute a linear penalty if the area exceeds max_area_ratio. - if 1 - max_area_ratio > 0: - size_factor = max(0, (1 - area_ratio) / (1 - max_area_ratio)) - else: - size_factor = 0.0 - else: - size_factor = 1.0 - - # Combine factors using a weighted sum (each factor is assumed in [0, 1]). 
- w_prob = weights.get("prob", 1.0) - w_temporal = weights.get("temporal", 1.0) - w_texture = weights.get("texture", 1.0) - w_border = weights.get("border", 1.0) - w_size = weights.get("size", 1.0) - total_weight = w_prob + w_temporal + w_texture + w_border + w_size - - # print(f"track_id: {self.target_id}, avg_prob: {avg_prob:.2f}, temporal_stability: {temporal_stability:.2f}, normalized_texture: {normalized_texture:.2f}, border_factor: {border_factor:.2f}, size_factor: {size_factor:.2f}") - - final_score = ( - w_prob * avg_prob - + w_temporal * temporal_stability - + w_texture * normalized_texture - + w_border * border_factor - + w_size * size_factor - ) / total_weight - - self.score = final_score - - return final_score - - -class target2dTracker: - """ - Tracker that maintains a history of targets across frames. - New segmentation detections (frame, masks, bboxes, track_ids, probabilities, - and computed texture values) are matched to existing targets or used to create new ones. - - The tracker uses a scoring system that incorporates: - - **Detection probability** - - **Temporal stability** - - **Texture quality** (normalized within a specified range) - - **Proximity to image borders** (a continuous penalty based on the distance) - - **Object size** relative to the frame - - Targets are published if their score exceeds the start threshold and are removed if their score - falls below the stop threshold or if they are missed for too many consecutive frames. - """ - - def __init__( # type: ignore[no-untyped-def] - self, - history_size: int = 10, - score_threshold_start: float = 0.5, - score_threshold_stop: float = 0.3, - min_frame_count: int = 10, - max_missed_frames: int = 3, - min_area_ratio: float = 0.001, - max_area_ratio: float = 0.1, - texture_range=(0.0, 1.0), - border_safe_distance: int = 50, - weights=None, - ) -> None: - """ - Args: - history_size (int): Maximum history length (number of frames) per target. 
- score_threshold_start (float): Minimum score for a target to be published. - score_threshold_stop (float): If a target’s score falls below this, it is removed. - min_frame_count (int): Minimum number of frames a target must be seen to be published. - max_missed_frames (int): Maximum consecutive frames a target can be missing before deletion. - min_area_ratio (float): Minimum acceptable bbox area relative to the frame. - max_area_ratio (float): Maximum acceptable bbox area relative to the frame. - texture_range (tuple): (min_texture, max_texture) expected values. - border_safe_distance (float): Distance (in pixels) considered safe from the border. - weights (dict): Weights for the scoring components (keys: 'prob', 'temporal', - 'texture', 'border', 'size'). - """ - self.history_size = history_size - self.score_threshold_start = score_threshold_start - self.score_threshold_stop = score_threshold_stop - self.min_frame_count = min_frame_count - self.max_missed_frames = max_missed_frames - self.min_area_ratio = min_area_ratio - self.max_area_ratio = max_area_ratio - self.texture_range = texture_range - self.border_safe_distance = border_safe_distance - # Default weights if none are provided. - if weights is None: - weights = {"prob": 1.0, "temporal": 1.0, "texture": 1.0, "border": 1.0, "size": 1.0} - self.weights = weights - - self.targets = {} # type: ignore[var-annotated] # Dictionary mapping target_id -> target2d instance. - self.next_target_id = 0 - - def update( # type: ignore[no-untyped-def] - self, - frame, - masks, - bboxes, - track_ids, - probs: Sequence[float], - names: Sequence[str], - texture_values, - ): - """ - Update the tracker with new detections from the current frame. - - Args: - frame (np.ndarray): Current BGR frame. - masks (list[torch.Tensor]): List of segmentation masks. - bboxes (list): List of bounding boxes [x1, y1, x2, y2]. - track_ids (list): List of detection track IDs. - probs (list): List of detection probabilities. 
- names (list): List of class names. - texture_values (list): List of computed texture values. - - Returns: - published_targets (list[target2d]): Targets that are active and have scores above - the start threshold. - """ - updated_target_ids = set() - frame_shape = frame.shape[:2] # (height, width) - - # For each detection, try to match with an existing target. - for mask, bbox, det_tid, prob, name, texture in zip( - masks, bboxes, track_ids, probs, names, texture_values, strict=False - ): - matched_target = None - - # First, try matching by detection track ID if valid. - if det_tid != -1: - for target in self.targets.values(): - if target.track_id == det_tid: - matched_target = target - break - - # Otherwise, try matching using IoU. - if matched_target is None: - best_iou = 0 - for target in self.targets.values(): - iou = compute_iou(bbox, target.latest_bbox) # type: ignore[no-untyped-call] - if iou > 0.5 and iou > best_iou: - best_iou = iou - matched_target = target - - # Update existing target or create a new one. - if matched_target is not None: - matched_target.update(mask, bbox, det_tid, prob, name, texture) - updated_target_ids.add(matched_target.target_id) - else: - new_target = target2d( - mask, bbox, det_tid, prob, name, texture, self.next_target_id, self.history_size - ) - self.targets[self.next_target_id] = new_target - updated_target_ids.add(self.next_target_id) - self.next_target_id += 1 - - # Mark targets that were not updated. - for target_id, target in list(self.targets.items()): - if target_id not in updated_target_ids: - target.mark_missed() - if target.missed_frames > self.max_missed_frames: - del self.targets[target_id] - continue # Skip further checks for this target. - # Remove targets whose score falls below the stop threshold. 
- score = target.compute_score( - frame_shape, - self.min_area_ratio, - self.max_area_ratio, - texture_range=self.texture_range, - border_safe_distance=self.border_safe_distance, - weights=self.weights, - ) - if score < self.score_threshold_stop: - del self.targets[target_id] - - # Publish targets with scores above the start threshold. - published_targets = [] - for target in self.targets.values(): - score = target.compute_score( - frame_shape, - self.min_area_ratio, - self.max_area_ratio, - texture_range=self.texture_range, - border_safe_distance=self.border_safe_distance, - weights=self.weights, - ) - if ( - score >= self.score_threshold_start - and sum(target.frame_count) >= self.min_frame_count - and target.missed_frames <= 5 - ): - published_targets.append(target) - - return published_targets diff --git a/dimos/perception/common/export_tensorrt.py b/dimos/perception/common/export_tensorrt.py deleted file mode 100644 index ca671e36f2..0000000000 --- a/dimos/perception/common/export_tensorrt.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -from ultralytics import YOLO, FastSAM # type: ignore[attr-defined, import-not-found] - - -def parse_args(): # type: ignore[no-untyped-def] - parser = argparse.ArgumentParser(description="Export YOLO/FastSAM models to different formats") - parser.add_argument("--model_path", type=str, required=True, help="Path to the model weights") - parser.add_argument( - "--model_type", - type=str, - choices=["yolo", "fastsam"], - required=True, - help="Type of model to export", - ) - parser.add_argument( - "--precision", - type=str, - choices=["fp32", "fp16", "int8"], - default="fp32", - help="Precision for export", - ) - parser.add_argument( - "--format", type=str, choices=["onnx", "engine"], default="onnx", help="Export format" - ) - return parser.parse_args() - - -def main() -> None: - args = parse_args() # type: ignore[no-untyped-call] - half = args.precision == "fp16" - int8 = args.precision == "int8" - # Load the appropriate model - if args.model_type == "yolo": - model: YOLO | FastSAM = YOLO(args.model_path) - else: - model = FastSAM(args.model_path) - - # Export the model - model.export(format=args.format, half=half, int8=int8) - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/common/ibvs.py b/dimos/perception/common/ibvs.py deleted file mode 100644 index e24819f432..0000000000 --- a/dimos/perception/common/ibvs.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np - - -class PersonDistanceEstimator: - def __init__(self, K, camera_pitch, camera_height) -> None: # type: ignore[no-untyped-def] - """ - Initialize the distance estimator using ground plane constraint. - - Args: - K: 3x3 Camera intrinsic matrix in OpenCV format - (Assumed to be already for an undistorted image) - camera_pitch: Upward pitch of the camera (in radians), in the robot frame - Positive means looking up, negative means looking down - camera_height: Height of the camera above the ground (in meters) - """ - self.K = K - self.camera_height = camera_height - - # Precompute the inverse intrinsic matrix - self.K_inv = np.linalg.inv(K) - - # Transform from camera to robot frame (z-forward to x-forward) - self.T = np.array([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - - # Pitch rotation matrix (positive is upward) - theta = -camera_pitch # Negative since positive pitch is negative rotation about robot Y - self.R_pitch = np.array( - [[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]] - ) - - # Combined transform from camera to robot frame - self.A = self.R_pitch @ self.T - - # Store focal length and principal point for angle calculation - self.fx = K[0, 0] - self.cx = K[0, 2] - - def estimate_distance_angle(self, bbox: tuple, robot_pitch: float | None = None): # type: ignore[no-untyped-def, type-arg] - """ - Estimate distance and angle to person using ground plane constraint. 
- - Args: - bbox: tuple (x_min, y_min, x_max, y_max) - where y_max represents the feet position - robot_pitch: Current pitch of the robot body (in radians) - If provided, this will be combined with the camera's fixed pitch - - Returns: - depth: distance to person along camera's z-axis (meters) - angle: horizontal angle in camera frame (radians, positive right) - """ - x_min, _, x_max, y_max = bbox - - # Get center point of feet - u_c = (x_min + x_max) / 2.0 - v_feet = y_max - - # Create homogeneous feet point and get ray direction - p_feet = np.array([u_c, v_feet, 1.0]) - d_feet_cam = self.K_inv @ p_feet - - # If robot_pitch is provided, recalculate the transformation matrix - if robot_pitch is not None: - # Combined pitch (fixed camera pitch + current robot pitch) - total_pitch = -camera_pitch - robot_pitch # Both negated for correct rotation direction - R_total_pitch = np.array( - [ - [np.cos(total_pitch), 0, np.sin(total_pitch)], - [0, 1, 0], - [-np.sin(total_pitch), 0, np.cos(total_pitch)], - ] - ) - # Use the updated transformation matrix - A = R_total_pitch @ self.T - else: - # Use the precomputed transformation matrix - A = self.A - - # Convert ray to robot frame using appropriate transformation - d_feet_robot = A @ d_feet_cam - - # Ground plane intersection (z=0) - # camera_height + t * d_feet_robot[2] = 0 - if abs(d_feet_robot[2]) < 1e-6: - raise ValueError("Feet ray is parallel to ground plane") - - # Solve for scaling factor t - t = -self.camera_height / d_feet_robot[2] - - # Get 3D feet position in robot frame - p_feet_robot = t * d_feet_robot - - # Convert back to camera frame - p_feet_cam = self.A.T @ p_feet_robot - - # Extract depth (z-coordinate in camera frame) - depth = p_feet_cam[2] - - # Calculate horizontal angle from image center - angle = np.arctan((u_c - self.cx) / self.fx) - - return depth, angle - - -class ObjectDistanceEstimator: - """ - Estimate distance to an object using the ground plane constraint. 
- This class assumes the camera is mounted on a robot and uses the - camera's intrinsic parameters to estimate the distance to a detected object. - """ - - def __init__(self, K, camera_pitch, camera_height) -> None: # type: ignore[no-untyped-def] - """ - Initialize the distance estimator using ground plane constraint. - - Args: - K: 3x3 Camera intrinsic matrix in OpenCV format - (Assumed to be already for an undistorted image) - camera_pitch: Upward pitch of the camera (in radians) - Positive means looking up, negative means looking down - camera_height: Height of the camera above the ground (in meters) - """ - self.K = K - self.camera_height = camera_height - - # Precompute the inverse intrinsic matrix - self.K_inv = np.linalg.inv(K) - - # Transform from camera to robot frame (z-forward to x-forward) - self.T = np.array([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - - # Pitch rotation matrix (positive is upward) - theta = -camera_pitch # Negative since positive pitch is negative rotation about robot Y - self.R_pitch = np.array( - [[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]] - ) - - # Combined transform from camera to robot frame - self.A = self.R_pitch @ self.T - - # Store focal length and principal point for angle calculation - self.fx = K[0, 0] - self.fy = K[1, 1] - self.cx = K[0, 2] - self.estimated_object_size = None - - def estimate_object_size(self, bbox: tuple, distance: float): # type: ignore[no-untyped-def, type-arg] - """ - Estimate the physical size of an object based on its bbox and known distance. 
- - Args: - bbox: tuple (x_min, y_min, x_max, y_max) bounding box in the image - distance: Known distance to the object (in meters) - robot_pitch: Current pitch of the robot body (in radians), if any - - Returns: - estimated_size: Estimated physical height of the object (in meters) - """ - _x_min, y_min, _x_max, y_max = bbox - - # Calculate object height in pixels - object_height_px = y_max - y_min - - # Calculate the physical height using the known distance and focal length - estimated_size = object_height_px * distance / self.fy - self.estimated_object_size = estimated_size - - return estimated_size - - def set_estimated_object_size(self, size: float) -> None: - """ - Set the estimated object size for future distance calculations. - - Args: - size: Estimated physical size of the object (in meters) - """ - self.estimated_object_size = size # type: ignore[assignment] - - def estimate_distance_angle(self, bbox: tuple): # type: ignore[no-untyped-def, type-arg] - """ - Estimate distance and angle to object using size-based estimation. 
- - Args: - bbox: tuple (x_min, y_min, x_max, y_max) - where y_max represents the bottom of the object - robot_pitch: Current pitch of the robot body (in radians) - If provided, this will be combined with the camera's fixed pitch - initial_distance: Initial distance estimate for the object (in meters) - Used to calibrate object size if not previously known - - Returns: - depth: distance to object along camera's z-axis (meters) - angle: horizontal angle in camera frame (radians, positive right) - or None, None if estimation not possible - """ - # If we don't have estimated object size and no initial distance is provided, - # we can't estimate the distance - if self.estimated_object_size is None: - return None, None - - x_min, y_min, x_max, y_max = bbox - - # Calculate center of the object for angle calculation - u_c = (x_min + x_max) / 2.0 - - # If we have an initial distance estimate and no object size yet, - # calculate and store the object size using the initial distance - object_height_px = y_max - y_min - depth = self.estimated_object_size * self.fy / object_height_px - - # Calculate horizontal angle from image center - angle = np.arctan((u_c - self.cx) / self.fx) - - return depth, angle - - -# Example usage: -if __name__ == "__main__": - # Example camera calibration - K = np.array([[600, 0, 320], [0, 600, 240], [0, 0, 1]], dtype=np.float32) - - # Camera mounted 1.2m high, pitched down 10 degrees - camera_pitch = np.deg2rad(0) # negative for downward pitch - camera_height = 1.0 # meters - - estimator = PersonDistanceEstimator(K, camera_pitch, camera_height) - object_estimator = ObjectDistanceEstimator(K, camera_pitch, camera_height) - - # Example detection - bbox = (300, 100, 380, 400) # x1, y1, x2, y2 - - depth, angle = estimator.estimate_distance_angle(bbox) - # Estimate object size based on the known distance - object_size = object_estimator.estimate_object_size(bbox, depth) - depth_obj, angle_obj = object_estimator.estimate_distance_angle(bbox) - - 
print(f"Estimated person depth: {depth:.2f} m") - print(f"Estimated person angle: {np.rad2deg(angle):.1f}°") - print(f"Estimated object depth: {depth_obj:.2f} m") - print(f"Estimated object angle: {np.rad2deg(angle_obj):.1f}°") - - # Shrink the bbox by 30 pixels while keeping the same center - x_min, y_min, x_max, y_max = bbox - width = x_max - x_min - height = y_max - y_min - center_x = (x_min + x_max) // 2 - center_y = (y_min + y_max) // 2 - - new_width = max(width - 20, 2) # Ensure width is at least 2 pixels - new_height = max(height - 20, 2) # Ensure height is at least 2 pixels - - x_min = center_x - new_width // 2 - x_max = center_x + new_width // 2 - y_min = center_y - new_height // 2 - y_max = center_y + new_height // 2 - - bbox = (x_min, y_min, x_max, y_max) - - # Re-estimate distance and angle with the new bbox - depth, angle = estimator.estimate_distance_angle(bbox) - depth_obj, angle_obj = object_estimator.estimate_distance_angle(bbox) - - print(f"New estimated person depth: {depth:.2f} m") - print(f"New estimated person angle: {np.rad2deg(angle):.1f}°") - print(f"New estimated object depth: {depth_obj:.2f} m") - print(f"New estimated object angle: {np.rad2deg(angle_obj):.1f}°") diff --git a/dimos/perception/demo_object_scene_registration.py b/dimos/perception/demo_object_scene_registration.py new file mode 100644 index 0000000000..fe9c3100bb --- /dev/null +++ b/dimos/perception/demo_object_scene_registration.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from dimos.agents.agent import llm_agent +from dimos.agents.cli.human import human_input +from dimos.agents.spec import Provider +from dimos.core.blueprints import autoconnect +from dimos.hardware.sensors.camera.realsense import realsense_camera +from dimos.hardware.sensors.camera.zed import zed_camera +from dimos.perception.detection.detectors.yoloe import YoloePromptMode +from dimos.perception.object_scene_registration import object_scene_registration_module +from dimos.robot.foxglove_bridge import foxglove_bridge + +camera_choice = "zed" + +if camera_choice == "realsense": + camera_module = realsense_camera(enable_pointcloud=False) +elif camera_choice == "zed": + camera_module = zed_camera(enable_pointcloud=False) +else: + raise ValueError(f"Invalid camera choice: {camera_choice}") + +demo_object_scene_registration = autoconnect( + camera_module, + object_scene_registration_module(target_frame="world", prompt_mode=YoloePromptMode.LRPC), + foxglove_bridge(), + human_input(), + llm_agent(), +).global_config(viewer_backend="foxglove") diff --git a/dimos/perception/detection/detectors/conftest.py b/dimos/perception/detection/detectors/conftest.py index 9cb600aeff..6a2c041a8b 100644 --- a/dimos/perception/detection/detectors/conftest.py +++ b/dimos/perception/detection/detectors/conftest.py @@ -17,6 +17,7 @@ from dimos.msgs.sensor_msgs import Image from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector from dimos.perception.detection.detectors.yolo import Yolo2DDetector +from dimos.perception.detection.detectors.yoloe import Yoloe2DDetector, YoloePromptMode from dimos.utils.data import get_data @@ -36,3 +37,9 @@ def person_detector(): def bbox_detector(): """Create a Yolo2DDetector instance for general object detection.""" return Yolo2DDetector() + + +@pytest.fixture(scope="session") +def yoloe_detector(): + """Create a Yoloe2DDetector 
instance for general object detection.""" + return Yoloe2DDetector(prompt_mode=YoloePromptMode.LRPC) diff --git a/dimos/perception/detection/detectors/detic.py b/dimos/perception/detection/detectors/detic.py deleted file mode 100644 index 288a3e056d..0000000000 --- a/dimos/perception/detection/detectors/detic.py +++ /dev/null @@ -1,426 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Sequence -import os -import sys - -import numpy as np - -# Add Detic to Python path -from dimos.constants import DIMOS_PROJECT_ROOT -from dimos.msgs.sensor_msgs import Image -from dimos.perception.detection.detectors.types import Detector -from dimos.perception.detection2d.utils import plot_results - -detic_path = DIMOS_PROJECT_ROOT / "dimos/models/Detic" -if str(detic_path) not in sys.path: - sys.path.append(str(detic_path)) - sys.path.append(str(detic_path / "third_party/CenterNet2")) - -# PIL patch for compatibility -import PIL.Image - -if not hasattr(PIL.Image, "LINEAR") and hasattr(PIL.Image, "BILINEAR"): - PIL.Image.LINEAR = PIL.Image.BILINEAR # type: ignore[attr-defined] - -# Detectron2 imports -from detectron2.config import get_cfg # type: ignore[import-not-found] -from detectron2.data import MetadataCatalog # type: ignore[import-not-found] - - -# Simple tracking implementation -class SimpleTracker: - """Simple IOU-based tracker implementation without external dependencies""" - - def __init__(self, 
iou_threshold: float = 0.3, max_age: int = 5) -> None: - self.iou_threshold = iou_threshold - self.max_age = max_age - self.next_id = 1 - self.tracks = {} # type: ignore[var-annotated] # id -> {bbox, class_id, age, mask, etc} - - def _calculate_iou(self, bbox1, bbox2): # type: ignore[no-untyped-def] - """Calculate IoU between two bboxes in format [x1,y1,x2,y2]""" - x1 = max(bbox1[0], bbox2[0]) - y1 = max(bbox1[1], bbox2[1]) - x2 = min(bbox1[2], bbox2[2]) - y2 = min(bbox1[3], bbox2[3]) - - if x2 < x1 or y2 < y1: - return 0.0 - - intersection = (x2 - x1) * (y2 - y1) - area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) - area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) - union = area1 + area2 - intersection - - return intersection / union if union > 0 else 0 - - def update(self, detections, masks): # type: ignore[no-untyped-def] - """Update tracker with new detections - - Args: - detections: List of [x1,y1,x2,y2,score,class_id] - masks: List of segmentation masks corresponding to detections - - Returns: - List of [track_id, bbox, score, class_id, mask] - """ - if len(detections) == 0: - # Age existing tracks - for track_id in list(self.tracks.keys()): - self.tracks[track_id]["age"] += 1 - # Remove old tracks - if self.tracks[track_id]["age"] > self.max_age: - del self.tracks[track_id] - return [] - - # Convert to numpy for easier handling - if not isinstance(detections, np.ndarray): - detections = np.array(detections) - - result = [] - matched_indices = set() - - # Update existing tracks - for track_id, track in list(self.tracks.items()): - track["age"] += 1 - - if track["age"] > self.max_age: - del self.tracks[track_id] - continue - - # Find best matching detection for this track - best_iou = self.iou_threshold - best_idx = -1 - - for i, det in enumerate(detections): - if i in matched_indices: - continue - - # Check class match - if det[5] != track["class_id"]: - continue - - iou = self._calculate_iou(track["bbox"], det[:4]) # type: ignore[no-untyped-call] - 
if iou > best_iou: - best_iou = iou - best_idx = i - - # If we found a match, update the track - if best_idx >= 0: - self.tracks[track_id]["bbox"] = detections[best_idx][:4] - self.tracks[track_id]["score"] = detections[best_idx][4] - self.tracks[track_id]["age"] = 0 - self.tracks[track_id]["mask"] = masks[best_idx] - matched_indices.add(best_idx) - - # Add to results with mask - result.append( - [ - track_id, - detections[best_idx][:4], - detections[best_idx][4], - int(detections[best_idx][5]), - self.tracks[track_id]["mask"], - ] - ) - - # Create new tracks for unmatched detections - for i, det in enumerate(detections): - if i in matched_indices: - continue - - # Create new track - new_id = self.next_id - self.next_id += 1 - - self.tracks[new_id] = { - "bbox": det[:4], - "score": det[4], - "class_id": int(det[5]), - "age": 0, - "mask": masks[i], - } - - # Add to results with mask directly from the track - result.append([new_id, det[:4], det[4], int(det[5]), masks[i]]) - - return result - - -class Detic2DDetector(Detector): - def __init__( # type: ignore[no-untyped-def] - self, model_path=None, device: str = "cuda", vocabulary=None, threshold: float = 0.5 - ) -> None: - """ - Initialize the Detic detector with open vocabulary support. 
- - Args: - model_path (str): Path to a custom Detic model weights (optional) - device (str): Device to run inference on ('cuda' or 'cpu') - vocabulary (list): Custom vocabulary (list of class names) or 'lvis', 'objects365', 'openimages', 'coco' - threshold (float): Detection confidence threshold - """ - self.device = device - self.threshold = threshold - - # Set up Detic paths - already added to sys.path at module level - - # Import Detic modules - from centernet.config import add_centernet_config # type: ignore[import-not-found] - from detic.config import add_detic_config # type: ignore[import-not-found] - from detic.modeling.text.text_encoder import ( # type: ignore[import-not-found] - build_text_encoder, - ) - from detic.modeling.utils import reset_cls_test # type: ignore[import-not-found] - - # Keep reference to these functions for later use - self.reset_cls_test = reset_cls_test - self.build_text_encoder = build_text_encoder - - # Setup model configuration - self.cfg = get_cfg() - add_centernet_config(self.cfg) - add_detic_config(self.cfg) - - # Use default Detic config - self.cfg.merge_from_file( - os.path.join( - detic_path, "configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml" - ) - ) - - # Set default weights if not provided - if model_path is None: - self.cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" - else: - self.cfg.MODEL.WEIGHTS = model_path - - # Set device - if device == "cpu": - self.cfg.MODEL.DEVICE = "cpu" - - # Set detection threshold - self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold - self.cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = "rand" - self.cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True - - # Built-in datasets for Detic - use absolute paths with detic_path - self.builtin_datasets = { - "lvis": { - "metadata": "lvis_v1_val", - "classifier": os.path.join( - detic_path, "datasets/metadata/lvis_v1_clip_a+cname.npy" - ), - }, - "objects365": { - 
"metadata": "objects365_v2_val", - "classifier": os.path.join( - detic_path, "datasets/metadata/o365_clip_a+cnamefix.npy" - ), - }, - "openimages": { - "metadata": "oid_val_expanded", - "classifier": os.path.join(detic_path, "datasets/metadata/oid_clip_a+cname.npy"), - }, - "coco": { - "metadata": "coco_2017_val", - "classifier": os.path.join(detic_path, "datasets/metadata/coco_clip_a+cname.npy"), - }, - } - - # Override config paths to use absolute paths - self.cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = os.path.join( - detic_path, "datasets/metadata/lvis_v1_train_cat_info.json" - ) - - # Initialize model - self.predictor = None - - # Setup with initial vocabulary - vocabulary = vocabulary or "lvis" - self.setup_vocabulary(vocabulary) # type: ignore[no-untyped-call] - - # Initialize our simple tracker - self.tracker = SimpleTracker(iou_threshold=0.5, max_age=5) - - def setup_vocabulary(self, vocabulary): # type: ignore[no-untyped-def] - """ - Setup the model's vocabulary. - - Args: - vocabulary: Either a string ('lvis', 'objects365', 'openimages', 'coco') - or a list of class names for custom vocabulary. 
- """ - if self.predictor is None: - # Initialize the model - from detectron2.engine import DefaultPredictor # type: ignore[import-not-found] - - self.predictor = DefaultPredictor(self.cfg) - - if isinstance(vocabulary, str) and vocabulary in self.builtin_datasets: - # Use built-in dataset - dataset = vocabulary - metadata = MetadataCatalog.get(self.builtin_datasets[dataset]["metadata"]) - classifier = self.builtin_datasets[dataset]["classifier"] - num_classes = len(metadata.thing_classes) - self.class_names = metadata.thing_classes - else: - # Use custom vocabulary - if isinstance(vocabulary, str): - # If it's a string but not a built-in dataset, treat as a file - try: - with open(vocabulary) as f: - class_names = [line.strip() for line in f if line.strip()] - except: - # Default to LVIS if there's an issue - print(f"Error loading vocabulary from {vocabulary}, using LVIS") - return self.setup_vocabulary("lvis") # type: ignore[no-untyped-call] - else: - # Assume it's a list of class names - class_names = vocabulary - - # Create classifier from text embeddings - metadata = MetadataCatalog.get("__unused") - metadata.thing_classes = class_names - self.class_names = class_names - - # Generate CLIP embeddings for custom vocabulary - classifier = self._get_clip_embeddings(class_names) - num_classes = len(class_names) - - # Reset model with new vocabulary - self.reset_cls_test(self.predictor.model, classifier, num_classes) # type: ignore[attr-defined] - return self.class_names - - def _get_clip_embeddings(self, vocabulary, prompt: str = "a "): # type: ignore[no-untyped-def] - """ - Generate CLIP embeddings for a vocabulary list. 
- - Args: - vocabulary (list): List of class names - prompt (str): Prompt prefix to use for CLIP - - Returns: - torch.Tensor: Tensor of embeddings - """ - text_encoder = self.build_text_encoder(pretrain=True) - text_encoder.eval() - texts = [prompt + x for x in vocabulary] - emb = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() - return emb - - def process_image(self, image: Image): # type: ignore[no-untyped-def] - """ - Process an image and return detection results. - - Args: - image: Input image in BGR format (OpenCV) - - Returns: - tuple: (bboxes, track_ids, class_ids, confidences, names, masks) - - bboxes: list of [x1, y1, x2, y2] coordinates - - track_ids: list of tracking IDs (or -1 if no tracking) - - class_ids: list of class indices - - confidences: list of detection confidences - - names: list of class names - - masks: list of segmentation masks (numpy arrays) - """ - # Run inference with Detic - outputs = self.predictor(image.to_opencv()) # type: ignore[misc] - instances = outputs["instances"].to("cpu") - - # Extract bounding boxes, classes, scores, and masks - if len(instances) == 0: - return [], [], [], [], [] # , [] - - boxes = instances.pred_boxes.tensor.numpy() - class_ids = instances.pred_classes.numpy() - scores = instances.scores.numpy() - masks = instances.pred_masks.numpy() - - # Convert boxes to [x1, y1, x2, y2] format - bboxes = [] - for box in boxes: - x1, y1, x2, y2 = box.tolist() - bboxes.append([x1, y1, x2, y2]) - - # Get class names - [self.class_names[class_id] for class_id in class_ids] - - # Apply tracking - detections = [] - filtered_masks = [] - for i, bbox in enumerate(bboxes): - if scores[i] >= self.threshold: - # Format for tracker: [x1, y1, x2, y2, score, class_id] - detections.append([*bbox, scores[i], class_ids[i]]) - filtered_masks.append(masks[i]) - - if not detections: - return [], [], [], [], [] # , [] - - # Update tracker with detections and correctly aligned masks - track_results = 
self.tracker.update(detections, filtered_masks) # type: ignore[no-untyped-call] - - # Process tracking results - track_ids = [] - tracked_bboxes = [] - tracked_class_ids = [] - tracked_scores = [] - tracked_names = [] - tracked_masks = [] - - for track_id, bbox, score, class_id, mask in track_results: - track_ids.append(int(track_id)) - tracked_bboxes.append(bbox.tolist() if isinstance(bbox, np.ndarray) else bbox) - tracked_class_ids.append(int(class_id)) - tracked_scores.append(score) - tracked_names.append(self.class_names[int(class_id)]) - tracked_masks.append(mask) - - return ( - tracked_bboxes, - track_ids, - tracked_class_ids, - tracked_scores, - tracked_names, - # tracked_masks, - ) - - def visualize_results( # type: ignore[no-untyped-def] - self, image, bboxes, track_ids, class_ids, confidences, names: Sequence[str] - ): - """ - Generate visualization of detection results. - - Args: - image: Original input image - bboxes: List of bounding boxes - track_ids: List of tracking IDs - class_ids: List of class indices - confidences: List of detection confidences - names: List of class names - - Returns: - Image with visualized detections - """ - - return plot_results(image, bboxes, track_ids, class_ids, confidences, names) - - def cleanup(self) -> None: - """Clean up resources.""" - # Nothing specific to clean up for Detic - pass diff --git a/dimos/perception/detection/detectors/test_bbox_detectors.py b/dimos/perception/detection/detectors/test_bbox_detectors.py index bd9c1358b5..4047e35e7b 100644 --- a/dimos/perception/detection/detectors/test_bbox_detectors.py +++ b/dimos/perception/detection/detectors/test_bbox_detectors.py @@ -17,7 +17,7 @@ from dimos.perception.detection.type import Detection2D, ImageDetections2D -@pytest.fixture(params=["bbox_detector", "person_detector"], scope="session") +@pytest.fixture(params=["bbox_detector", "person_detector", "yoloe_detector"], scope="session") def detector(request): """Parametrized fixture that provides both bbox 
class YoloePromptMode(Enum):
    """YOLO-E prompt modes."""

    # Prompt-free detection using the checkpoint's built-in vocabulary head.
    LRPC = "lrpc"
    # Text / visual prompting: caller supplies classes via set_prompts().
    PROMPT = "prompt"


class Yoloe2DDetector(Detector):
    """YOLO-E based 2D detector with optional text/visual prompting and tracking."""

    def __init__(
        self,
        model_path: str = "models_yoloe",
        model_name: str | None = None,
        device: str | None = None,
        prompt_mode: YoloePromptMode = YoloePromptMode.LRPC,
        exclude_class_ids: list[int] | None = None,
        max_area_ratio: float | None = 0.3,
    ) -> None:
        """
        Initialize YOLO-E 2D detector.

        Args:
            model_path: Path to model directory (fetched via get_data from LFS).
            model_name: Model filename. Defaults based on prompt_mode.
            device: Device to run inference on ('cuda', 'cpu', or None for auto).
            prompt_mode: LRPC for prompt-free detection, PROMPT for text/visual prompting.
            exclude_class_ids: Class IDs to filter out from results (pass [] to disable).
            max_area_ratio: Maximum bbox area ratio (0-1) relative to image.

        Raises:
            ValueError: If max_area_ratio is outside (0, 1].
        """
        # Fail fast on bad arguments BEFORE the expensive model load and
        # prompt-embedding call (previously this was validated afterwards,
        # so a bad config wasted a full model load).
        if max_area_ratio is not None and not (0.0 < max_area_ratio <= 1.0):
            raise ValueError("max_area_ratio must be in the range (0, 1].")

        if model_name is None:
            # LRPC checkpoints ship a prompt-free ("pf") head.
            if prompt_mode == YoloePromptMode.LRPC:
                model_name = "yoloe-11s-seg-pf.pt"
            else:
                model_name = "yoloe-11s-seg.pt"

        if device:
            self.device = device
        elif is_cuda_available():  # type: ignore[no-untyped-call]
            self.device = "cuda"
        else:
            self.device = "cpu"

        self.prompt_mode = prompt_mode
        self.max_area_ratio = max_area_ratio
        self.exclude_class_ids = set(exclude_class_ids) if exclude_class_ids else set()
        self._visual_prompts: dict[str, NDArray[Any]] | None = None
        self._lock = threading.Lock()

        self.model = YOLOE(get_data(model_path) / model_name)
        if prompt_mode == YoloePromptMode.PROMPT:
            # Seed the classifier head with a placeholder class so the model is
            # runnable before the caller provides real prompts.
            self.set_prompts(text=["nothing"])

    def set_prompts(
        self,
        text: list[str] | None = None,
        bboxes: NDArray[np.float64] | None = None,
    ) -> None:
        """
        Set prompts for detection. Provide either text or bboxes, not both.

        Args:
            text: List of class names to detect.
            bboxes: Bounding boxes in xyxy format, shape (N, 4).

        Raises:
            ValueError: If both or neither of text/bboxes are given.
        """
        if text is not None and bboxes is not None:
            raise ValueError("Provide either text or bboxes, not both.")
        if text is None and bboxes is None:
            raise ValueError("Must provide either text or bboxes.")

        with self._lock:
            # Drop the cached predictor so the next track() call rebuilds it
            # with the new classes/prompts.
            self.model.predictor = None
            if text is not None:
                self.model.set_classes(text, self.model.get_text_pe(text))  # type: ignore[no-untyped-call]
                self._visual_prompts = None
            else:
                # One pseudo-class per prompt box.
                cls = np.arange(len(bboxes), dtype=np.int16)  # type: ignore[arg-type]
                self._visual_prompts = {"bboxes": bboxes, "cls": cls}  # type: ignore[dict-item]

    def process_image(self, image: Image) -> "ImageDetections2D[Any]":
        """
        Process an image and return detection results.

        Args:
            image: Input image

        Returns:
            ImageDetections2D containing all detected objects
        """
        track_kwargs = {
            "source": image.to_opencv(),
            "device": self.device,
            "conf": 0.6,
            "iou": 0.6,
            "persist": True,  # keep tracker state across frames
            "verbose": False,
        }

        with self._lock:
            if self._visual_prompts is not None:
                track_kwargs["visual_prompts"] = self._visual_prompts

            results = self.model.track(**track_kwargs)  # type: ignore[arg-type]

        detections = ImageDetections2D.from_ultralytics_result(image, results)
        return self._apply_filters(image, detections)

    def _apply_filters(
        self,
        image: Image,
        detections: "ImageDetections2D[Any]",
    ) -> "ImageDetections2D[Any]":
        """Drop detections with excluded class IDs or implausibly large boxes."""
        if not self.exclude_class_ids and self.max_area_ratio is None:
            return detections

        predicates = []

        if self.exclude_class_ids:
            predicates.append(lambda det: det.class_id not in self.exclude_class_ids)

        if self.max_area_ratio is not None:
            image_area = image.width * image.height

            def area_filter(det):  # type: ignore[no-untyped-def]
                # Degenerate image size: keep everything rather than divide by zero.
                if image_area <= 0:
                    return True
                return (det.bbox_2d_volume() / image_area) <= self.max_area_ratio

            predicates.append(area_filter)

        filtered = detections.detections
        for predicate in predicates:
            filtered = [det for det in filtered if predicate(det)]  # type: ignore[no-untyped-call]

        return ImageDetections2D(image, filtered)

    def stop(self) -> None:
        """Clean up resources used by the detector."""
        # Shut down the GMC thread pool inside ultralytics' trackers; leaving
        # the executor alive can keep the process from exiting cleanly.
        if hasattr(self.model, "predictor") and self.model.predictor is not None:
            predictor = self.model.predictor
            if hasattr(predictor, "trackers") and predictor.trackers:
                for tracker in predictor.trackers:
                    if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"):
                        gmc = tracker.tracker.gmc
                        if hasattr(gmc, "executor") and gmc.executor is not None:
                            gmc.executor.shutdown(wait=True)
            self.model.predictor = None
class ObjectDB:
    """Spatial memory database for 3D object detections.

    Maintains two tiers of objects internally:
    - _pending_objects: Recently detected objects (detection_count < threshold)
    - _objects: Confirmed permanent objects (detection_count >= threshold)

    Deduplication uses two heuristics:
    1. track_id match from YOLOE tracker (recent match)
    2. Center distance within threshold (spatial proximity match)
    """

    def __init__(
        self,
        distance_threshold: float = 0.1,
        min_detections_for_permanent: int = 6,
        pending_ttl_s: float = 5.0,
        track_id_ttl_s: float = 2.0,
    ) -> None:
        """
        Args:
            distance_threshold: Max center distance for two detections to merge.
            min_detections_for_permanent: Detections required before promotion.
            pending_ttl_s: Seconds before an unseen pending object is pruned (<=0 disables).
            track_id_ttl_s: Seconds a tracker ID remains valid for matching.
        """
        self._distance_threshold = distance_threshold
        self._min_detections = min_detections_for_permanent
        self._pending_ttl_s = pending_ttl_s
        self._track_id_ttl_s = track_id_ttl_s

        # Internal storage - keyed by object_id
        self._pending_objects: dict[str, Object] = {}
        self._objects: dict[str, Object] = {}  # Permanent objects

        # track_id -> object_id mapping for fast lookup
        self._track_id_map: dict[int, str] = {}
        self._last_add_stats: dict[str, int] = {}

        # RLock: public methods may call other locked helpers re-entrantly.
        self._lock = threading.RLock()

    # ─────────────────────────────────────────────────────────────────
    # Public Methods
    # ─────────────────────────────────────────────────────────────────

    def add_objects(self, objects: list[Object]) -> list[Object]:
        """Add multiple objects to the database with deduplication.

        Args:
            objects: List of Object instances from object_scene_registration

        Returns:
            List of updated/created Object instances
        """
        stats = {
            "input": len(objects),
            "created": 0,
            "updated": 0,
            "promoted": 0,
            "matched_track": 0,
            "matched_distance": 0,
        }

        results: list[Object] = []
        now = time.time()
        with self._lock:
            self._prune_stale_pending(now)
            for obj in objects:
                matched, reason = self._match(obj, now)
                if matched is None:
                    results.append(self._insert_pending(obj, now))
                    stats["created"] += 1
                    continue

                self._update_existing(matched, obj, now)
                results.append(matched)
                stats["updated"] += 1
                if reason == "track":
                    stats["matched_track"] += 1
                elif reason == "distance":
                    stats["matched_distance"] += 1
                if self._check_promotion(matched):
                    stats["promoted"] += 1

            stats["pending"] = len(self._pending_objects)
            stats["permanent"] = len(self._objects)
            self._last_add_stats = stats
        return results

    def get_last_add_stats(self) -> dict[str, int]:
        """Return a copy of the stats recorded by the most recent add_objects call."""
        with self._lock:
            return dict(self._last_add_stats)

    def get_objects(self) -> list[Object]:
        """Get all permanent objects (detection_count >= threshold)."""
        with self._lock:
            return list(self._objects.values())

    def get_all_objects(self) -> list[Object]:
        """Get all objects (both pending and permanent)."""
        with self._lock:
            return list(self._pending_objects.values()) + list(self._objects.values())

    def promote(self, object_id: str) -> bool:
        """Promote an object from pending to permanent.

        Returns True if the object is (now) permanent, False if unknown.
        """
        with self._lock:
            if object_id in self._pending_objects:
                self._objects[object_id] = self._pending_objects.pop(object_id)
                return True
            return object_id in self._objects

    def find_by_name(self, name: str) -> list[Object]:
        """Find all permanent objects with matching name."""
        with self._lock:
            return [obj for obj in self._objects.values() if obj.name == name]

    def find_nearest(
        self,
        position: Vector3,
        name: str | None = None,
    ) -> Object | None:
        """Find nearest permanent object to a position, optionally filtered by name.

        Args:
            position: Position to search from
            name: Optional name filter

        Returns:
            Nearest Object or None if no objects found
        """
        with self._lock:
            candidates = [
                obj
                for obj in self._objects.values()
                if obj.center is not None and (name is None or obj.name == name)
            ]

        if not candidates:
            return None

        return min(candidates, key=lambda obj: position.distance(obj.center))  # type: ignore[arg-type]

    def clear(self) -> None:
        """Clear all objects from the database."""
        with self._lock:
            # Drop Open3D pointcloud references before clearing to reduce shutdown warnings.
            for obj in list(self._pending_objects.values()) + list(self._objects.values()):
                obj.pointcloud = PointCloud2(
                    pointcloud=o3d.geometry.PointCloud(),
                    frame_id=obj.pointcloud.frame_id,
                    ts=obj.pointcloud.ts,
                )
            self._pending_objects.clear()
            self._objects.clear()
            self._track_id_map.clear()
            logger.info("ObjectDB cleared")

    def get_stats(self) -> dict[str, int]:
        """Get statistics about the database."""
        with self._lock:
            return {
                "pending_count": len(self._pending_objects),
                "permanent_count": len(self._objects),
                "total_count": len(self._pending_objects) + len(self._objects),
            }

    # ─────────────────────────────────────────────────────────────────
    # Internal Methods (callers must hold self._lock)
    # ─────────────────────────────────────────────────────────────────

    def _match(self, obj: Object, now: float) -> tuple[Object | None, str | None]:
        """Try track-ID match first (cheap, exact), then spatial proximity."""
        if obj.track_id >= 0:
            matched = self._match_by_track_id(obj.track_id, now)
            if matched is not None:
                return matched, "track"

        matched = self._match_by_distance(obj)
        if matched is not None:
            return matched, "distance"
        return None, None

    def _insert_pending(self, obj: Object, now: float) -> Object:
        """Register a brand-new detection in the pending tier."""
        if not obj.ts:
            obj.ts = now
        self._pending_objects[obj.object_id] = obj
        if obj.track_id >= 0:
            self._track_id_map[obj.track_id] = obj.object_id
        logger.info(f"Created new pending object {obj.object_id} ({obj.name})")
        return obj

    def _update_existing(self, existing: Object, obj: Object, now: float) -> None:
        """Fold a new detection into an already-known object."""
        existing.update_object(obj)
        existing.ts = obj.ts or now
        if obj.track_id >= 0:
            self._track_id_map[obj.track_id] = existing.object_id

    def _match_by_track_id(self, track_id: int, now: float) -> Object | None:
        """Find object with matching track_id from YOLOE."""
        if track_id < 0:
            return None

        object_id = self._track_id_map.get(track_id)
        if object_id is None:
            return None

        # Check in permanent objects first
        if object_id in self._objects:
            obj = self._objects[object_id]
        elif object_id in self._pending_objects:
            obj = self._pending_objects[object_id]
        else:
            # Mapping points at an object that no longer exists — drop it.
            del self._track_id_map[track_id]
            return None

        # Expire the mapping if the object hasn't been seen recently; tracker
        # IDs get recycled, so stale mappings cause false merges.
        last_seen = obj.ts if obj.ts else now
        if now - last_seen > self._track_id_ttl_s:
            del self._track_id_map[track_id]
            return None

        return obj

    def _match_by_distance(self, obj: Object) -> Object | None:
        """Find the closest object within the distance threshold."""
        if obj.center is None:
            return None

        # Compute each candidate distance once (previously computed twice:
        # once to filter, once inside min()).
        best: Object | None = None
        best_dist = self._distance_threshold
        for cand in list(self._objects.values()) + list(self._pending_objects.values()):
            if cand.center is None:
                continue
            dist = obj.center.distance(cand.center)
            if dist < best_dist:
                best = cand
                best_dist = dist
        return best

    def _prune_stale_pending(self, now: float) -> None:
        """Drop pending objects unseen for longer than the pending TTL."""
        if self._pending_ttl_s <= 0:
            return
        cutoff = now - self._pending_ttl_s
        stale_ids = {
            obj_id for obj_id, obj in self._pending_objects.items() if (obj.ts or now) < cutoff
        }
        if not stale_ids:
            return
        for obj_id in stale_ids:
            del self._pending_objects[obj_id]
        # Single pass over the track map (was one full scan per stale object,
        # i.e. O(stale x tracks)).
        for track_id, mapped_id in list(self._track_id_map.items()):
            if mapped_id in stale_ids:
                del self._track_id_map[track_id]

    def _check_promotion(self, obj: Object) -> bool:
        """Move object from pending to permanent if threshold met."""
        if obj.detections_count >= self._min_detections:
            # Check if it's in pending
            if obj.object_id in self._pending_objects:
                # Promote to permanent
                del self._pending_objects[obj.object_id]
                self._objects[obj.object_id] = obj
                logger.info(
                    f"Promoted object {obj.object_id} ({obj.name}) to permanent "
                    f"with {obj.detections_count} detections"
                )
                return True
        return False

    # ─────────────────────────────────────────────────────────────────
    # Agent encoding
    # ─────────────────────────────────────────────────────────────────

    def agent_encode(self) -> list[dict[str, Any]]:
        """Encode permanent objects for agent consumption."""
        with self._lock:
            return [obj.agent_encode() for obj in self._objects.values()]

    def __len__(self) -> int:
        """Return number of permanent objects."""
        with self._lock:
            return len(self._objects)

    def __repr__(self) -> str:
        with self._lock:
            return f"ObjectDB(permanent={len(self._objects)}, pending={len(self._pending_objects)})"


__all__ = ["ObjectDB"]
buffer_size=2.0, - ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair))) # type: ignore[misc] + ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair))) # type: ignore[misc, arg-type] ) @rpc diff --git a/dimos/perception/detection/type/detection2d/__init__.py b/dimos/perception/detection/type/detection2d/__init__.py index ad3b7fa62e..8994d840b6 100644 --- a/dimos/perception/detection/type/detection2d/__init__.py +++ b/dimos/perception/detection/type/detection2d/__init__.py @@ -17,11 +17,13 @@ from dimos.perception.detection.type.detection2d.imageDetections2D import ImageDetections2D from dimos.perception.detection.type.detection2d.person import Detection2DPerson from dimos.perception.detection.type.detection2d.point import Detection2DPoint +from dimos.perception.detection.type.detection2d.seg import Detection2DSeg __all__ = [ "Detection2D", "Detection2DBBox", "Detection2DPerson", "Detection2DPoint", + "Detection2DSeg", "ImageDetections2D", ] diff --git a/dimos/perception/detection/type/detection2d/imageDetections2D.py b/dimos/perception/detection/type/detection2d/imageDetections2D.py index 680f9dd117..34033a9c50 100644 --- a/dimos/perception/detection/type/detection2d/imageDetections2D.py +++ b/dimos/perception/detection/type/detection2d/imageDetections2D.py @@ -14,28 +14,32 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic +from typing import TYPE_CHECKING, Any, Generic from typing_extensions import TypeVar from dimos.perception.detection.type.detection2d.base import Detection2D from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox +from dimos.perception.detection.type.detection2d.person import Detection2DPerson +from dimos.perception.detection.type.detection2d.seg import Detection2DSeg from dimos.perception.detection.type.imageDetections import ImageDetections if TYPE_CHECKING: - from dimos_lcm.vision_msgs import Detection2DArray - from 
ultralytics.engine.results import Results # type: ignore[import-not-found] + from ultralytics.engine.results import Results from dimos.msgs.sensor_msgs import Image + from dimos.msgs.vision_msgs import Detection2DArray -# TypeVar with default - Detection2DBBox is the default when no type param given T2D = TypeVar("T2D", bound=Detection2D, default=Detection2DBBox) class ImageDetections2D(ImageDetections[T2D], Generic[T2D]): @classmethod - def from_ros_detection2d_array( # type: ignore[no-untyped-def] - cls, image: Image, ros_detections: Detection2DArray, **kwargs + def from_ros_detection2d_array( + cls, + image: Image, + ros_detections: Detection2DArray, + **kwargs: Any, ) -> ImageDetections2D[Detection2DBBox]: """Convert from ROS Detection2DArray message to ImageDetections2D object.""" detections: list[Detection2DBBox] = [] @@ -47,24 +51,25 @@ def from_ros_detection2d_array( # type: ignore[no-untyped-def] return ImageDetections2D(image=image, detections=detections) @classmethod - def from_ultralytics_result( # type: ignore[no-untyped-def] - cls, image: Image, results: list[Results], **kwargs + def from_ultralytics_result( + cls, + image: Image, + results: list[Results], ) -> ImageDetections2D[Detection2DBBox]: """Create ImageDetections2D from ultralytics Results. 
Dispatches to appropriate Detection2D subclass based on result type: + - If masks present: creates Detection2DSeg - If keypoints present: creates Detection2DPerson - Otherwise: creates Detection2DBBox Args: image: Source image results: List of ultralytics Results objects - **kwargs: Additional arguments passed to detection constructors Returns: ImageDetections2D containing appropriate detection types """ - from dimos.perception.detection.type.detection2d.person import Detection2DPerson detections: list[Detection2DBBox] = [] for result in results: @@ -74,7 +79,10 @@ def from_ultralytics_result( # type: ignore[no-untyped-def] num_detections = len(result.boxes.xyxy) for i in range(num_detections): detection: Detection2DBBox - if result.keypoints is not None: + if result.masks is not None: + # Segmentation detection with mask + detection = Detection2DSeg.from_ultralytics_result(result, i, image) + elif result.keypoints is not None: # Pose detection with keypoints detection = Detection2DPerson.from_ultralytics_result(result, i, image) else: diff --git a/dimos/perception/detection/type/detection2d/seg.py b/dimos/perception/detection/type/detection2d/seg.py new file mode 100644 index 0000000000..21f8e8e689 --- /dev/null +++ b/dimos/perception/detection/type/detection2d/seg.py @@ -0,0 +1,204 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class Detection2DSeg(Detection2DBBox):
    """Represents a detection with a segmentation mask."""

    mask: np.ndarray[Any, np.dtype[np.uint8]]  # Binary mask [H, W], uint8 0 or 255

    @classmethod
    def from_sam2_result(
        cls,
        mask: np.ndarray[Any, Any] | torch.Tensor,
        obj_id: int,
        image: Image,
        class_id: int = 0,
        name: str = "object",
        confidence: float = 1.0,
    ) -> Detection2DSeg:
        """Create Detection2DSeg from SAM output (single object).

        Args:
            mask: Segmentation mask (logits or binary). Shape [H, W] or [1, H, W].
            obj_id: Tracking ID of the object.
            image: Source image.
            class_id: Class ID (default 0).
            name: Class name (default "object").
            confidence: Confidence score (default 1.0).

        Returns:
            Detection2DSeg instance.
        """
        # Convert mask to numpy if tensor
        if isinstance(mask, torch.Tensor):
            mask = mask.detach().cpu().numpy()

        # Handle dimensions (EdgeTAM might return [1, H, W] or [H, W])
        if mask.ndim == 3:
            mask = mask.squeeze()

        # Normalize every dtype to the declared 0/255 uint8 contract:
        # bool masks, float logits (> 0 is foreground), AND integer masks
        # that may arrive as 0/1 (previously left untouched, breaking the
        # "uint8 0 or 255" invariant downstream consumers rely on).
        if mask.dtype == bool:
            mask = mask.astype(np.uint8) * 255
        elif np.issubdtype(mask.dtype, np.floating):
            mask = (mask > 0.0).astype(np.uint8) * 255
        else:
            mask = (mask > 0).astype(np.uint8) * 255

        # Calculate bbox from the foreground pixels (zeros if mask is empty)
        y_indices, x_indices = np.where(mask > 0)
        if len(x_indices) > 0:
            x1_val, y1_val = float(np.min(x_indices)), float(np.min(y_indices))
            x2_val, y2_val = float(np.max(x_indices)), float(np.max(y_indices))
        else:
            x1_val = y1_val = x2_val = y2_val = 0.0

        bbox = (x1_val, y1_val, x2_val, y2_val)

        return cls(
            bbox=bbox,
            track_id=obj_id,
            class_id=class_id,
            confidence=confidence,
            name=name,
            ts=image.ts,
            image=image,
            mask=mask.astype(np.uint8),  # type: ignore[arg-type]
        )

    @classmethod
    def from_ultralytics_result(cls, result: Results, idx: int, image: Image) -> Detection2DSeg:
        """Create Detection2DSeg from ultralytics Results object with segmentation mask.

        Args:
            result: Ultralytics Results object containing detection and mask data
            idx: Index of the detection in the results
            image: Source image

        Returns:
            Detection2DSeg instance

        Raises:
            ValueError: If the result carries no boxes.
        """
        if result.boxes is None:
            raise ValueError("Result has no boxes")

        # Extract bounding box coordinates
        bbox_array = result.boxes.xyxy[idx].cpu().numpy()
        bbox: Bbox = (
            float(bbox_array[0]),
            float(bbox_array[1]),
            float(bbox_array[2]),
            float(bbox_array[3]),
        )

        # Extract confidence
        confidence = float(result.boxes.conf[idx].cpu())

        # Extract class ID and name (names may be a dict, a list, or absent)
        class_id = int(result.boxes.cls[idx].cpu())
        if hasattr(result, "names") and result.names is not None:
            if isinstance(result.names, dict):
                name = result.names.get(class_id, f"class_{class_id}")
            elif isinstance(result.names, list) and class_id < len(result.names):
                name = result.names[class_id]
            else:
                name = f"class_{class_id}"
        else:
            name = f"class_{class_id}"

        # Extract track ID if available (-1 means "untracked")
        track_id = -1
        if hasattr(result.boxes, "id") and result.boxes.id is not None:
            track_id = int(result.boxes.id[idx].cpu())

        # Extract mask; fall back to an all-zero mask if none is present
        mask = np.zeros((image.height, image.width), dtype=np.uint8)
        if result.masks is not None and idx < len(result.masks.data):
            mask_tensor = result.masks.data[idx]
            mask_np = mask_tensor.cpu().numpy()

            # Resize mask to image size if needed (model masks are often
            # produced at the network's input resolution)
            if mask_np.shape != (image.height, image.width):
                mask_np = cv2.resize(
                    mask_np.astype(np.float32),
                    (image.width, image.height),
                    interpolation=cv2.INTER_LINEAR,
                )

            # Binarize mask
            mask = (mask_np > 0.5).astype(np.uint8) * 255  # type: ignore[assignment]

        return cls(
            bbox=bbox,
            track_id=track_id,
            class_id=class_id,
            confidence=confidence,
            name=name,
            ts=image.ts,
            image=image,
            mask=mask,
        )

    def to_points_annotation(self) -> list[PointsAnnotation]:
        """Override to include mask outline."""
        annotations = super().to_points_annotation()

        # Find contours of the binary mask
        contours, _ = cv2.findContours(self.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        for contour in contours:
            # Simplify contour to reduce points (0.5% of perimeter tolerance)
            epsilon = 0.005 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)

            points = []
            for pt in approx:
                points.append(Point2(x=float(pt[0][0]), y=float(pt[0][1])))

            # A loop needs at least 3 vertices to be drawable
            if len(points) < 3:
                continue

            annotations.append(
                PointsAnnotation(
                    timestamp=to_ros_stamp(self.ts),
                    outline_color=Color.from_string(str(self.class_id), alpha=1.0, brightness=1.25),
                    fill_color=Color.from_string(str(self.track_id), alpha=0.4),
                    thickness=1.0,
                    points_length=len(points),
                    points=points,
                    type=PointsAnnotation.LINE_LOOP,
                )
            )

        return annotations
annotations -from dataclasses import dataclass +from dataclasses import dataclass, field import functools from typing import Any -from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3 +from dimos_lcm.vision_msgs import ObjectHypothesis, ObjectHypothesisWithPose + +from dimos.msgs.geometry_msgs import Pose, PoseStamped, Quaternion, Transform, Vector3 +from dimos.msgs.std_msgs import Header +from dimos.msgs.vision_msgs import Detection3D from dimos.perception.detection.type.detection2d import Detection2DBBox @@ -29,11 +33,11 @@ class Detection3DBBox(Detection2DBBox): Represents a 3D detection as an oriented bounding box in world space. """ - transform: Transform # Camera to world transform - frame_id: str # Frame ID (e.g., "world", "map") center: Vector3 # Center point in world frame size: Vector3 # Width, height, depth - orientation: tuple[float, float, float, float] # Quaternion (x, y, z, w) + transform: Transform | None = None # Camera to world transform + frame_id: str = "" # Frame ID (e.g., "world", "map") + orientation: Quaternion = field(default_factory=lambda: Quaternion(0.0, 0.0, 0.0, 1.0)) @functools.cached_property def pose(self) -> PoseStamped: @@ -48,8 +52,34 @@ def pose(self) -> PoseStamped: orientation=self.orientation, ) + def to_detection3d_msg(self) -> Detection3D: + """Convert to ROS Detection3D message.""" + msg = Detection3D() + msg.header = Header(self.ts, self.frame_id) + + # Results + msg.results = [ + ObjectHypothesisWithPose( + hypothesis=ObjectHypothesis( + class_id=str(self.class_id), + score=self.confidence, + ) + ) + ] + + # Bounding Box + msg.bbox.center = Pose( + position=self.center, + orientation=self.orientation, + ) + msg.bbox.size = self.size + + return msg + def to_repr_dict(self) -> dict[str, Any]: # Calculate distance from camera + if self.transform is None: + return super().to_repr_dict() camera_pos = self.transform.translation distance = (self.center - camera_pos).magnitude() diff --git 
a/dimos/perception/detection/type/detection3d/object.py b/dimos/perception/detection/type/detection3d/object.py new file mode 100644 index 0000000000..00d4d88661 --- /dev/null +++ b/dimos/perception/detection/type/detection3d/object.py @@ -0,0 +1,363 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +import time +from typing import TYPE_CHECKING, Any +import uuid + +import cv2 +from dimos_lcm.geometry_msgs import Pose +import numpy as np +import open3d as o3d # type: ignore[import-untyped] + +from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Vector3 +from dimos.msgs.sensor_msgs import Image, PointCloud2 +from dimos.msgs.std_msgs import Header +from dimos.msgs.vision_msgs import Detection3D as ROSDetection3D, Detection3DArray +from dimos.perception.detection.type.detection2d.seg import Detection2DSeg +from dimos.perception.detection.type.detection3d.base import Detection3D + +if TYPE_CHECKING: + from dimos_lcm.sensor_msgs import CameraInfo + + from dimos.perception.detection.type.detection2d import ImageDetections2D + + +@dataclass(kw_only=True) +class Object(Detection3D): + """3D object detection combining bounding box and pointcloud representations. + + Represents a detected object in 3D space with support for accumulating + multiple detections over time. 
+ """ + + object_id: str = field(default_factory=lambda: uuid.uuid4().hex[:8]) + center: Vector3 + size: Vector3 + pose: PoseStamped + pointcloud: PointCloud2 + camera_transform: Transform | None = None + mask: np.ndarray[Any, np.dtype[np.uint8]] | None = None + detections_count: int = 1 + + def update_object(self, other: Object) -> None: + """Update this object with data from another detection. + + Accumulates pointclouds by transforming the new pointcloud to world frame + and adding it to the existing pointcloud. Updates center and camera_transform, + and increments the detections_count. + + Args: + other: Another Object instance with newer detection data. + """ + # Accumulate pointclouds if transforms are available + if other.camera_transform is not None: + # Transform new pointcloud to world frame and add to existing + # transformed_pc = other.pointcloud.transform(other.camera_transform) + # self.pointcloud = self.pointcloud + transformed_pc + + # Recompute center from accumulated pointcloud + self.pointcloud = other.pointcloud + pc_center = other.pointcloud.center + self.center = Vector3(pc_center.x, pc_center.y, pc_center.z) + else: + # No transform available, just replace + self.pointcloud = other.pointcloud + self.center = other.center + + self.camera_transform = other.camera_transform + self.size = other.size + self.pose = other.pose + self.track_id = other.track_id + self.mask = other.mask + self.name = other.name + self.bbox = other.bbox + self.confidence = other.confidence + self.class_id = other.class_id + self.ts = other.ts + self.frame_id = other.frame_id + self.image = other.image + self.detections_count += 1 + + def get_oriented_bounding_box(self) -> Any: + """Get oriented bounding box of the pointcloud.""" + return self.pointcloud.get_oriented_bounding_box() + + def scene_entity_label(self) -> str: + """Get label for scene visualization.""" + if self.detections_count > 1: + return f"{self.name} ({self.detections_count})" + return 
f"{self.track_id}/{self.name} ({self.confidence:.0%})" + + def to_detection3d_msg(self) -> ROSDetection3D: + """Convert to ROS Detection3D message.""" + obb = self.get_oriented_bounding_box() # type: ignore[no-untyped-call] + orientation = Quaternion.from_rotation_matrix(obb.R) + + msg = ROSDetection3D() + msg.header = Header(self.ts, self.frame_id) + msg.id = str(self.track_id) + msg.bbox.center = Pose( + position=Vector3(obb.center[0], obb.center[1], obb.center[2]), + orientation=orientation, + ) + msg.bbox.size = Vector3(obb.extent[0], obb.extent[1], obb.extent[2]) + + return msg + + def agent_encode(self) -> dict[str, Any]: + """Encode for agent consumption.""" + return { + "id": self.track_id, + "name": self.name, + "detections": self.detections_count, + "last_seen": f"{round(time.time() - self.ts)}s ago", + } + + def to_dict(self) -> dict[str, Any]: + """Convert object to dictionary with all relevant data.""" + return { + "object_id": self.object_id, + "track_id": self.track_id, + "class_id": self.class_id, + "name": self.name, + "mask": self.mask, + "pointcloud": self.pointcloud.as_numpy(), + "image": self.image.as_numpy() if self.image else None, + } + + @classmethod + def from_2d_to_list( + cls, + detections_2d: ImageDetections2D[Detection2DSeg], + color_image: Image, + depth_image: Image, + camera_info: CameraInfo, + camera_transform: Transform | None = None, + depth_scale: float = 1.0, + depth_trunc: float = 10.0, + statistical_nb_neighbors: int = 10, + statistical_std_ratio: float = 0.5, + voxel_downsample: float = 0.005, + mask_erode_pixels: int = 3, + ) -> list[Object]: + """Create 3D Objects from 2D detections and RGBD images. + + Uses Open3D's optimized RGBD projection for efficient processing. + + Args: + detections_2d: 2D detections with segmentation masks + color_image: RGB color image + depth_image: Depth image (in meters if depth_scale=1.0) + camera_info: Camera intrinsics + camera_transform: Optional transform from camera frame to world frame. 
+ If provided, pointclouds will be transformed to world frame. + depth_scale: Scale factor for depth (1.0 for meters, 1000.0 for mm) + depth_trunc: Maximum depth value in meters + statistical_nb_neighbors: Neighbors for statistical outlier removal + statistical_std_ratio: Std ratio for statistical outlier removal + voxel_downsample: Voxel size (meters) for downsampling before filtering. Set <= 0 to skip. + mask_erode_pixels: Number of pixels to erode the mask by to remove + noisy depth edge points. Set to 0 to disable. + + Returns: + List of Object instances with pointclouds + """ + color_cv = color_image.to_opencv() + if color_cv.ndim == 3 and color_cv.shape[2] == 3: + color_cv = cv2.cvtColor(color_cv, cv2.COLOR_BGR2RGB) + + depth_cv = depth_image.to_opencv() + h, w = depth_cv.shape[:2] + + # Build Open3D camera intrinsics + fx, fy = camera_info.K[0], camera_info.K[4] + cx, cy = camera_info.K[2], camera_info.K[5] + intrinsic_o3d = o3d.camera.PinholeCameraIntrinsic(w, h, fx, fy, cx, cy) + + objects: list[Object] = [] + + for det in detections_2d.detections: + if isinstance(det, Detection2DSeg): + mask = det.mask + store_mask = det.mask + else: + mask = np.zeros((h, w), dtype=np.uint8) + x1, y1, x2, y2 = map(int, det.bbox) + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + mask[y1:y2, x1:x2] = 255 + store_mask = mask + + if mask_erode_pixels > 0: + mask_uint8 = mask.astype(np.uint8) + if mask_uint8.max() == 1: + mask_uint8 = mask_uint8 * 255 # type: ignore[assignment] + kernel_size = 2 * mask_erode_pixels + 1 + erode_kernel = cv2.getStructuringElement( + cv2.MORPH_ELLIPSE, (kernel_size, kernel_size) + ) + mask = cv2.erode(mask_uint8, erode_kernel) # type: ignore[assignment] + + depth_masked = depth_cv.copy() + depth_masked[mask == 0] = 0 + + rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( + o3d.geometry.Image(color_cv.astype(np.uint8)), + o3d.geometry.Image(depth_masked.astype(np.float32)), + depth_scale=depth_scale, + 
depth_trunc=depth_trunc, + convert_rgb_to_intensity=False, + ) + pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic_o3d) + + pc0 = PointCloud2( + pcd, + frame_id=depth_image.frame_id, + ts=depth_image.ts, + ).voxel_downsample(voxel_downsample) + + pcd_filtered, _ = pc0.pointcloud.remove_statistical_outlier( + nb_neighbors=statistical_nb_neighbors, + std_ratio=statistical_std_ratio, + ) + + if len(pcd_filtered.points) < 10: + continue + + pc = PointCloud2( + pcd_filtered, + frame_id=depth_image.frame_id, + ts=depth_image.ts, + ) + + # Transform pointcloud to world frame if camera_transform is provided + if camera_transform is not None: + pc = pc.transform(camera_transform) + frame_id = camera_transform.frame_id + else: + frame_id = depth_image.frame_id + + # Compute center from pointcloud + obb = pc.pointcloud.get_oriented_bounding_box() + center = Vector3(obb.center[0], obb.center[1], obb.center[2]) + size = Vector3(obb.extent[0], obb.extent[1], obb.extent[2]) + orientation = Quaternion.from_rotation_matrix(obb.R) + pose = PoseStamped( + ts=det.ts, + frame_id=frame_id, + position=center, + orientation=orientation, + ) + + objects.append( + cls( + bbox=det.bbox, + track_id=det.track_id, + class_id=det.class_id, + confidence=det.confidence, + name=det.name, + ts=det.ts, + image=det.image, + frame_id=frame_id, + pointcloud=pc, + center=center, + size=size, + pose=pose, + camera_transform=camera_transform, + mask=store_mask, + ) + ) + + return objects + + +def aggregate_pointclouds(objects: list[Object]) -> PointCloud2: + """Aggregate all object pointclouds into a single colored pointcloud. + + Each object's points are colored based on its track_id. + + Args: + objects: List of Object instances with pointclouds + + Returns: + Combined PointCloud2 with all points colored by object (empty if no points). 
+ """ + if not objects: + return PointCloud2(pointcloud=o3d.geometry.PointCloud(), frame_id="", ts=0.0) + + all_points = [] + all_colors = [] + + for _i, obj in enumerate(objects): + points, colors = obj.pointcloud.as_numpy() + if len(points) == 0: + continue + + try: + seed = int(obj.object_id, 16) + except (ValueError, TypeError): + seed = abs(hash(obj.object_id)) + np.random.seed(abs(seed) % (2**32 - 1)) + track_color = np.random.randint(50, 255, 3) / 255.0 + + if colors is not None: + blended = np.clip(0.6 * colors + 0.4 * track_color, 0.0, 1.0) + else: + blended = np.tile(track_color, (len(points), 1)) + + all_points.append(points) + all_colors.append(blended) + + if not all_points: + return PointCloud2( + pointcloud=o3d.geometry.PointCloud(), frame_id=objects[0].frame_id, ts=objects[0].ts + ) + + combined_points = np.vstack(all_points) + combined_colors = np.vstack(all_colors) + + pc = PointCloud2.from_numpy( + combined_points, + frame_id=objects[0].frame_id, + timestamp=objects[0].ts, + ) + pcd = pc.pointcloud + pcd.colors = o3d.utility.Vector3dVector(combined_colors) + pc.pointcloud = pcd + + return pc + + +def to_detection3d_array(objects: list[Object]) -> Detection3DArray: + """Convert a list of Objects to a ROS Detection3DArray message. 
+ + Args: + objects: List of Object instances + + Returns: + Detection3DArray ROS message + """ + array = Detection3DArray() + + if objects: + array.header = Header(objects[0].ts, objects[0].frame_id) + + for obj in objects: + array.detections.append(obj.to_detection3d_msg()) + + return array diff --git a/dimos/perception/detection/type/detection3d/pointcloud.py b/dimos/perception/detection/type/detection3d/pointcloud.py index fd924a6564..7edceb17a5 100644 --- a/dimos/perception/detection/type/detection3d/pointcloud.py +++ b/dimos/perception/detection/type/detection3d/pointcloud.py @@ -14,7 +14,7 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field import functools from typing import TYPE_CHECKING, Any @@ -52,7 +52,7 @@ @dataclass class Detection3DPC(Detection3D): - pointcloud: PointCloud2 + pointcloud: PointCloud2 = field(default_factory=PointCloud2) @functools.cached_property def center(self) -> Vector3: diff --git a/dimos/perception/detection2d/utils.py b/dimos/perception/detection2d/utils.py deleted file mode 100644 index a505eef7c8..0000000000 --- a/dimos/perception/detection2d/utils.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from collections.abc import Sequence - -import cv2 -import numpy as np - - -def filter_detections( # type: ignore[no-untyped-def] - bboxes, - track_ids, - class_ids, - confidences, - names: Sequence[str], - class_filter=None, - name_filter=None, - track_id_filter=None, -): - """ - Filter detection results based on class IDs, names, and/or tracking IDs. - - Args: - bboxes: List of bounding boxes [x1, y1, x2, y2] - track_ids: List of tracking IDs - class_ids: List of class indices - confidences: List of detection confidences - names: List of class names - class_filter: List/set of class IDs to keep, or None to keep all - name_filter: List/set of class names to keep, or None to keep all - track_id_filter: List/set of track IDs to keep, or None to keep all - - Returns: - tuple: (filtered_bboxes, filtered_track_ids, filtered_class_ids, - filtered_confidences, filtered_names) - """ - # Convert filters to sets for efficient lookup - if class_filter is not None: - class_filter = set(class_filter) - if name_filter is not None: - name_filter = set(name_filter) - if track_id_filter is not None: - track_id_filter = set(track_id_filter) - - # Initialize lists for filtered results - filtered_bboxes = [] - filtered_track_ids = [] - filtered_class_ids = [] - filtered_confidences = [] - filtered_names = [] - - # Filter detections - for bbox, track_id, class_id, conf, name in zip( - bboxes, track_ids, class_ids, confidences, names, strict=False - ): - # Check if detection passes all specified filters - keep = True - - if class_filter is not None: - keep = keep and (class_id in class_filter) - - if name_filter is not None: - keep = keep and (name in name_filter) - - if track_id_filter is not None: - keep = keep and (track_id in track_id_filter) - - # If detection passes all filters, add it to results - if keep: - filtered_bboxes.append(bbox) - filtered_track_ids.append(track_id) - filtered_class_ids.append(class_id) - filtered_confidences.append(conf) - filtered_names.append(name) 
- - return ( - filtered_bboxes, - filtered_track_ids, - filtered_class_ids, - filtered_confidences, - filtered_names, - ) - - -def extract_detection_results(result, class_filter=None, name_filter=None, track_id_filter=None): # type: ignore[no-untyped-def] - """ - Extract and optionally filter detection information from a YOLO result object. - - Args: - result: Ultralytics result object - class_filter: List/set of class IDs to keep, or None to keep all - name_filter: List/set of class names to keep, or None to keep all - track_id_filter: List/set of track IDs to keep, or None to keep all - - Returns: - tuple: (bboxes, track_ids, class_ids, confidences, names) - - bboxes: list of [x1, y1, x2, y2] coordinates - - track_ids: list of tracking IDs - - class_ids: list of class indices - - confidences: list of detection confidences - - names: list of class names - """ - bboxes = [] # type: ignore[var-annotated] - track_ids = [] # type: ignore[var-annotated] - class_ids = [] # type: ignore[var-annotated] - confidences = [] # type: ignore[var-annotated] - names = [] # type: ignore[var-annotated] - - if result.boxes is None: - return bboxes, track_ids, class_ids, confidences, names - - for box in result.boxes: - # Extract bounding box coordinates - x1, y1, x2, y2 = box.xyxy[0].tolist() - - # Extract tracking ID if available - track_id = -1 - if hasattr(box, "id") and box.id is not None: - track_id = int(box.id[0].item()) - - # Extract class information - cls_idx = int(box.cls[0]) - name = result.names[cls_idx] - - # Extract confidence - conf = float(box.conf[0]) - - # Check filters before adding to results - keep = True - if class_filter is not None: - keep = keep and (cls_idx in class_filter) - if name_filter is not None: - keep = keep and (name in name_filter) - if track_id_filter is not None: - keep = keep and (track_id in track_id_filter) - - if keep: - bboxes.append([x1, y1, x2, y2]) - track_ids.append(track_id) - class_ids.append(cls_idx) - confidences.append(conf) - 
names.append(name) - - return bboxes, track_ids, class_ids, confidences, names - - -def plot_results( # type: ignore[no-untyped-def] - image, bboxes, track_ids, class_ids, confidences, names: Sequence[str], alpha: float = 0.5 -): - """ - Draw bounding boxes and labels on the image. - - Args: - image: Original input image - bboxes: List of bounding boxes [x1, y1, x2, y2] - track_ids: List of tracking IDs - class_ids: List of class indices - confidences: List of detection confidences - names: List of class names - alpha: Transparency of the overlay - - Returns: - Image with visualized detections - """ - vis_img = image.copy() - - for bbox, track_id, conf, name in zip(bboxes, track_ids, confidences, names, strict=False): - # Generate consistent color based on track_id or class name - if track_id != -1: - np.random.seed(track_id) - else: - np.random.seed(hash(name) % 100000) - color = np.random.randint(0, 255, (3,), dtype=np.uint8) - np.random.seed(None) - - # Draw bounding box - x1, y1, x2, y2 = map(int, bbox) - cv2.rectangle(vis_img, (x1, y1), (x2, y2), color.tolist(), 2) - - # Prepare label text - if track_id != -1: - label = f"ID:{track_id} {name} {conf:.2f}" - else: - label = f"{name} {conf:.2f}" - - # Calculate text size for background rectangle - (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - - # Draw background rectangle for text - cv2.rectangle(vis_img, (x1, y1 - text_h - 8), (x1 + text_w + 4, y1), color.tolist(), -1) - - # Draw text with white color for better visibility - cv2.putText( - vis_img, label, (x1 + 2, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1 - ) - - return vis_img - - -def calculate_depth_from_bbox(depth_map, bbox): # type: ignore[no-untyped-def] - """ - Calculate the average depth of an object within a bounding box. - Uses the 25th to 75th percentile range to filter outliers. 
- - Args: - depth_map: The depth map - bbox: Bounding box in format [x1, y1, x2, y2] - - Returns: - float: Average depth in meters, or None if depth estimation fails - """ - try: - # Extract region of interest from the depth map - x1, y1, x2, y2 = map(int, bbox) - roi_depth = depth_map[y1:y2, x1:x2] - - if roi_depth.size == 0: - return None - - # Calculate 25th and 75th percentile to filter outliers - p25 = np.percentile(roi_depth, 25) - p75 = np.percentile(roi_depth, 75) - - # Filter depth values within this range - filtered_depth = roi_depth[(roi_depth >= p25) & (roi_depth <= p75)] - - # Calculate average depth (convert to meters) - if filtered_depth.size > 0: - return np.mean(filtered_depth) / 1000.0 # Convert mm to meters - - return None - except Exception as e: - print(f"Error calculating depth from bbox: {e}") - return None - - -def calculate_distance_angle_from_bbox(bbox, depth: int, camera_intrinsics): # type: ignore[no-untyped-def] - """ - Calculate distance and angle to object center based on bbox and depth. 
- - Args: - bbox: Bounding box [x1, y1, x2, y2] - depth: Depth value in meters - camera_intrinsics: List [fx, fy, cx, cy] with camera parameters - - Returns: - tuple: (distance, angle) in meters and radians - """ - if camera_intrinsics is None: - raise ValueError("Camera intrinsics required for distance calculation") - - # Extract camera parameters - fx, _fy, cx, _cy = camera_intrinsics - - # Calculate center of bounding box in pixels - x1, y1, x2, y2 = bbox - center_x = (x1 + x2) / 2 - (y1 + y2) / 2 - - # Calculate normalized image coordinates - x_norm = (center_x - cx) / fx - - # Calculate angle (positive to the right) - angle = np.arctan(x_norm) - - # Calculate distance using depth and angle - distance = depth / np.cos(angle) if np.cos(angle) != 0 else depth - - return distance, angle - - -def calculate_object_size_from_bbox(bbox, depth: int, camera_intrinsics): # type: ignore[no-untyped-def] - """ - Estimate physical width and height of object in meters. - - Args: - bbox: Bounding box [x1, y1, x2, y2] - depth: Depth value in meters - camera_intrinsics: List [fx, fy, cx, cy] with camera parameters - - Returns: - tuple: (width, height) in meters - """ - if camera_intrinsics is None: - return 0.0, 0.0 - - fx, fy, _, _ = camera_intrinsics - - # Calculate bbox dimensions in pixels - x1, y1, x2, y2 = bbox - width_px = x2 - x1 - height_px = y2 - y1 - - # Convert to meters using similar triangles and depth - width_m = (width_px * depth) / fx - height_m = (height_px * depth) / fy - - return width_m, height_m diff --git a/dimos/perception/grasp_generation/__init__.py b/dimos/perception/grasp_generation/__init__.py deleted file mode 100644 index 16281fe0b6..0000000000 --- a/dimos/perception/grasp_generation/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .utils import * diff --git a/dimos/perception/grasp_generation/grasp_generation.py b/dimos/perception/grasp_generation/grasp_generation.py deleted file mode 100644 index 4f2e4b68a1..0000000000 --- 
a/dimos/perception/grasp_generation/grasp_generation.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Dimensional-hosted grasp generation for manipulation pipeline. -""" - -import asyncio - -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - -from dimos.perception.grasp_generation.utils import parse_grasp_results -from dimos.types.manipulation import ObjectData -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class HostedGraspGenerator: - """ - Dimensional-hosted grasp generator using WebSocket communication. - """ - - def __init__(self, server_url: str) -> None: - """ - Initialize Dimensional-hosted grasp generator. - - Args: - server_url: WebSocket URL for Dimensional-hosted grasp generator server - """ - self.server_url = server_url - logger.info(f"Initialized grasp generator with server: {server_url}") - - def generate_grasps_from_objects( - self, objects: list[ObjectData], full_pcd: o3d.geometry.PointCloud - ) -> list[dict]: # type: ignore[type-arg] - """ - Generate grasps from ObjectData objects using grasp generator. 
- - Args: - objects: List of ObjectData with point clouds - full_pcd: Open3D point cloud of full scene - - Returns: - Parsed grasp results as list of dictionaries - """ - try: - # Combine all point clouds - all_points = [] - all_colors = [] - valid_objects = 0 - - for obj in objects: - if "point_cloud_numpy" not in obj or obj["point_cloud_numpy"] is None: - continue - - points = obj["point_cloud_numpy"] - if not isinstance(points, np.ndarray) or points.size == 0: - continue - - if len(points.shape) != 2 or points.shape[1] != 3: - continue - - colors = None - if "colors_numpy" in obj and obj["colors_numpy"] is not None: # type: ignore[typeddict-item] - colors = obj["colors_numpy"] # type: ignore[typeddict-item] - if isinstance(colors, np.ndarray) and colors.size > 0: - if ( - colors.shape[0] != points.shape[0] - or len(colors.shape) != 2 - or colors.shape[1] != 3 - ): - colors = None - - all_points.append(points) - if colors is not None: - all_colors.append(colors) - valid_objects += 1 - - if not all_points: - return [] - - # Combine point clouds - combined_points = np.vstack(all_points) - combined_colors = None - if len(all_colors) == valid_objects and len(all_colors) > 0: - combined_colors = np.vstack(all_colors) - - # Send grasp request - grasps = self._send_grasp_request_sync(combined_points, combined_colors) - - if not grasps: - return [] - - # Parse and return results in list of dictionaries format - return parse_grasp_results(grasps) - - except Exception as e: - logger.error(f"Grasp generation failed: {e}") - return [] - - def _send_grasp_request_sync( - self, - points: np.ndarray, # type: ignore[type-arg] - colors: np.ndarray | None, # type: ignore[type-arg] - ) -> list[dict] | None: # type: ignore[type-arg] - """Send synchronous grasp request to grasp server.""" - - try: - # Prepare colors - colors = np.ones((points.shape[0], 3), dtype=np.float32) * 0.5 - - # Ensure correct data types - points = points.astype(np.float32) - colors = colors.astype(np.float32) 
- - # Validate ranges - if np.any(np.isnan(points)) or np.any(np.isinf(points)): - logger.error("Points contain NaN or Inf values") - return None - if np.any(np.isnan(colors)) or np.any(np.isinf(colors)): - logger.error("Colors contain NaN or Inf values") - return None - - colors = np.clip(colors, 0.0, 1.0) - - # Run async request in sync context - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - result = loop.run_until_complete(self._async_grasp_request(points, colors)) - return result - finally: - loop.close() - - except Exception as e: - logger.error(f"Error in synchronous grasp request: {e}") - return None - - async def _async_grasp_request( - self, - points: np.ndarray, # type: ignore[type-arg] - colors: np.ndarray, # type: ignore[type-arg] - ) -> list[dict] | None: # type: ignore[type-arg] - """Async grasp request helper.""" - import json - - import websockets - - try: - async with websockets.connect(self.server_url) as websocket: - request = { - "points": points.tolist(), - "colors": colors.tolist(), - "lims": [-1.0, 1.0, -1.0, 1.0, 0.0, 2.0], - } - - await websocket.send(json.dumps(request)) - response = await websocket.recv() - grasps = json.loads(response) - - if isinstance(grasps, dict) and "error" in grasps: - logger.error(f"Server returned error: {grasps['error']}") - return None - elif isinstance(grasps, int | float) and grasps == 0: - return None - elif not isinstance(grasps, list): - logger.error(f"Server returned unexpected response type: {type(grasps)}") - return None - elif len(grasps) == 0: - return None - - return self._convert_grasp_format(grasps) - - except Exception as e: - logger.error(f"Async grasp request failed: {e}") - return None - - def _convert_grasp_format(self, grasps: list[dict]) -> list[dict]: # type: ignore[type-arg] - """Convert Dimensional Grasp format to visualization format.""" - converted = [] - - for i, grasp in enumerate(grasps): - rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) 
- euler_angles = self._rotation_matrix_to_euler(rotation_matrix) - - converted_grasp = { - "id": f"grasp_{i}", - "score": grasp.get("score", 0.0), - "width": grasp.get("width", 0.0), - "height": grasp.get("height", 0.0), - "depth": grasp.get("depth", 0.0), - "translation": grasp.get("translation", [0, 0, 0]), - "rotation_matrix": rotation_matrix.tolist(), - "euler_angles": euler_angles, - } - converted.append(converted_grasp) - - converted.sort(key=lambda x: x["score"], reverse=True) - return converted - - def _rotation_matrix_to_euler(self, rotation_matrix: np.ndarray) -> dict[str, float]: # type: ignore[type-arg] - """Convert rotation matrix to Euler angles (in radians).""" - sy = np.sqrt(rotation_matrix[0, 0] ** 2 + rotation_matrix[1, 0] ** 2) - - singular = sy < 1e-6 - - if not singular: - x = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) - else: - x = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) - y = np.arctan2(-rotation_matrix[2, 0], sy) - z = 0 - - return {"roll": x, "pitch": y, "yaw": z} - - def cleanup(self) -> None: - """Clean up resources.""" - logger.info("Grasp generator cleaned up") diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py deleted file mode 100644 index 492a3d1df4..0000000000 --- a/dimos/perception/grasp_generation/utils.py +++ /dev/null @@ -1,529 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utilities for grasp generation and visualization.""" - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - -from dimos.perception.common.utils import project_3d_points_to_2d - - -def create_gripper_geometry( - grasp_data: dict, # type: ignore[type-arg] - finger_length: float = 0.08, - finger_thickness: float = 0.004, -) -> list[o3d.geometry.TriangleMesh]: - """ - Create a simple fork-like gripper geometry from grasp data. - - Args: - grasp_data: Dictionary containing grasp parameters - - translation: 3D position list - - rotation_matrix: 3x3 rotation matrix defining gripper coordinate system - * X-axis: gripper width direction (opening/closing) - * Y-axis: finger length direction - * Z-axis: approach direction (toward object) - - width: Gripper opening width - finger_length: Length of gripper fingers (longer) - finger_thickness: Thickness of gripper fingers - base_height: Height of gripper base (longer) - color: RGB color for the gripper (solid blue) - - Returns: - List of Open3D TriangleMesh geometries for the gripper - """ - - translation = np.array(grasp_data["translation"]) - rotation_matrix = np.array(grasp_data["rotation_matrix"]) - - width = grasp_data.get("width", 0.04) - - # Create transformation matrix - transform = np.eye(4) - transform[:3, :3] = rotation_matrix - transform[:3, 3] = translation - - geometries = [] - - # Gripper dimensions - finger_width = 0.006 # Thickness of each finger - handle_length = 0.05 # Length of handle extending backward - - # Build gripper in local coordinate system: - # X-axis = width direction (left/right finger separation) - # Y-axis = finger length direction (fingers extend along +Y) - # Z-axis = approach direction (toward object, handle extends along -Z) - # IMPORTANT: Fingertips should be at origin (translation point) - - # Create left finger extending along +Y, positioned at 
+X - left_finger = o3d.geometry.TriangleMesh.create_box( - width=finger_width, # Thin finger - height=finger_length, # Extends along Y (finger length direction) - depth=finger_thickness, # Thin in Z direction - ) - left_finger.translate( - [ - width / 2 - finger_width / 2, # Position at +X (half width from center) - -finger_length, # Shift so fingertips are at origin - -finger_thickness / 2, # Center in Z - ] - ) - - # Create right finger extending along +Y, positioned at -X - right_finger = o3d.geometry.TriangleMesh.create_box( - width=finger_width, # Thin finger - height=finger_length, # Extends along Y (finger length direction) - depth=finger_thickness, # Thin in Z direction - ) - right_finger.translate( - [ - -width / 2 - finger_width / 2, # Position at -X (half width from center) - -finger_length, # Shift so fingertips are at origin - -finger_thickness / 2, # Center in Z - ] - ) - - # Create base connecting fingers - flat like a stickman body - base = o3d.geometry.TriangleMesh.create_box( - width=width + finger_width, # Full width plus finger thickness - height=finger_thickness, # Flat like fingers (stickman style) - depth=finger_thickness, # Thin like fingers - ) - base.translate( - [ - -width / 2 - finger_width / 2, # Start from left finger position - -finger_length - finger_thickness, # Behind fingers, adjusted for fingertips at origin - -finger_thickness / 2, # Center in Z - ] - ) - - # Create handle extending backward - flat stick like stickman arm - handle = o3d.geometry.TriangleMesh.create_box( - width=finger_width, # Same width as fingers - height=handle_length, # Extends backward along Y direction (same plane) - depth=finger_thickness, # Thin like fingers (same plane) - ) - handle.translate( - [ - -finger_width / 2, # Center in X - -finger_length - - finger_thickness - - handle_length, # Extend backward from base, adjusted for fingertips at origin - -finger_thickness / 2, # Same Z plane as other components - ] - ) - - # Use solid red color for all 
parts (user changed to red) - solid_color = [1.0, 0.0, 0.0] # Red color - - left_finger.paint_uniform_color(solid_color) - right_finger.paint_uniform_color(solid_color) - base.paint_uniform_color(solid_color) - handle.paint_uniform_color(solid_color) - - # Apply transformation to all parts - left_finger.transform(transform) - right_finger.transform(transform) - base.transform(transform) - handle.transform(transform) - - geometries.extend([left_finger, right_finger, base, handle]) - - return geometries - - -def create_all_gripper_geometries( - grasp_list: list[dict], # type: ignore[type-arg] - max_grasps: int = -1, -) -> list[o3d.geometry.TriangleMesh]: - """ - Create gripper geometries for multiple grasps. - - Args: - grasp_list: List of grasp dictionaries - max_grasps: Maximum number of grasps to visualize (-1 for all) - - Returns: - List of all gripper geometries - """ - all_geometries = [] - - grasps_to_show = grasp_list if max_grasps < 0 else grasp_list[:max_grasps] - - for grasp in grasps_to_show: - gripper_parts = create_gripper_geometry(grasp) - all_geometries.extend(gripper_parts) - - return all_geometries - - -def draw_grasps_on_image( - image: np.ndarray, # type: ignore[type-arg] - grasp_data: dict | dict[int | str, list[dict]] | list[dict], # type: ignore[type-arg] - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] # [fx, fy, cx, cy] or 3x3 matrix - max_grasps: int = -1, # -1 means show all grasps - finger_length: float = 0.08, # Match 3D gripper - finger_thickness: float = 0.004, # Match 3D gripper -) -> np.ndarray: # type: ignore[type-arg] - """ - Draw fork-like gripper visualizations on the image matching 3D gripper design. 
- - Args: - image: Base image to draw on - grasp_data: Can be: - - A single grasp dict - - A list of grasp dicts - - A dictionary mapping object IDs or "scene" to list of grasps - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - max_grasps: Maximum number of grasps to visualize (-1 for all) - finger_length: Length of gripper fingers (matches 3D design) - finger_thickness: Thickness of gripper fingers (matches 3D design) - - Returns: - Image with grasps drawn - """ - result = image.copy() - - # Convert camera intrinsics to 3x3 matrix if needed - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]) - else: - camera_matrix = np.array(camera_intrinsics) - - # Convert input to standard format - if isinstance(grasp_data, dict) and not any( - key in grasp_data for key in ["scene", 0, 1, 2, 3, 4, 5] - ): - # Single grasp - grasps_to_draw = [(grasp_data, 0)] - elif isinstance(grasp_data, list): - # List of grasps - grasps_to_draw = [(grasp, i) for i, grasp in enumerate(grasp_data)] - else: - # Dictionary of grasps by object ID - grasps_to_draw = [] - for _obj_id, grasps in grasp_data.items(): - for i, grasp in enumerate(grasps): - grasps_to_draw.append((grasp, i)) - - # Limit number of grasps if specified - if max_grasps > 0: - grasps_to_draw = grasps_to_draw[:max_grasps] - - # Define grasp colors (solid red to match 3D design) - def get_grasp_color(index: int) -> tuple: # type: ignore[type-arg] - # Use solid red color for all grasps to match 3D design - return (0, 0, 255) # Red in BGR format for OpenCV - - # Draw each grasp - for grasp, index in grasps_to_draw: - try: - color = get_grasp_color(index) - thickness = max(1, 4 - index // 3) - - # Extract grasp parameters (using translation and rotation_matrix) - if "translation" not in grasp or "rotation_matrix" not in grasp: - continue - - translation = 
np.array(grasp["translation"]) - rotation_matrix = np.array(grasp["rotation_matrix"]) - width = grasp.get("width", 0.04) - - # Match 3D gripper dimensions - finger_width = 0.006 # Thickness of each finger (matches 3D) - handle_length = 0.05 # Length of handle extending backward (matches 3D) - - # Create gripper geometry in local coordinate system matching 3D design: - # X-axis = width direction (left/right finger separation) - # Y-axis = finger length direction (fingers extend along +Y) - # Z-axis = approach direction (toward object, handle extends along -Z) - # IMPORTANT: Fingertips should be at origin (translation point) - - # Left finger extending along +Y, positioned at +X - left_finger_points = np.array( - [ - [ - width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back left - [ - width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back right - [ - width / 2 + finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front right (at origin) - [ - width / 2 - finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front left (at origin) - ] - ) - - # Right finger extending along +Y, positioned at -X - right_finger_points = np.array( - [ - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back left - [ - -width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Back right - [ - -width / 2 + finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front right (at origin) - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - 0, - -finger_thickness / 2, - ], # Front left (at origin) - ] - ) - - # Base connecting fingers - flat rectangle behind fingers - base_points = np.array( - [ - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length - finger_thickness, 
- -finger_thickness / 2, - ], # Back left - [ - width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length - finger_thickness, - -finger_thickness / 2, - ], # Back right - [ - width / 2 + finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Front right - [ - -width / 2 - finger_width / 2, # type: ignore[operator] - -finger_length, - -finger_thickness / 2, - ], # Front left - ] - ) - - # Handle extending backward - thin rectangle - handle_points = np.array( - [ - [ - -finger_width / 2, - -finger_length - finger_thickness - handle_length, - -finger_thickness / 2, - ], # Back left - [ - finger_width / 2, - -finger_length - finger_thickness - handle_length, - -finger_thickness / 2, - ], # Back right - [ - finger_width / 2, - -finger_length - finger_thickness, - -finger_thickness / 2, - ], # Front right - [ - -finger_width / 2, - -finger_length - finger_thickness, - -finger_thickness / 2, - ], # Front left - ] - ) - - # Transform all points to world frame - def transform_points(points): # type: ignore[no-untyped-def] - # Apply rotation and translation - world_points = (rotation_matrix @ points.T).T + translation - return world_points - - left_finger_world = transform_points(left_finger_points) # type: ignore[no-untyped-call] - right_finger_world = transform_points(right_finger_points) # type: ignore[no-untyped-call] - base_world = transform_points(base_points) # type: ignore[no-untyped-call] - handle_world = transform_points(handle_points) # type: ignore[no-untyped-call] - - # Project to 2D - left_finger_2d = project_3d_points_to_2d(left_finger_world, camera_matrix) - right_finger_2d = project_3d_points_to_2d(right_finger_world, camera_matrix) - base_2d = project_3d_points_to_2d(base_world, camera_matrix) - handle_2d = project_3d_points_to_2d(handle_world, camera_matrix) - - # Draw left finger - pts = left_finger_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw right finger - pts = 
right_finger_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw base - pts = base_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw handle - pts = handle_2d.astype(np.int32) - cv2.polylines(result, [pts], True, color, thickness) - - # Draw grasp center (fingertips at origin) - center_2d = project_3d_points_to_2d(translation.reshape(1, -1), camera_matrix)[0] - cv2.circle(result, tuple(center_2d.astype(int)), 3, color, -1) - - except Exception: - # Skip this grasp if there's an error - continue - - return result - - -def get_standard_coordinate_transform(): # type: ignore[no-untyped-def] - """ - Get a standard coordinate transformation matrix for consistent visualization. - - This transformation ensures that: - - X (red) axis points right - - Y (green) axis points up - - Z (blue) axis points toward viewer - - Returns: - 4x4 transformation matrix - """ - # Standard transformation matrix to ensure consistent coordinate frame orientation - transform = np.array( - [ - [1, 0, 0, 0], # X points right - [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) - [0, 0, -1, 0], # Z points toward viewer (flip depth) - [0, 0, 0, 1], - ] - ) - return transform - - -def visualize_grasps_3d( - point_cloud: o3d.geometry.PointCloud, - grasp_list: list[dict], # type: ignore[type-arg] - max_grasps: int = -1, -) -> None: - """ - Visualize grasps in 3D with point cloud. 
- - Args: - point_cloud: Open3D point cloud - grasp_list: List of grasp dictionaries - max_grasps: Maximum number of grasps to visualize - """ - # Apply standard coordinate transformation - transform = get_standard_coordinate_transform() # type: ignore[no-untyped-call] - - # Transform point cloud - pc_copy = o3d.geometry.PointCloud(point_cloud) - pc_copy.transform(transform) - geometries = [pc_copy] - - # Transform gripper geometries - gripper_geometries = create_all_gripper_geometries(grasp_list, max_grasps) - for geom in gripper_geometries: - geom.transform(transform) - geometries.extend(gripper_geometries) - - # Add transformed coordinate frame - origin_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.1) - origin_frame.transform(transform) - geometries.append(origin_frame) - - o3d.visualization.draw_geometries(geometries, window_name="3D Grasp Visualization") - - -def parse_grasp_results(grasps: list[dict]) -> list[dict]: # type: ignore[type-arg] - """ - Parse grasp results into visualization format. 
- - Args: - grasps: List of grasp dictionaries - - Returns: - List of dictionaries containing: - - id: Unique grasp identifier - - score: Confidence score (float) - - width: Gripper opening width (float) - - translation: 3D position [x, y, z] - - rotation_matrix: 3x3 rotation matrix as nested list - """ - if not grasps: - return [] - - parsed_grasps = [] - - for i, grasp in enumerate(grasps): - # Extract data from each grasp - translation = grasp.get("translation", [0, 0, 0]) - rotation_matrix = np.array(grasp.get("rotation_matrix", np.eye(3))) - score = float(grasp.get("score", 0.0)) - width = float(grasp.get("width", 0.08)) - - parsed_grasp = { - "id": f"grasp_{i}", - "score": score, - "width": width, - "translation": translation, - "rotation_matrix": rotation_matrix.tolist(), - } - parsed_grasps.append(parsed_grasp) - - return parsed_grasps - - -def create_grasp_overlay( - rgb_image: np.ndarray, # type: ignore[type-arg] - grasps: list[dict], # type: ignore[type-arg] - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] -) -> np.ndarray: # type: ignore[type-arg] - """ - Create grasp visualization overlay on RGB image. - - Args: - rgb_image: RGB input image - grasps: List of grasp dictionaries in viz format - camera_intrinsics: Camera parameters - - Returns: - RGB image with grasp overlay - """ - try: - bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) - - result_bgr = draw_grasps_on_image( - bgr_image, - grasps, - camera_intrinsics, - max_grasps=-1, - ) - return cv2.cvtColor(result_bgr, cv2.COLOR_BGR2RGB) - except Exception: - return rgb_image.copy() diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py deleted file mode 100644 index 4d93e3ddd4..0000000000 --- a/dimos/perception/object_detection_stream.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import numpy as np -from reactivex import Observable, operators as ops - -from dimos.perception.detection2d.yolo_2d_det import ( # type: ignore[import-not-found, import-untyped] - Yolo2DDetector, -) - -try: - from dimos.perception.detection2d.detic_2d_det import ( # type: ignore[import-not-found, import-untyped] - Detic2DDetector, - ) - - DETIC_AVAILABLE = True -except (ModuleNotFoundError, ImportError): - DETIC_AVAILABLE = False - Detic2DDetector = None -from collections.abc import Callable -from typing import TYPE_CHECKING - -from dimos.models.depth.metric3d import Metric3D -from dimos.perception.common.utils import draw_object_detection_visualization -from dimos.perception.detection2d.utils import ( # type: ignore[attr-defined] - calculate_depth_from_bbox, - calculate_object_size_from_bbox, - calculate_position_rotation_from_bbox, -) -from dimos.types.vector import Vector -from dimos.utils.logging_config import setup_logger -from dimos.utils.transform_utils import transform_robot_to_map # type: ignore[attr-defined] - -if TYPE_CHECKING: - from dimos.types.manipulation import ObjectData - -# Initialize logger for the ObjectDetectionStream -logger = setup_logger() - - -class ObjectDetectionStream: - """ - A stream processor that: - 1. Detects objects using a Detector (Detic or Yolo) - 2. Estimates depth using Metric3D - 3. Calculates 3D position and dimensions using camera intrinsics - 4. 
Transforms coordinates to map frame - 5. Draws bounding boxes and segmentation masks on the frame - - Provides a stream of structured object data with position and rotation information. - """ - - def __init__( # type: ignore[no-untyped-def] - self, - camera_intrinsics=None, # [fx, fy, cx, cy] - device: str = "cuda", - gt_depth_scale: float = 1000.0, - min_confidence: float = 0.7, - class_filter=None, # Optional list of class names to filter (e.g., ["person", "car"]) - get_pose: Callable | None = None, # type: ignore[type-arg] # Optional function to transform coordinates to map frame - detector: Detic2DDetector | Yolo2DDetector | None = None, - video_stream: Observable = None, # type: ignore[assignment, type-arg] - disable_depth: bool = False, # Flag to disable monocular Metric3D depth estimation - draw_masks: bool = False, # Flag to enable drawing segmentation masks - ) -> None: - """ - Initialize the ObjectDetectionStream. - - Args: - camera_intrinsics: List [fx, fy, cx, cy] with camera parameters - device: Device to run inference on ("cuda" or "cpu") - gt_depth_scale: Ground truth depth scale for Metric3D - min_confidence: Minimum confidence for detections - class_filter: Optional list of class names to filter - get_pose: Optional function to transform pose to map coordinates - detector: Optional detector instance (Detic or Yolo) - video_stream: Observable of video frames to process (if provided, returns a stream immediately) - disable_depth: Flag to disable monocular Metric3D depth estimation - draw_masks: Flag to enable drawing segmentation masks - """ - self.min_confidence = min_confidence - self.class_filter = class_filter - self.get_pose = get_pose - self.disable_depth = disable_depth - self.draw_masks = draw_masks - # Initialize object detector - if detector is not None: - self.detector = detector - else: - if DETIC_AVAILABLE: - try: - self.detector = Detic2DDetector(vocabulary=None, threshold=min_confidence) - logger.info("Using Detic2DDetector") - except 
Exception as e: - logger.warning( - f"Failed to initialize Detic2DDetector: {e}. Falling back to Yolo2DDetector." - ) - self.detector = Yolo2DDetector() - else: - logger.info("Detic not available. Using Yolo2DDetector.") - self.detector = Yolo2DDetector() - # Set up camera intrinsics - self.camera_intrinsics = camera_intrinsics - - # Initialize depth estimation model - self.depth_model = None - if not disable_depth: - try: - self.depth_model = Metric3D(gt_depth_scale=gt_depth_scale) - - if camera_intrinsics is not None: - self.depth_model.update_intrinsic(camera_intrinsics) # type: ignore[no-untyped-call] - - # Create 3x3 camera matrix for calculations - fx, fy, cx, cy = camera_intrinsics - self.camera_matrix = np.array( - [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32 - ) - else: - raise ValueError("camera_intrinsics must be provided") - - logger.info("Depth estimation enabled with Metric3D") - except Exception as e: - logger.warning(f"Failed to initialize Metric3D depth model: {e}") - logger.warning("Falling back to disable_depth=True mode") - self.disable_depth = True - self.depth_model = None - else: - logger.info("Depth estimation disabled") - - # If video_stream is provided, create and store the stream immediately - self.stream = None - if video_stream is not None: - self.stream = self.create_stream(video_stream) - - def create_stream(self, video_stream: Observable) -> Observable: # type: ignore[type-arg] - """ - Create an Observable stream of object data from a video stream. 
- - Args: - video_stream: Observable that emits video frames - - Returns: - Observable that emits dictionaries containing object data - with position and rotation information - """ - - def process_frame(frame): # type: ignore[no-untyped-def] - # TODO: More modular detector output interface - bboxes, track_ids, class_ids, confidences, names, *mask_data = ( # type: ignore[misc] - *self.detector.process_image(frame), - [], - ) - - masks = ( - mask_data[0] # type: ignore[has-type] - if mask_data and len(mask_data[0]) == len(bboxes) # type: ignore[has-type] - else [None] * len(bboxes) # type: ignore[has-type] - ) - - # Create visualization - viz_frame = frame.copy() - - # Process detections - objects = [] - if not self.disable_depth: - depth_map = self.depth_model.infer_depth(frame) # type: ignore[union-attr] - depth_map = np.array(depth_map) - else: - depth_map = None - - for i, bbox in enumerate(bboxes): # type: ignore[has-type] - # Skip if confidence is too low - if i < len(confidences) and confidences[i] < self.min_confidence: # type: ignore[has-type] - continue - - # Skip if class filter is active and class not in filter - class_name = names[i] if i < len(names) else None # type: ignore[has-type] - if self.class_filter and class_name not in self.class_filter: - continue - - if not self.disable_depth and depth_map is not None: - # Get depth for this object - depth = calculate_depth_from_bbox(depth_map, bbox) # type: ignore[no-untyped-call] - if depth is None: - # Skip objects with invalid depth - continue - # Calculate object position and rotation - position, rotation = calculate_position_rotation_from_bbox( - bbox, depth, self.camera_intrinsics - ) - # Get object dimensions - width, height = calculate_object_size_from_bbox( - bbox, depth, self.camera_intrinsics - ) - - # Transform to map frame if a transform function is provided - try: - if self.get_pose: - # position and rotation are already Vector objects, no need to convert - robot_pose = self.get_pose() - 
position, rotation = transform_robot_to_map( - robot_pose["position"], robot_pose["rotation"], position, rotation - ) - except Exception as e: - logger.error(f"Error transforming to map frame: {e}") - position, rotation = position, rotation - - else: - depth = -1 - position = Vector(0, 0, 0) # type: ignore[arg-type] - rotation = Vector(0, 0, 0) # type: ignore[arg-type] - width = -1 - height = -1 - - # Create a properly typed ObjectData instance - object_data: ObjectData = { - "object_id": track_ids[i] if i < len(track_ids) else -1, # type: ignore[has-type] - "bbox": bbox, - "depth": depth, - "confidence": confidences[i] if i < len(confidences) else None, # type: ignore[has-type, typeddict-item] - "class_id": class_ids[i] if i < len(class_ids) else None, # type: ignore[has-type, typeddict-item] - "label": class_name, # type: ignore[typeddict-item] - "position": position, - "rotation": rotation, - "size": {"width": width, "height": height}, - "segmentation_mask": masks[i], - } - - objects.append(object_data) - - # Create visualization using common function - viz_frame = draw_object_detection_visualization( - viz_frame, objects, draw_masks=self.draw_masks, font_scale=1.5 - ) - - return {"frame": frame, "viz_frame": viz_frame, "objects": objects} - - self.stream = video_stream.pipe(ops.map(process_frame)) - - return self.stream - - def get_stream(self): # type: ignore[no-untyped-def] - """ - Returns the current detection stream if available. - Creates a new one with the provided video_stream if not already created. - - Returns: - Observable: The reactive stream of detection results - """ - if self.stream is None: - raise ValueError( - "Stream not initialized. Either provide a video_stream during initialization or call create_stream first." - ) - return self.stream - - def get_formatted_stream(self): # type: ignore[no-untyped-def] - """ - Returns a formatted stream of object detection data for better readability. 
- This is especially useful for LLMs like Claude that need structured text input. - - Returns: - Observable: A stream of formatted string representations of object data - """ - if self.stream is None: - raise ValueError( - "Stream not initialized. Either provide a video_stream during initialization or call create_stream first." - ) - - def format_detection_data(result): # type: ignore[no-untyped-def] - # Extract objects from result - objects = result.get("objects", []) - - if not objects: - return "No objects detected." - - formatted_data = "[DETECTED OBJECTS]\n" - try: - for i, obj in enumerate(objects): - pos = obj["position"] - rot = obj["rotation"] - size = obj["size"] - bbox = obj["bbox"] - - # Format each object with a multiline f-string for better readability - bbox_str = f"[{bbox[0]}, {bbox[1]}, {bbox[2]}, {bbox[3]}]" - formatted_data += ( - f"Object {i + 1}: {obj['label']}\n" - f" ID: {obj['object_id']}\n" - f" Confidence: {obj['confidence']:.2f}\n" - f" Position: x={pos.x:.2f}m, y={pos.y:.2f}m, z={pos.z:.2f}m\n" - f" Rotation: yaw={rot.z:.2f} rad\n" - f" Size: width={size['width']:.2f}m, height={size['height']:.2f}m\n" - f" Depth: {obj['depth']:.2f}m\n" - f" Bounding box: {bbox_str}\n" - "----------------------------------\n" - ) - except Exception as e: - logger.warning(f"Error formatting object {i}: {e}") - formatted_data += f"Object {i + 1}: [Error formatting data]" - formatted_data += "\n----------------------------------\n" - - return formatted_data - - # Return a new stream with the formatter applied - return self.stream.pipe(ops.map(format_detection_data)) - - def cleanup(self) -> None: - """Clean up resources.""" - pass diff --git a/dimos/perception/object_scene_registration.py b/dimos/perception/object_scene_registration.py new file mode 100644 index 0000000000..c21e31bf33 --- /dev/null +++ b/dimos/perception/object_scene_registration.py @@ -0,0 +1,247 @@ +# Copyright 2025-2026 Dimensional Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from typing import Any + +import numpy as np +from numpy.typing import NDArray + +from dimos.core import In, Out, rpc +from dimos.core.skill_module import SkillModule +from dimos.msgs.foxglove_msgs import ImageAnnotations +from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2 +from dimos.msgs.std_msgs import Header +from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray +from dimos.perception.detection.detectors.yoloe import Yoloe2DDetector, YoloePromptMode +from dimos.perception.detection.objectDB import ObjectDB +from dimos.perception.detection.type import ImageDetections2D +from dimos.perception.detection.type.detection3d.object import ( + Object, + aggregate_pointclouds, + to_detection3d_array, +) +from dimos.protocol.skill.skill import skill +from dimos.types.timestamped import align_timestamped +from dimos.utils.logging_config import setup_logger +from dimos.utils.reactive import backpressure + +logger = setup_logger() + + +class ObjectSceneRegistrationModule(SkillModule): + """Module for detecting objects in camera images using YOLO-E with 2D and 3D detection.""" + + color_image: In[Image] + depth_image: In[Image] + camera_info: In[CameraInfo] + + detections_2d: Out[Detection2DArray] + detections_3d: Out[Detection3DArray] + overlay: Out[ImageAnnotations] + pointcloud: Out[PointCloud2] + + _detector: Yoloe2DDetector | None = None + _camera_info: CameraInfo | None = None + 
_object_db: ObjectDB + + def __init__( + self, + target_frame: str = "map", + prompt_mode: YoloePromptMode = YoloePromptMode.LRPC, + ) -> None: + super().__init__() + self._target_frame = target_frame + self._prompt_mode = prompt_mode + self._object_db = ObjectDB() + + @rpc + def start(self) -> None: + super().start() + + if self._prompt_mode == YoloePromptMode.LRPC: + model_name = "yoloe-11l-seg-pf.pt" + else: + model_name = "yoloe-11l-seg.pt" + + self._detector = Yoloe2DDetector( + model_name=model_name, + prompt_mode=self._prompt_mode, + ) + + self.camera_info.subscribe(lambda msg: setattr(self, "_camera_info", msg)) + + aligned_frames = align_timestamped( + self.color_image.observable(), # type: ignore[no-untyped-call] + self.depth_image.observable(), # type: ignore[no-untyped-call] + buffer_size=2.0, + match_tolerance=0.1, + ) + backpressure(aligned_frames).subscribe(self._on_aligned_frames) + + @rpc + def stop(self) -> None: + """Stop the module and clean up resources.""" + + if self._detector: + self._detector.stop() + self._detector = None + + self._object_db.clear() + + logger.info("ObjectSceneRegistrationModule stopped") + super().stop() + + @rpc + def set_prompts( + self, + text: list[str] | None = None, + bboxes: NDArray[np.float64] | None = None, + ) -> None: + """Set prompts for detection. 
Provide either text or bboxes, not both.""" + if self._detector is not None: + self._detector.set_prompts(text=text, bboxes=bboxes) + + @rpc + def select_object(self, track_id: int) -> dict[str, Any] | None: + """Get object data by track_id and promote to permanent.""" + for obj in self._object_db.get_all_objects(): + if obj.track_id == track_id: + self._object_db.promote(obj.object_id) + return obj.to_dict() + return None + + @rpc + def get_object_track_ids(self) -> list[int]: + """Get track_ids of all permanent objects.""" + return [obj.track_id for obj in self._object_db.get_all_objects()] + + @skill() + def detect(self, *prompts: str) -> str: + """Detect objects matching the given text prompts. Returns track_ids after 2 seconds of detection. + + Do NOT call this tool multiple times for one query. Pass all objects in a single call. + For example, to detect a cup and mouse, call detect("cup", "mouse") not detect("cup") then detect("mouse"). + + Args: + prompts (str): Text descriptions of objects to detect (e.g., "person", "car", "dog") + + Returns: + str: A message containing the track_ids of detected objects + + Example: + detect("person", "car", "dog") + detect("person") + """ + if not prompts: + return "No prompts provided." + if self._detector is None: + return "Detector not initialized." + + self._detector.set_prompts(text=list(prompts)) + time.sleep(2.0) + + track_ids = self.get_object_track_ids() + if not track_ids: + return "No objects detected." + return f"Detected objects with track_ids: {track_ids}" + + @skill() + def select(self, track_id: int) -> str: + """Select an object by track_id and promote it to permanent. + + Example: + select(5) + """ + result = self.select_object(track_id) + if result is None: + return f"No object found with track_id {track_id}." 
+ return f"Selected object {track_id}: {result['name']}" + + def _on_aligned_frames(self, frames) -> None: # type: ignore[no-untyped-def] + color_msg, depth_msg = frames + self._process_images(color_msg, depth_msg) + + def _process_images(self, color_msg: Image, depth_msg: Image) -> None: + """Process synchronized color and depth images (runs in background thread).""" + if not self._detector or not self._camera_info: + return + + color_image = color_msg + depth_image = depth_msg.to_depth_meters() + + # Run 2D detection + detections_2d: ImageDetections2D[Any] = self._detector.process_image(color_image) + + detections_2d_msg = Detection2DArray( + detections_length=len(detections_2d.detections), + header=Header(color_image.ts, color_image.frame_id or ""), + detections=[det.to_ros_detection2d() for det in detections_2d.detections], + ) + self.detections_2d.publish(detections_2d_msg) + + overlay_annotations = detections_2d.to_foxglove_annotations() + self.overlay.publish(overlay_annotations) + + # Process 3D detections + self._process_3d_detections(detections_2d, color_image, depth_image) + + def _process_3d_detections( + self, + detections_2d: ImageDetections2D[Any], + color_image: Image, + depth_image: Image, + ) -> None: + """Convert 2D detections to 3D and publish.""" + if self._camera_info is None: + return + + # Look up transform from camera frame to target frame (e.g., map) + camera_transform = None + if self._target_frame != color_image.frame_id: + camera_transform = self.tf.get( + self._target_frame, + color_image.frame_id, + color_image.ts, + 0.1, + ) + if camera_transform is None: + logger.warning("Failed to lookup transform from camera frame to target frame") + return + + objects = Object.from_2d_to_list( + detections_2d=detections_2d, + color_image=color_image, + depth_image=depth_image, + camera_info=self._camera_info, + camera_transform=camera_transform, + ) + if not objects: + return + + # Add objects to spatial memory database + objects = 
self._object_db.add_objects(objects) + + detections_3d = to_detection3d_array(objects) + self.detections_3d.publish(detections_3d) + + objects_for_pc = self._object_db.get_objects() + aggregated_pc = aggregate_pointclouds(objects_for_pc) + self.pointcloud.publish(aggregated_pc) + return + + +object_scene_registration_module = ObjectSceneRegistrationModule.blueprint + +__all__ = ["ObjectSceneRegistrationModule", "object_scene_registration_module"] diff --git a/dimos/perception/object_tracker.py b/dimos/perception/object_tracker.py index 9260003ce2..54a5873435 100644 --- a/dimos/perception/object_tracker.py +++ b/dimos/perception/object_tracker.py @@ -28,7 +28,6 @@ from reactivex.disposable import Disposable from dimos.core import In, Module, ModuleConfig, Out, rpc -from dimos.manipulation.visual_servoing.utils import visualize_detections_3d from dimos.msgs.geometry_msgs import Pose, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import ( CameraInfo, @@ -540,10 +539,13 @@ def _process_tracking(self) -> None: y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2 bbox_2d = [[x1, y1, x2, y2]] - # Create visualization - viz_image = visualize_detections_3d( - frame, detections_3d, show_coordinates=True, bboxes_2d=bbox_2d - ) + # Use frame directly for visualization + viz_image = frame.copy() + + # Draw bounding boxes + for bbox in bbox_2d: + x1, y1, x2, y2 = map(int, bbox) + cv2.rectangle(viz_image, (x1, y1), (x2, y2), (0, 255, 0), 2) # Overlay REID feature matches if available if self.last_good_matches and self.last_roi_kps and self.last_roi_bbox: diff --git a/dimos/perception/object_tracker_3d.py b/dimos/perception/object_tracker_3d.py index 22846e1e2f..fa6361ac65 100644 --- a/dimos/perception/object_tracker_3d.py +++ b/dimos/perception/object_tracker_3d.py @@ -14,6 +14,7 @@ # Import LCM messages +import cv2 from dimos_lcm.sensor_msgs import CameraInfo from dimos_lcm.vision_msgs import ( Detection3D, @@ -22,7 +23,6 @@ import numpy as np from dimos.core 
import In, Out, rpc -from dimos.manipulation.visual_servoing.utils import visualize_detections_3d from dimos.msgs.geometry_msgs import Pose, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import Image, ImageFormat from dimos.msgs.std_msgs import Header @@ -145,10 +145,13 @@ def _process_tracking(self) -> None: y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2 bbox_2d = [[x1, y1, x2, y2]] - # Create 3D visualization - viz_image = visualize_detections_3d( - frame, detection_3d.detections, show_coordinates=True, bboxes_2d=bbox_2d - ) + # Use frame directly for visualization + viz_image = frame.copy() + + # Draw bounding boxes + for bbox in bbox_2d: + x1, y1, x2, y2 = map(int, bbox) + cv2.rectangle(viz_image, (x1, y1), (x2, y2), (0, 255, 0), 2) # Overlay Re-ID matches if self.last_good_matches and self.last_roi_kps and self.last_roi_bbox: diff --git a/dimos/perception/person_tracker.py b/dimos/perception/person_tracker.py deleted file mode 100644 index a138467850..0000000000 --- a/dimos/perception/person_tracker.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import cv2 -import numpy as np -from reactivex import Observable, interval, operators as ops -from reactivex.disposable import Disposable - -from dimos.core import In, Module, Out, rpc -from dimos.msgs.sensor_msgs import Image -from dimos.perception.common.ibvs import PersonDistanceEstimator -from dimos.perception.detection2d.utils import filter_detections -from dimos.perception.detection2d.yolo_2d_det import ( # type: ignore[import-not-found, import-untyped] - Yolo2DDetector, -) -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class PersonTrackingStream(Module): - """Module for person tracking with LCM input/output.""" - - # LCM inputs - video: In[Image] - - # LCM outputs - tracking_data: Out[dict] # type: ignore[type-arg] - - def __init__( # type: ignore[no-untyped-def] - self, - camera_intrinsics=None, - camera_pitch: float = 0.0, - camera_height: float = 1.0, - ) -> None: - """ - Initialize a person tracking stream using Yolo2DDetector and PersonDistanceEstimator. 
- - Args: - camera_intrinsics: List in format [fx, fy, cx, cy] where: - - fx: Focal length in x direction (pixels) - - fy: Focal length in y direction (pixels) - - cx: Principal point x-coordinate (pixels) - - cy: Principal point y-coordinate (pixels) - camera_pitch: Camera pitch angle in radians (positive is up) - camera_height: Height of the camera from the ground in meters - """ - # Call parent Module init - super().__init__() - - self.camera_intrinsics = camera_intrinsics - self.camera_pitch = camera_pitch - self.camera_height = camera_height - - self.detector = Yolo2DDetector() - - # Initialize distance estimator - if camera_intrinsics is None: - raise ValueError("Camera intrinsics are required for distance estimation") - - # Validate camera intrinsics format [fx, fy, cx, cy] - if ( - not isinstance(camera_intrinsics, list | tuple | np.ndarray) - or len(camera_intrinsics) != 4 - ): - raise ValueError("Camera intrinsics must be provided as [fx, fy, cx, cy]") - - # Convert [fx, fy, cx, cy] to 3x3 camera matrix - fx, fy, cx, cy = camera_intrinsics - K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) - - self.distance_estimator = PersonDistanceEstimator( - K=K, camera_pitch=camera_pitch, camera_height=camera_height - ) - - # For tracking latest frame data - self._latest_frame: np.ndarray | None = None # type: ignore[type-arg] - self._process_interval = 0.1 # Process at 10Hz - - # Tracking state - starts disabled - self._tracking_enabled = False - - @rpc - def start(self) -> None: - """Start the person tracking module and subscribe to LCM streams.""" - - super().start() - - # Subscribe to video stream - def set_video(image_msg: Image) -> None: - if hasattr(image_msg, "data"): - self._latest_frame = image_msg.data - else: - logger.warning("Received image message without data attribute") - - unsub = self.video.subscribe(set_video) - self._disposables.add(Disposable(unsub)) - - # Start periodic processing - unsub = 
interval(self._process_interval).subscribe(lambda _: self._process_frame()) # type: ignore[assignment] - self._disposables.add(unsub) # type: ignore[arg-type] - - logger.info("PersonTracking module started and subscribed to LCM streams") - - @rpc - def stop(self) -> None: - super().stop() - - def _process_frame(self) -> None: - """Process the latest frame if available.""" - if self._latest_frame is None: - return - - # Only process and publish if tracking is enabled - if not self._tracking_enabled: - return - - # Process frame through tracking pipeline - result = self._process_tracking(self._latest_frame) # type: ignore[no-untyped-call] - - # Publish result to LCM - if result: - self.tracking_data.publish(result) - - def _process_tracking(self, frame): # type: ignore[no-untyped-def] - """Process a single frame for person tracking.""" - # Detect people in the frame - bboxes, track_ids, class_ids, confidences, names = self.detector.process_image(frame) - - # Filter to keep only person detections using filter_detections - ( - filtered_bboxes, - filtered_track_ids, - filtered_class_ids, - filtered_confidences, - filtered_names, - ) = filter_detections( - bboxes, - track_ids, - class_ids, - confidences, - names, - class_filter=[0], # 0 is the class_id for person - name_filter=["person"], - ) - - # Create visualization - viz_frame = self.detector.visualize_results( - frame, - filtered_bboxes, - filtered_track_ids, - filtered_class_ids, - filtered_confidences, - filtered_names, - ) - - # Calculate distance and angle for each person - targets = [] - for i, bbox in enumerate(filtered_bboxes): - target_data = { - "target_id": filtered_track_ids[i] if i < len(filtered_track_ids) else -1, - "bbox": bbox, - "confidence": filtered_confidences[i] if i < len(filtered_confidences) else None, - } - - distance, angle = self.distance_estimator.estimate_distance_angle(bbox) - target_data["distance"] = distance - target_data["angle"] = angle - - # Add text to visualization - _x1, y1, 
x2, _y2 = map(int, bbox) - dist_text = f"{distance:.2f}m, {np.rad2deg(angle):.1f} deg" - - # Add black background for better visibility - text_size = cv2.getTextSize(dist_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0] - # Position at top-right corner - cv2.rectangle( - viz_frame, (x2 - text_size[0], y1 - text_size[1] - 5), (x2, y1), (0, 0, 0), -1 - ) - - # Draw text in white at top-right - cv2.putText( - viz_frame, - dist_text, - (x2 - text_size[0], y1 - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 255, 255), - 2, - ) - - targets.append(target_data) - - # Create the result dictionary - return {"frame": frame, "viz_frame": viz_frame, "targets": targets} - - @rpc - def enable_tracking(self) -> bool: - """Enable person tracking. - - Returns: - bool: True if tracking was enabled successfully - """ - self._tracking_enabled = True - logger.info("Person tracking enabled") - return True - - @rpc - def disable_tracking(self) -> bool: - """Disable person tracking. - - Returns: - bool: True if tracking was disabled successfully - """ - self._tracking_enabled = False - logger.info("Person tracking disabled") - return True - - @rpc - def is_tracking_enabled(self) -> bool: - """Check if tracking is currently enabled. - - Returns: - bool: True if tracking is enabled - """ - return self._tracking_enabled - - @rpc - def get_tracking_data(self) -> dict: # type: ignore[type-arg] - """Get the latest tracking data. - - Returns: - Dictionary containing tracking results - """ - if self._latest_frame is not None: - return self._process_tracking(self._latest_frame) # type: ignore[no-any-return, no-untyped-call] - return {"frame": None, "viz_frame": None, "targets": []} - - def create_stream(self, video_stream: Observable) -> Observable: # type: ignore[type-arg] - """ - Create an Observable stream of person tracking results from a video stream. 
- - Args: - video_stream: Observable that emits video frames - - Returns: - Observable that emits dictionaries containing tracking results and visualizations - """ - - return video_stream.pipe(ops.map(self._process_tracking)) diff --git a/dimos/perception/pointcloud/__init__.py b/dimos/perception/pointcloud/__init__.py deleted file mode 100644 index a380e2aadf..0000000000 --- a/dimos/perception/pointcloud/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .cuboid_fit import * -from .pointcloud_filtering import * -from .utils import * diff --git a/dimos/perception/pointcloud/cuboid_fit.py b/dimos/perception/pointcloud/cuboid_fit.py deleted file mode 100644 index dfec2d9297..0000000000 --- a/dimos/perception/pointcloud/cuboid_fit.py +++ /dev/null @@ -1,420 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - - -def fit_cuboid( - points: np.ndarray | o3d.geometry.PointCloud, # type: ignore[type-arg] - method: str = "minimal", -) -> dict | None: # type: ignore[type-arg] - """ - Fit a cuboid to a point cloud using Open3D's built-in methods. 
- - Args: - points: Nx3 array of points or Open3D PointCloud - method: Fitting method: - - 'minimal': Minimal oriented bounding box (best fit) - - 'oriented': PCA-based oriented bounding box - - 'axis_aligned': Axis-aligned bounding box - - Returns: - Dictionary containing: - - center: 3D center point - - dimensions: 3D dimensions (extent) - - rotation: 3x3 rotation matrix - - error: Fitting error - - bounding_box: Open3D OrientedBoundingBox object - Returns None if insufficient points or fitting fails. - - Raises: - ValueError: If method is invalid or inputs are malformed - """ - # Validate method - valid_methods = ["minimal", "oriented", "axis_aligned"] - if method not in valid_methods: - raise ValueError(f"method must be one of {valid_methods}, got '{method}'") - - # Convert to point cloud if needed - if isinstance(points, np.ndarray): - points = np.asarray(points) - if len(points.shape) != 2 or points.shape[1] != 3: - raise ValueError(f"points array must be Nx3, got shape {points.shape}") - if len(points) < 4: - return None - - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points) - elif isinstance(points, o3d.geometry.PointCloud): - pcd = points - points = np.asarray(pcd.points) - if len(points) < 4: - return None - else: - raise ValueError(f"points must be numpy array or Open3D PointCloud, got {type(points)}") - - try: - # Get bounding box based on method - if method == "minimal": - obb = pcd.get_minimal_oriented_bounding_box(robust=True) - elif method == "oriented": - obb = pcd.get_oriented_bounding_box(robust=True) - elif method == "axis_aligned": - # Convert axis-aligned to oriented format for consistency - aabb = pcd.get_axis_aligned_bounding_box() - obb = o3d.geometry.OrientedBoundingBox() - obb.center = aabb.get_center() - obb.extent = aabb.get_extent() - obb.R = np.eye(3) # Identity rotation for axis-aligned - - # Extract parameters - center = np.asarray(obb.center) - dimensions = np.asarray(obb.extent) - rotation = 
np.asarray(obb.R) - - # Calculate fitting error - error = _compute_fitting_error(points, center, dimensions, rotation) - - return { - "center": center, - "dimensions": dimensions, - "rotation": rotation, - "error": error, - "bounding_box": obb, - "method": method, - } - - except Exception as e: - # Log error but don't crash - return None for graceful handling - print(f"Warning: Cuboid fitting failed with method '{method}': {e}") - return None - - -def fit_cuboid_simple(points: np.ndarray | o3d.geometry.PointCloud) -> dict | None: # type: ignore[type-arg] - """ - Simple wrapper for minimal oriented bounding box fitting. - - Args: - points: Nx3 array of points or Open3D PointCloud - - Returns: - Dictionary with center, dimensions, rotation, and bounding_box, - or None if insufficient points - """ - return fit_cuboid(points, method="minimal") - - -def _compute_fitting_error( - points: np.ndarray, # type: ignore[type-arg] - center: np.ndarray, # type: ignore[type-arg] - dimensions: np.ndarray, # type: ignore[type-arg] - rotation: np.ndarray, # type: ignore[type-arg] -) -> float: - """ - Compute fitting error as mean squared distance from points to cuboid surface. 
- - Args: - points: Nx3 array of points - center: 3D center point - dimensions: 3D dimensions - rotation: 3x3 rotation matrix - - Returns: - Mean squared error - """ - if len(points) == 0: - return 0.0 - - # Transform points to local coordinates - local_points = (points - center) @ rotation - half_dims = dimensions / 2 - - # Calculate distance to cuboid surface - dx = np.abs(local_points[:, 0]) - half_dims[0] - dy = np.abs(local_points[:, 1]) - half_dims[1] - dz = np.abs(local_points[:, 2]) - half_dims[2] - - # Points outside: distance to nearest face - # Points inside: negative distance to nearest face - outside_dist = np.sqrt(np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2) - inside_dist = np.minimum(np.minimum(dx, dy), dz) - distances = np.where((dx > 0) | (dy > 0) | (dz > 0), outside_dist, -inside_dist) - - return float(np.mean(distances**2)) - - -def get_cuboid_corners( - center: np.ndarray, # type: ignore[type-arg] - dimensions: np.ndarray, # type: ignore[type-arg] - rotation: np.ndarray, # type: ignore[type-arg] -) -> np.ndarray: # type: ignore[type-arg] - """ - Get the 8 corners of a cuboid. 
- - Args: - center: 3D center point - dimensions: 3D dimensions - rotation: 3x3 rotation matrix - - Returns: - 8x3 array of corner coordinates - """ - half_dims = dimensions / 2 - corners_local = ( - np.array( - [ - [-1, -1, -1], # 0: left bottom back - [-1, -1, 1], # 1: left bottom front - [-1, 1, -1], # 2: left top back - [-1, 1, 1], # 3: left top front - [1, -1, -1], # 4: right bottom back - [1, -1, 1], # 5: right bottom front - [1, 1, -1], # 6: right top back - [1, 1, 1], # 7: right top front - ] - ) - * half_dims - ) - - # Apply rotation and translation - return corners_local @ rotation.T + center # type: ignore[no-any-return] - - -def visualize_cuboid_on_image( - image: np.ndarray, # type: ignore[type-arg] - cuboid_params: dict, # type: ignore[type-arg] - camera_matrix: np.ndarray, # type: ignore[type-arg] - extrinsic_rotation: np.ndarray | None = None, # type: ignore[type-arg] - extrinsic_translation: np.ndarray | None = None, # type: ignore[type-arg] - color: tuple[int, int, int] = (0, 255, 0), - thickness: int = 2, - show_dimensions: bool = True, -) -> np.ndarray: # type: ignore[type-arg] - """ - Draw a fitted cuboid on an image using camera projection. 
- - Args: - image: Input image to draw on - cuboid_params: Dictionary containing cuboid parameters - camera_matrix: Camera intrinsic matrix (3x3) - extrinsic_rotation: Optional external rotation (3x3) - extrinsic_translation: Optional external translation (3x1) - color: Line color as (B, G, R) tuple - thickness: Line thickness - show_dimensions: Whether to display dimension text - - Returns: - Image with cuboid visualization - - Raises: - ValueError: If required parameters are missing or invalid - """ - # Validate inputs - required_keys = ["center", "dimensions", "rotation"] - if not all(key in cuboid_params for key in required_keys): - raise ValueError(f"cuboid_params must contain keys: {required_keys}") - - if camera_matrix.shape != (3, 3): - raise ValueError(f"camera_matrix must be 3x3, got {camera_matrix.shape}") - - # Get corners in world coordinates - corners = get_cuboid_corners( - cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] - ) - - # Transform corners if extrinsic parameters are provided - if extrinsic_rotation is not None and extrinsic_translation is not None: - if extrinsic_rotation.shape != (3, 3): - raise ValueError(f"extrinsic_rotation must be 3x3, got {extrinsic_rotation.shape}") - if extrinsic_translation.shape not in [(3,), (3, 1)]: - raise ValueError( - f"extrinsic_translation must be (3,) or (3,1), got {extrinsic_translation.shape}" - ) - - extrinsic_translation = extrinsic_translation.flatten() - corners = (extrinsic_rotation @ corners.T).T + extrinsic_translation - - try: - # Project 3D corners to image coordinates - corners_img, _ = cv2.projectPoints( # type: ignore[call-overload] - corners.astype(np.float32), - np.zeros(3), - np.zeros(3), # No additional rotation/translation - camera_matrix.astype(np.float32), - None, # No distortion - ) - corners_img = corners_img.reshape(-1, 2).astype(int) - - # Check if corners are within image bounds - h, w = image.shape[:2] - valid_corners = ( - (corners_img[:, 0] >= 0) 
- & (corners_img[:, 0] < w) - & (corners_img[:, 1] >= 0) - & (corners_img[:, 1] < h) - ) - - if not np.any(valid_corners): - print("Warning: All cuboid corners are outside image bounds") - return image.copy() - - except Exception as e: - print(f"Warning: Failed to project cuboid corners: {e}") - return image.copy() - - # Define edges for wireframe visualization - edges = [ - # Bottom face - (0, 1), - (1, 5), - (5, 4), - (4, 0), - # Top face - (2, 3), - (3, 7), - (7, 6), - (6, 2), - # Vertical edges - (0, 2), - (1, 3), - (5, 7), - (4, 6), - ] - - # Draw edges - vis_img = image.copy() - for i, j in edges: - # Only draw edge if both corners are valid - if valid_corners[i] and valid_corners[j]: - cv2.line(vis_img, tuple(corners_img[i]), tuple(corners_img[j]), color, thickness) - - # Add dimension text if requested - if show_dimensions and np.any(valid_corners): - dims = cuboid_params["dimensions"] - dim_text = f"Dims: {dims[0]:.3f} x {dims[1]:.3f} x {dims[2]:.3f}" - - # Find a good position for text (top-left of image) - text_pos = (10, 30) - font_scale = 0.7 - - # Add background rectangle for better readability - text_size = cv2.getTextSize(dim_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 2)[0] - cv2.rectangle( - vis_img, - (text_pos[0] - 5, text_pos[1] - text_size[1] - 5), - (text_pos[0] + text_size[0] + 5, text_pos[1] + 5), - (0, 0, 0), - -1, - ) - - cv2.putText(vis_img, dim_text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, 2) - - return vis_img - - -def compute_cuboid_volume(cuboid_params: dict) -> float: # type: ignore[type-arg] - """ - Compute the volume of a cuboid. 
- - Args: - cuboid_params: Dictionary containing cuboid parameters - - Returns: - Volume in cubic units - """ - if "dimensions" not in cuboid_params: - raise ValueError("cuboid_params must contain 'dimensions' key") - - dims = cuboid_params["dimensions"] - return float(np.prod(dims)) - - -def compute_cuboid_surface_area(cuboid_params: dict) -> float: # type: ignore[type-arg] - """ - Compute the surface area of a cuboid. - - Args: - cuboid_params: Dictionary containing cuboid parameters - - Returns: - Surface area in square units - """ - if "dimensions" not in cuboid_params: - raise ValueError("cuboid_params must contain 'dimensions' key") - - dims = cuboid_params["dimensions"] - return 2.0 * (dims[0] * dims[1] + dims[1] * dims[2] + dims[2] * dims[0]) # type: ignore[no-any-return] - - -def check_cuboid_quality(cuboid_params: dict, points: np.ndarray) -> dict: # type: ignore[type-arg] - """ - Assess the quality of a cuboid fit. - - Args: - cuboid_params: Dictionary containing cuboid parameters - points: Original points used for fitting - - Returns: - Dictionary with quality metrics - """ - if len(points) == 0: - return {"error": "No points provided"} - - # Basic metrics - volume = compute_cuboid_volume(cuboid_params) - surface_area = compute_cuboid_surface_area(cuboid_params) - error = cuboid_params.get("error", 0.0) - - # Aspect ratio analysis - dims = cuboid_params["dimensions"] - aspect_ratios = [ - dims[0] / dims[1] if dims[1] > 0 else float("inf"), - dims[1] / dims[2] if dims[2] > 0 else float("inf"), - dims[2] / dims[0] if dims[0] > 0 else float("inf"), - ] - max_aspect_ratio = max(aspect_ratios) - - # Volume ratio (cuboid volume vs convex hull volume) - try: - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points) - hull, _ = pcd.compute_convex_hull() - hull_volume = hull.get_volume() - volume_ratio = volume / hull_volume if hull_volume > 0 else float("inf") - except: - volume_ratio = None - - return { - "fitting_error": error, - 
"volume": volume, - "surface_area": surface_area, - "max_aspect_ratio": max_aspect_ratio, - "volume_ratio": volume_ratio, - "num_points": len(points), - "method": cuboid_params.get("method", "unknown"), - } - - -# Backward compatibility -def visualize_fit(image, cuboid_params, camera_matrix, R=None, t=None): # type: ignore[no-untyped-def] - """ - Legacy function for backward compatibility. - Use visualize_cuboid_on_image instead. - """ - return visualize_cuboid_on_image( - image, cuboid_params, camera_matrix, R, t, show_dimensions=True - ) diff --git a/dimos/perception/pointcloud/pointcloud_filtering.py b/dimos/perception/pointcloud/pointcloud_filtering.py deleted file mode 100644 index d6aa2b835f..0000000000 --- a/dimos/perception/pointcloud/pointcloud_filtering.py +++ /dev/null @@ -1,370 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] -import torch - -from dimos.perception.pointcloud.cuboid_fit import fit_cuboid -from dimos.perception.pointcloud.utils import ( - create_point_cloud_and_extract_masks, - load_camera_matrix_from_yaml, -) -from dimos.types.manipulation import ObjectData -from dimos.types.vector import Vector - - -class PointcloudFiltering: - """ - A production-ready point cloud filtering pipeline for segmented objects. 
- - This class takes segmentation results and produces clean, filtered point clouds - for each object with consistent coloring and optional outlier removal. - """ - - def __init__( - self, - color_intrinsics: str | list[float] | np.ndarray | None = None, # type: ignore[type-arg] - depth_intrinsics: str | list[float] | np.ndarray | None = None, # type: ignore[type-arg] - color_weight: float = 0.3, - enable_statistical_filtering: bool = True, - statistical_neighbors: int = 20, - statistical_std_ratio: float = 1.5, - enable_radius_filtering: bool = True, - radius_filtering_radius: float = 0.015, - radius_filtering_min_neighbors: int = 25, - enable_subsampling: bool = True, - voxel_size: float = 0.005, - max_num_objects: int = 10, - min_points_for_cuboid: int = 10, - cuboid_method: str = "oriented", - max_bbox_size_percent: float = 30.0, - ) -> None: - """ - Initialize the point cloud filtering pipeline. - - Args: - color_intrinsics: Camera intrinsics for color image - depth_intrinsics: Camera intrinsics for depth image - color_weight: Weight for blending generated color with original (0.0-1.0) - enable_statistical_filtering: Enable/disable statistical outlier filtering - statistical_neighbors: Number of neighbors for statistical filtering - statistical_std_ratio: Standard deviation ratio for statistical filtering - enable_radius_filtering: Enable/disable radius outlier filtering - radius_filtering_radius: Search radius for radius filtering (meters) - radius_filtering_min_neighbors: Min neighbors within radius - enable_subsampling: Enable/disable point cloud subsampling - voxel_size: Voxel size for downsampling (meters, when subsampling enabled) - max_num_objects: Maximum number of objects to process (top N by confidence) - min_points_for_cuboid: Minimum points required for cuboid fitting - cuboid_method: Method for cuboid fitting ('minimal', 'oriented', 'axis_aligned') - max_bbox_size_percent: Maximum percentage of image size for object bboxes (0-100) - - Raises: - 
ValueError: If invalid parameters are provided - """ - # Validate parameters - if not 0.0 <= color_weight <= 1.0: - raise ValueError(f"color_weight must be between 0.0 and 1.0, got {color_weight}") - if not 0.0 <= max_bbox_size_percent <= 100.0: - raise ValueError( - f"max_bbox_size_percent must be between 0.0 and 100.0, got {max_bbox_size_percent}" - ) - - # Store settings - self.color_weight = color_weight - self.enable_statistical_filtering = enable_statistical_filtering - self.statistical_neighbors = statistical_neighbors - self.statistical_std_ratio = statistical_std_ratio - self.enable_radius_filtering = enable_radius_filtering - self.radius_filtering_radius = radius_filtering_radius - self.radius_filtering_min_neighbors = radius_filtering_min_neighbors - self.enable_subsampling = enable_subsampling - self.voxel_size = voxel_size - self.max_num_objects = max_num_objects - self.min_points_for_cuboid = min_points_for_cuboid - self.cuboid_method = cuboid_method - self.max_bbox_size_percent = max_bbox_size_percent - - # Load camera matrices - self.color_camera_matrix = load_camera_matrix_from_yaml(color_intrinsics) - self.depth_camera_matrix = load_camera_matrix_from_yaml(depth_intrinsics) - - # Store the full point cloud - self.full_pcd = None - - def generate_color_from_id(self, object_id: int) -> np.ndarray: # type: ignore[type-arg] - """Generate a consistent color for a given object ID.""" - np.random.seed(object_id) - color = np.random.randint(0, 255, 3, dtype=np.uint8) - np.random.seed(None) - return color - - def _validate_inputs( # type: ignore[no-untyped-def] - self, - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - objects: list[ObjectData], - ): - """Validate input parameters.""" - if color_img.shape[:2] != depth_img.shape: - raise ValueError("Color and depth image dimensions don't match") - - def _prepare_masks(self, masks: list[np.ndarray], target_shape: tuple) -> list[np.ndarray]: # type: 
ignore[type-arg] - """Prepare and validate masks to match target shape.""" - processed_masks = [] - for mask in masks: - # Convert mask to numpy if it's a tensor - if hasattr(mask, "cpu"): - mask = mask.cpu().numpy() - - mask = mask.astype(bool) - - # Handle shape mismatches - if mask.shape != target_shape: - if len(mask.shape) > 2: - mask = mask[:, :, 0] - - if mask.shape != target_shape: - mask = cv2.resize( - mask.astype(np.uint8), - (target_shape[1], target_shape[0]), - interpolation=cv2.INTER_NEAREST, - ).astype(bool) - - processed_masks.append(mask) - - return processed_masks - - def _apply_color_mask( - self, - pcd: o3d.geometry.PointCloud, - rgb_color: np.ndarray, # type: ignore[type-arg] - ) -> o3d.geometry.PointCloud: - """Apply weighted color mask to point cloud.""" - if len(np.asarray(pcd.colors)) > 0: - original_colors = np.asarray(pcd.colors) - generated_color = rgb_color.astype(np.float32) / 255.0 - colored_mask = ( - 1.0 - self.color_weight - ) * original_colors + self.color_weight * generated_color - colored_mask = np.clip(colored_mask, 0.0, 1.0) - pcd.colors = o3d.utility.Vector3dVector(colored_mask) - return pcd - - def _apply_filtering(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: - """Apply optional filtering to point cloud based on enabled flags.""" - current_pcd = pcd - - # Apply statistical filtering if enabled - if self.enable_statistical_filtering: - current_pcd, _ = current_pcd.remove_statistical_outlier( - nb_neighbors=self.statistical_neighbors, std_ratio=self.statistical_std_ratio - ) - - # Apply radius filtering if enabled - if self.enable_radius_filtering: - current_pcd, _ = current_pcd.remove_radius_outlier( - nb_points=self.radius_filtering_min_neighbors, radius=self.radius_filtering_radius - ) - - return current_pcd - - def _apply_subsampling(self, pcd: o3d.geometry.PointCloud) -> o3d.geometry.PointCloud: - """Apply subsampling to limit point cloud size using Open3D's voxel downsampling.""" - if 
self.enable_subsampling: - return pcd.voxel_down_sample(self.voxel_size) - return pcd - - def _extract_masks_from_objects(self, objects: list[ObjectData]) -> list[np.ndarray]: # type: ignore[type-arg] - """Extract segmentation masks from ObjectData objects.""" - return [obj["segmentation_mask"] for obj in objects] - - def get_full_point_cloud(self) -> o3d.geometry.PointCloud: - """Get the full point cloud.""" - return self._apply_subsampling(self.full_pcd) - - def process_images( - self, - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - objects: list[ObjectData], - ) -> list[ObjectData]: - """ - Process color and depth images with object detection results to create filtered point clouds. - - Args: - color_img: RGB image as numpy array (H, W, 3) - depth_img: Depth image as numpy array (H, W) in meters - objects: List of ObjectData from object detection stream - - Returns: - List of updated ObjectData with pointcloud and 3D information. Each ObjectData - dictionary is enhanced with the following new fields: - - **3D Spatial Information** (added when sufficient points for cuboid fitting): - - "position": Vector(x, y, z) - 3D center position in world coordinates (meters) - - "rotation": Vector(roll, pitch, yaw) - 3D orientation as Euler angles (radians) - - "size": {"width": float, "height": float, "depth": float} - 3D bounding box dimensions (meters) - - **Point Cloud Data**: - - "point_cloud": o3d.geometry.PointCloud - Filtered Open3D point cloud with colors - - "color": np.ndarray - Consistent RGB color [R,G,B] (0-255) generated from object_id - - **Grasp Generation Arrays** (Dimensional grasp format): - - "point_cloud_numpy": np.ndarray - Nx3 XYZ coordinates as float32 (meters) - - "colors_numpy": np.ndarray - Nx3 RGB colors as float32 (0.0-1.0 range) - - Raises: - ValueError: If inputs are invalid - RuntimeError: If processing fails - """ - # Validate inputs - self._validate_inputs(color_img, depth_img, objects) 
- - if not objects: - return [] - - # Filter to top N objects by confidence - if len(objects) > self.max_num_objects: - # Sort objects by confidence (highest first), handle None confidences - sorted_objects = sorted( - objects, - key=lambda obj: obj.get("confidence", 0.0) - if obj.get("confidence") is not None - else 0.0, - reverse=True, - ) - objects = sorted_objects[: self.max_num_objects] - - # Filter out objects with bboxes too large - image_area = color_img.shape[0] * color_img.shape[1] - max_bbox_area = image_area * (self.max_bbox_size_percent / 100.0) - - filtered_objects = [] - for obj in objects: - if "bbox" in obj and obj["bbox"] is not None: - bbox = obj["bbox"] - # Calculate bbox area (assuming bbox format [x1, y1, x2, y2]) - bbox_area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - if bbox_area <= max_bbox_area: - filtered_objects.append(obj) - else: - filtered_objects.append(obj) - - objects = filtered_objects - - # Extract masks from ObjectData - masks = self._extract_masks_from_objects(objects) - - # Prepare masks - processed_masks = self._prepare_masks(masks, depth_img.shape) - - # Create point clouds efficiently - self.full_pcd, masked_pcds = create_point_cloud_and_extract_masks( - color_img, - depth_img, - processed_masks, - self.depth_camera_matrix, # type: ignore[arg-type] - depth_scale=1.0, - ) - - # Process each object and update ObjectData - updated_objects = [] - - for i, (obj, _mask, pcd) in enumerate( - zip(objects, processed_masks, masked_pcds, strict=False) - ): - # Skip empty point clouds - if len(np.asarray(pcd.points)) == 0: - continue - - # Create a copy of the object data to avoid modifying the original - updated_obj = obj.copy() - - # Generate consistent color - object_id = obj.get("object_id", i) - rgb_color = self.generate_color_from_id(object_id) - - # Apply color mask - pcd = self._apply_color_mask(pcd, rgb_color) - - # Apply subsampling to control point cloud size - pcd = self._apply_subsampling(pcd) - - # Apply filtering 
(optional based on flags) - pcd_filtered = self._apply_filtering(pcd) - - # Fit cuboid and extract 3D information - points = np.asarray(pcd_filtered.points) - if len(points) >= self.min_points_for_cuboid: - cuboid_params = fit_cuboid(points, method=self.cuboid_method) - if cuboid_params is not None: - # Update position, rotation, and size from cuboid - center = cuboid_params["center"] - dimensions = cuboid_params["dimensions"] - rotation_matrix = cuboid_params["rotation"] - - # Convert rotation matrix to euler angles (roll, pitch, yaw) - sy = np.sqrt( - rotation_matrix[0, 0] * rotation_matrix[0, 0] - + rotation_matrix[1, 0] * rotation_matrix[1, 0] - ) - singular = sy < 1e-6 - - if not singular: - roll = np.arctan2(rotation_matrix[2, 1], rotation_matrix[2, 2]) - pitch = np.arctan2(-rotation_matrix[2, 0], sy) - yaw = np.arctan2(rotation_matrix[1, 0], rotation_matrix[0, 0]) - else: - roll = np.arctan2(-rotation_matrix[1, 2], rotation_matrix[1, 1]) - pitch = np.arctan2(-rotation_matrix[2, 0], sy) - yaw = 0 - - # Update position, rotation, and size from cuboid - updated_obj["position"] = Vector(center[0], center[1], center[2]) - updated_obj["rotation"] = Vector(roll, pitch, yaw) - updated_obj["size"] = { - "width": float(dimensions[0]), - "height": float(dimensions[1]), - "depth": float(dimensions[2]), - } - - # Add point cloud data to ObjectData - updated_obj["point_cloud"] = pcd_filtered - updated_obj["color"] = rgb_color - - # Extract numpy arrays for grasp generation - points_array = np.asarray(pcd_filtered.points).astype(np.float32) # Nx3 XYZ coordinates - if pcd_filtered.has_colors(): - colors_array = np.asarray(pcd_filtered.colors).astype( - np.float32 - ) # Nx3 RGB (0-1 range) - else: - # If no colors, create array of zeros - colors_array = np.zeros((len(points_array), 3), dtype=np.float32) - - updated_obj["point_cloud_numpy"] = points_array - updated_obj["colors_numpy"] = colors_array # type: ignore[typeddict-unknown-key] - - updated_objects.append(updated_obj) 
- - return updated_objects - - def cleanup(self) -> None: - """Clean up resources.""" - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/dimos/perception/pointcloud/test_pointcloud_filtering.py b/dimos/perception/pointcloud/test_pointcloud_filtering.py deleted file mode 100644 index 4ac7e5cb2d..0000000000 --- a/dimos/perception/pointcloud/test_pointcloud_filtering.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import TYPE_CHECKING - -import cv2 -import numpy as np -import open3d as o3d -import pytest - -from dimos.perception.pointcloud.pointcloud_filtering import PointcloudFiltering -from dimos.perception.pointcloud.utils import load_camera_matrix_from_yaml - -if TYPE_CHECKING: - from dimos.types.manipulation import ObjectData - - -class TestPointcloudFiltering: - def test_pointcloud_filtering_initialization(self) -> None: - """Test PointcloudFiltering initializes correctly with default parameters.""" - try: - filtering = PointcloudFiltering() - assert filtering is not None - assert filtering.color_weight == 0.3 - assert filtering.enable_statistical_filtering - assert filtering.enable_radius_filtering - assert filtering.enable_subsampling - except Exception as e: - pytest.skip(f"Skipping test due to initialization error: {e}") - - def test_pointcloud_filtering_with_custom_params(self) -> None: - """Test PointcloudFiltering with custom parameters.""" - try: - filtering = PointcloudFiltering( - color_weight=0.5, - enable_statistical_filtering=False, - enable_radius_filtering=False, - voxel_size=0.01, - max_num_objects=5, - ) - assert filtering.color_weight == 0.5 - assert not filtering.enable_statistical_filtering - assert not filtering.enable_radius_filtering - assert filtering.voxel_size == 0.01 - assert filtering.max_num_objects == 5 - except Exception as e: - pytest.skip(f"Skipping test due to initialization error: {e}") - - def test_pointcloud_filtering_process_images(self) -> None: - """Test PointcloudFiltering can process RGB-D images and return filtered point clouds.""" - try: - # Import data inside method to avoid pytest fixture confusion - from dimos.utils.data import get_data - - # Load test RGB-D data - data_dir = get_data("rgbd_frames") - - # Load first frame - color_path = os.path.join(data_dir, "color", "00000.png") - depth_path = os.path.join(data_dir, "depth", "00000.png") - intrinsics_path = os.path.join(data_dir, 
"color_camera_info.yaml") - - assert os.path.exists(color_path), f"Color image not found: {color_path}" - assert os.path.exists(depth_path), f"Depth image not found: {depth_path}" - assert os.path.exists(intrinsics_path), f"Intrinsics file not found: {intrinsics_path}" - - # Load images - color_img = cv2.imread(color_path) - color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) - - depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) - if depth_img.dtype == np.uint16: - depth_img = depth_img.astype(np.float32) / 1000.0 - - # Load camera intrinsics - camera_matrix = load_camera_matrix_from_yaml(intrinsics_path) - if camera_matrix is None: - pytest.skip("Failed to load camera intrinsics") - - # Create mock objects with segmentation masks - height, width = color_img.shape[:2] - - # Create simple rectangular masks for testing - mock_objects = [] - - # Object 1: Top-left quadrant - mask1 = np.zeros((height, width), dtype=bool) - mask1[height // 4 : height // 2, width // 4 : width // 2] = True - - obj1: ObjectData = { - "object_id": 1, - "confidence": 0.9, - "bbox": [width // 4, height // 4, width // 2, height // 2], - "segmentation_mask": mask1, - "name": "test_object_1", - } - mock_objects.append(obj1) - - # Object 2: Bottom-right quadrant - mask2 = np.zeros((height, width), dtype=bool) - mask2[height // 2 : 3 * height // 4, width // 2 : 3 * width // 4] = True - - obj2: ObjectData = { - "object_id": 2, - "confidence": 0.8, - "bbox": [width // 2, height // 2, 3 * width // 4, 3 * height // 4], - "segmentation_mask": mask2, - "name": "test_object_2", - } - mock_objects.append(obj2) - - # Initialize filtering with intrinsics - filtering = PointcloudFiltering( - color_intrinsics=camera_matrix, - depth_intrinsics=camera_matrix, - enable_statistical_filtering=False, # Disable for faster testing - enable_radius_filtering=False, # Disable for faster testing - voxel_size=0.01, # Larger voxel for faster processing - ) - - # Process images - results = 
filtering.process_images(color_img, depth_img, mock_objects) - - print( - f"Processing results - Input objects: {len(mock_objects)}, Output objects: {len(results)}" - ) - - # Verify results - assert isinstance(results, list), "Results should be a list" - assert len(results) <= len(mock_objects), "Should not return more objects than input" - - # Check each result object - for i, result in enumerate(results): - print(f"Object {i}: {result.get('name', 'unknown')}") - - # Verify required fields exist - assert "point_cloud" in result, "Result should contain point_cloud" - assert "color" in result, "Result should contain color" - assert "point_cloud_numpy" in result, "Result should contain point_cloud_numpy" - - # Verify point cloud is valid Open3D object - pcd = result["point_cloud"] - assert isinstance(pcd, o3d.geometry.PointCloud), ( - "point_cloud should be Open3D PointCloud" - ) - - # Verify numpy arrays - points_array = result["point_cloud_numpy"] - assert isinstance(points_array, np.ndarray), ( - "point_cloud_numpy should be numpy array" - ) - assert points_array.shape[1] == 3, "Point array should have 3 columns (x,y,z)" - assert points_array.dtype == np.float32, "Point array should be float32" - - # Verify color - color = result["color"] - assert isinstance(color, np.ndarray), "Color should be numpy array" - assert color.shape == (3,), "Color should be RGB triplet" - assert color.dtype == np.uint8, "Color should be uint8" - - # Check if 3D information was added (when enough points for cuboid fitting) - points = np.asarray(pcd.points) - if len(points) >= filtering.min_points_for_cuboid: - if "position" in result: - assert "rotation" in result, "Should have rotation if position exists" - assert "size" in result, "Should have size if position exists" - - # Verify position format - from dimos.types.vector import Vector - - position = result["position"] - assert isinstance(position, Vector), "Position should be Vector" - - # Verify size format - size = result["size"] 
- assert isinstance(size, dict), "Size should be dict" - assert "width" in size and "height" in size and "depth" in size - - print(f" - Points: {len(points)}") - print(f" - Color: {color}") - if "position" in result: - print(f" - Position: {result['position']}") - print(f" - Size: {result['size']}") - - # Test full point cloud access - full_pcd = filtering.get_full_point_cloud() - if full_pcd is not None: - assert isinstance(full_pcd, o3d.geometry.PointCloud), ( - "Full point cloud should be Open3D PointCloud" - ) - full_points = np.asarray(full_pcd.points) - print(f"Full point cloud points: {len(full_points)}") - - print("All pointcloud filtering tests passed!") - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - def test_pointcloud_filtering_empty_objects(self) -> None: - """Test PointcloudFiltering with empty object list.""" - try: - from dimos.utils.data import get_data - - # Load test data - data_dir = get_data("rgbd_frames") - color_path = os.path.join(data_dir, "color", "00000.png") - depth_path = os.path.join(data_dir, "depth", "00000.png") - - if not (os.path.exists(color_path) and os.path.exists(depth_path)): - pytest.skip("Test images not found") - - color_img = cv2.imread(color_path) - color_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB) - depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH) - if depth_img.dtype == np.uint16: - depth_img = depth_img.astype(np.float32) / 1000.0 - - filtering = PointcloudFiltering() - - # Test with empty object list - results = filtering.process_images(color_img, depth_img, []) - - assert isinstance(results, list), "Results should be a list" - assert len(results) == 0, "Should return empty list for empty input" - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - def test_color_generation_consistency(self) -> None: - """Test that color generation is consistent for the same object ID.""" - try: - filtering = PointcloudFiltering() - - # Test color generation 
consistency - color1 = filtering.generate_color_from_id(42) - color2 = filtering.generate_color_from_id(42) - color3 = filtering.generate_color_from_id(43) - - assert np.array_equal(color1, color2), "Same ID should generate same color" - assert not np.array_equal(color1, color3), ( - "Different IDs should generate different colors" - ) - assert color1.shape == (3,), "Color should be RGB triplet" - assert color1.dtype == np.uint8, "Color should be uint8" - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - -if __name__ == "__main__": - pytest.main(["-v", __file__]) diff --git a/dimos/perception/pointcloud/utils.py b/dimos/perception/pointcloud/utils.py deleted file mode 100644 index b2bb561000..0000000000 --- a/dimos/perception/pointcloud/utils.py +++ /dev/null @@ -1,1113 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Point cloud utilities for RGBD data processing. - -This module provides efficient utilities for creating and manipulating point clouds -from RGBD images using Open3D. 
-""" - -import os -from typing import Any - -import cv2 -import numpy as np -import open3d as o3d # type: ignore[import-untyped] -from scipy.spatial import cKDTree # type: ignore[import-untyped] -import yaml # type: ignore[import-untyped] - -from dimos.perception.common.utils import project_3d_points_to_2d - - -def load_camera_matrix_from_yaml( - camera_info: str | list[float] | np.ndarray | dict | None, # type: ignore[type-arg] -) -> np.ndarray | None: # type: ignore[type-arg] - """ - Load camera intrinsic matrix from various input formats. - - Args: - camera_info: Can be: - - Path to YAML file containing camera parameters - - List of [fx, fy, cx, cy] - - 3x3 numpy array (returned as-is) - - Dict with camera parameters - - None (returns None) - - Returns: - 3x3 camera intrinsic matrix or None if input is None - - Raises: - ValueError: If camera_info format is invalid or file cannot be read - FileNotFoundError: If YAML file path doesn't exist - """ - if camera_info is None: - return None - - # Handle case where camera_info is already a matrix - if isinstance(camera_info, np.ndarray) and camera_info.shape == (3, 3): - return camera_info.astype(np.float32) - - # Handle case where camera_info is [fx, fy, cx, cy] format - if isinstance(camera_info, list) and len(camera_info) == 4: - fx, fy, cx, cy = camera_info - return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) - - # Handle case where camera_info is a dict - if isinstance(camera_info, dict): - return _extract_matrix_from_dict(camera_info) - - # Handle case where camera_info is a path to a YAML file - if isinstance(camera_info, str): - if not os.path.isfile(camera_info): - raise FileNotFoundError(f"Camera info file not found: {camera_info}") - - try: - with open(camera_info) as f: - data = yaml.safe_load(f) - return _extract_matrix_from_dict(data) - except Exception as e: - raise ValueError(f"Failed to read camera info from {camera_info}: {e}") - - raise ValueError( - f"Invalid camera_info 
format. Expected str, list, dict, or numpy array, got {type(camera_info)}" - ) - - -def _extract_matrix_from_dict(data: dict) -> np.ndarray: # type: ignore[type-arg] - """Extract camera matrix from dictionary with various formats.""" - # ROS format with 'K' field (most common) - if "K" in data: - k_data = data["K"] - if len(k_data) == 9: - return np.array(k_data, dtype=np.float32).reshape(3, 3) - - # Standard format with 'camera_matrix' - if "camera_matrix" in data: - if "data" in data["camera_matrix"]: - matrix_data = data["camera_matrix"]["data"] - if len(matrix_data) == 9: - return np.array(matrix_data, dtype=np.float32).reshape(3, 3) - - # Explicit intrinsics format - if all(k in data for k in ["fx", "fy", "cx", "cy"]): - fx, fy = float(data["fx"]), float(data["fy"]) - cx, cy = float(data["cx"]), float(data["cy"]) - return np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) - - # Error case - provide helpful debug info - available_keys = list(data.keys()) - if "K" in data: - k_info = f"K field length: {len(data['K']) if hasattr(data['K'], '__len__') else 'unknown'}" - else: - k_info = "K field not found" - - raise ValueError( - f"Cannot extract camera matrix from data. " - f"Available keys: {available_keys}. {k_info}. " - f"Expected formats: 'K' (9 elements), 'camera_matrix.data' (9 elements), " - f"or individual 'fx', 'fy', 'cx', 'cy' fields." - ) - - -def create_o3d_point_cloud_from_rgbd( - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - intrinsic: np.ndarray, # type: ignore[type-arg] - depth_scale: float = 1.0, - depth_trunc: float = 3.0, -) -> o3d.geometry.PointCloud: - """ - Create an Open3D point cloud from RGB and depth images. 
- - Args: - color_img: RGB image as numpy array (H, W, 3) - depth_img: Depth image as numpy array (H, W) - intrinsic: Camera intrinsic matrix (3x3 numpy array) - depth_scale: Scale factor to convert depth to meters - depth_trunc: Maximum depth in meters - - Returns: - Open3D point cloud object - - Raises: - ValueError: If input dimensions are invalid - """ - # Validate inputs - if len(color_img.shape) != 3 or color_img.shape[2] != 3: - raise ValueError(f"color_img must be (H, W, 3), got {color_img.shape}") - if len(depth_img.shape) != 2: - raise ValueError(f"depth_img must be (H, W), got {depth_img.shape}") - if color_img.shape[:2] != depth_img.shape: - raise ValueError( - f"Color and depth image dimensions don't match: {color_img.shape[:2]} vs {depth_img.shape}" - ) - if intrinsic.shape != (3, 3): - raise ValueError(f"intrinsic must be (3, 3), got {intrinsic.shape}") - - # Convert to Open3D format - color_o3d = o3d.geometry.Image(color_img.astype(np.uint8)) - - # Filter out inf and nan values from depth image - depth_filtered = depth_img.copy() - - # Create mask for valid depth values (finite, positive, non-zero) - valid_mask = np.isfinite(depth_filtered) & (depth_filtered > 0) - - # Set invalid values to 0 (which Open3D treats as no depth) - depth_filtered[~valid_mask] = 0.0 - - depth_o3d = o3d.geometry.Image(depth_filtered.astype(np.float32)) - - # Create Open3D intrinsic object - height, width = color_img.shape[:2] - fx, fy = intrinsic[0, 0], intrinsic[1, 1] - cx, cy = intrinsic[0, 2], intrinsic[1, 2] - intrinsic_o3d = o3d.camera.PinholeCameraIntrinsic( - width, - height, - fx, - fy, # fx, fy - cx, - cy, # cx, cy - ) - - # Create RGBD image - rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( - color_o3d, - depth_o3d, - depth_scale=depth_scale, - depth_trunc=depth_trunc, - convert_rgb_to_intensity=False, - ) - - # Create point cloud - pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic_o3d) - - return pcd - - -def 
create_point_cloud_and_extract_masks( - color_img: np.ndarray, # type: ignore[type-arg] - depth_img: np.ndarray, # type: ignore[type-arg] - masks: list[np.ndarray], # type: ignore[type-arg] - intrinsic: np.ndarray, # type: ignore[type-arg] - depth_scale: float = 1.0, - depth_trunc: float = 3.0, -) -> tuple[o3d.geometry.PointCloud, list[o3d.geometry.PointCloud]]: - """ - Efficiently create a point cloud once and extract multiple masked regions. - - Args: - color_img: RGB image (H, W, 3) - depth_img: Depth image (H, W) - masks: List of boolean masks, each of shape (H, W) - intrinsic: Camera intrinsic matrix (3x3 numpy array) - depth_scale: Scale factor to convert depth to meters - depth_trunc: Maximum depth in meters - - Returns: - Tuple of (full_point_cloud, list_of_masked_point_clouds) - """ - if not masks: - return o3d.geometry.PointCloud(), [] - - # Create the full point cloud - full_pcd = create_o3d_point_cloud_from_rgbd( - color_img, depth_img, intrinsic, depth_scale, depth_trunc - ) - - if len(np.asarray(full_pcd.points)) == 0: - return full_pcd, [o3d.geometry.PointCloud() for _ in masks] - - # Create pixel-to-point mapping - valid_depth_mask = np.isfinite(depth_img) & (depth_img > 0) & (depth_img <= depth_trunc) - - valid_depth = valid_depth_mask.flatten() - if not np.any(valid_depth): - return full_pcd, [o3d.geometry.PointCloud() for _ in masks] - - pixel_to_point = np.full(len(valid_depth), -1, dtype=np.int32) - pixel_to_point[valid_depth] = np.arange(np.sum(valid_depth)) - - # Extract point clouds for each mask - masked_pcds = [] - max_points = len(np.asarray(full_pcd.points)) - - for mask in masks: - if mask.shape != depth_img.shape: - masked_pcds.append(o3d.geometry.PointCloud()) - continue - - mask_flat = mask.flatten() - valid_mask_indices = mask_flat & valid_depth - point_indices = pixel_to_point[valid_mask_indices] - valid_point_indices = point_indices[point_indices >= 0] - - if len(valid_point_indices) > 0: - valid_point_indices = 
np.clip(valid_point_indices, 0, max_points - 1) - valid_point_indices = np.unique(valid_point_indices) - masked_pcd = full_pcd.select_by_index(valid_point_indices.tolist()) - else: - masked_pcd = o3d.geometry.PointCloud() - - masked_pcds.append(masked_pcd) - - return full_pcd, masked_pcds - - -def filter_point_cloud_statistical( - pcd: o3d.geometry.PointCloud, nb_neighbors: int = 20, std_ratio: float = 2.0 -) -> tuple[o3d.geometry.PointCloud, np.ndarray]: # type: ignore[type-arg] - """ - Apply statistical outlier filtering to point cloud. - - Args: - pcd: Input point cloud - nb_neighbors: Number of neighbors to analyze for each point - std_ratio: Threshold level based on standard deviation - - Returns: - Tuple of (filtered_point_cloud, outlier_indices) - """ - if len(np.asarray(pcd.points)) == 0: - return pcd, np.array([]) - - return pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio) # type: ignore[no-any-return] - - -def filter_point_cloud_radius( - pcd: o3d.geometry.PointCloud, nb_points: int = 16, radius: float = 0.05 -) -> tuple[o3d.geometry.PointCloud, np.ndarray]: # type: ignore[type-arg] - """ - Apply radius-based outlier filtering to point cloud. - - Args: - pcd: Input point cloud - nb_points: Minimum number of points within radius - radius: Search radius in meters - - Returns: - Tuple of (filtered_point_cloud, outlier_indices) - """ - if len(np.asarray(pcd.points)) == 0: - return pcd, np.array([]) - - return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius) # type: ignore[no-any-return] - - -def overlay_point_clouds_on_image( - base_image: np.ndarray, # type: ignore[type-arg] - point_clouds: list[o3d.geometry.PointCloud], - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] - colors: list[tuple[int, int, int]], - point_size: int = 2, - alpha: float = 0.7, -) -> np.ndarray: # type: ignore[type-arg] - """ - Overlay multiple colored point clouds onto an image. 
- - Args: - base_image: Base image to overlay onto (H, W, 3) - assumed to be RGB - point_clouds: List of Open3D point cloud objects - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - colors: List of RGB color tuples for each point cloud. If None, generates distinct colors. - point_size: Size of points to draw (in pixels) - alpha: Blending factor for overlay (0.0 = fully transparent, 1.0 = fully opaque) - - Returns: - Image with overlaid point clouds (H, W, 3) - """ - if len(point_clouds) == 0: - return base_image.copy() - - # Create overlay image - overlay = base_image.copy() - height, width = base_image.shape[:2] - - # Process each point cloud - for i, pcd in enumerate(point_clouds): - if pcd is None: - continue - - points_3d = np.asarray(pcd.points) - if len(points_3d) == 0: - continue - - # Project 3D points to 2D - points_2d = project_3d_points_to_2d(points_3d, camera_intrinsics) - - if len(points_2d) == 0: - continue - - # Filter points within image bounds - valid_mask = ( - (points_2d[:, 0] >= 0) - & (points_2d[:, 0] < width) - & (points_2d[:, 1] >= 0) - & (points_2d[:, 1] < height) - ) - valid_points_2d = points_2d[valid_mask] - - if len(valid_points_2d) == 0: - continue - - # Get color for this point cloud - color = colors[i % len(colors)] - - # Ensure color is a tuple of integers for OpenCV - if isinstance(color, list | tuple | np.ndarray): - color = tuple(int(c) for c in color[:3]) # type: ignore[assignment] - else: - color = (255, 255, 255) - - # Draw points on overlay - for point in valid_points_2d: - u, v = point - # Draw a small filled circle for each point - cv2.circle(overlay, (u, v), point_size, color, -1) - - # Blend overlay with base image - result = cv2.addWeighted(base_image, 1 - alpha, overlay, alpha, 0) - - return result - - -def create_point_cloud_overlay_visualization( - base_image: np.ndarray, # type: ignore[type-arg] - objects: list[dict], # type: ignore[type-arg] - intrinsics: np.ndarray, # type: 
ignore[type-arg] -) -> np.ndarray: # type: ignore[type-arg] - """ - Create a visualization showing object point clouds and bounding boxes overlaid on a base image. - - Args: - base_image: Base image to overlay onto (H, W, 3) - objects: List of object dictionaries containing 'point_cloud', 'color', 'position', 'rotation', 'size' keys - intrinsics: Camera intrinsics as [fx, fy, cx, cy] or 3x3 matrix - - Returns: - Visualization image with overlaid point clouds and bounding boxes (H, W, 3) - """ - # Extract point clouds and colors from objects - point_clouds = [] - colors = [] - for obj in objects: - if "point_cloud" in obj and obj["point_cloud"] is not None: - point_clouds.append(obj["point_cloud"]) - - # Convert color to tuple - color = obj["color"] - if isinstance(color, np.ndarray): - color = tuple(int(c) for c in color) - elif isinstance(color, list | tuple): - color = tuple(int(c) for c in color[:3]) - colors.append(color) - - # Create visualization - if point_clouds: - result = overlay_point_clouds_on_image( - base_image=base_image, - point_clouds=point_clouds, - camera_intrinsics=intrinsics, - colors=colors, - point_size=3, - alpha=0.8, - ) - else: - result = base_image.copy() - - # Draw 3D bounding boxes - height_img, width_img = result.shape[:2] - for i, obj in enumerate(objects): - if all(key in obj and obj[key] is not None for key in ["position", "rotation", "size"]): - try: - # Create and project 3D bounding box - corners_3d = create_3d_bounding_box_corners( - obj["position"], obj["rotation"], obj["size"] - ) - corners_2d = project_3d_points_to_2d(corners_3d, intrinsics) - - # Check if any corners are visible - valid_mask = ( - (corners_2d[:, 0] >= 0) - & (corners_2d[:, 0] < width_img) - & (corners_2d[:, 1] >= 0) - & (corners_2d[:, 1] < height_img) - ) - - if np.any(valid_mask): - # Get color - bbox_color = colors[i] if i < len(colors) else (255, 255, 255) - draw_3d_bounding_box_on_image(result, corners_2d, bbox_color, thickness=2) - except: - continue - 
- return result - - -def create_3d_bounding_box_corners(position, rotation, size: int): # type: ignore[no-untyped-def] - """ - Create 8 corners of a 3D bounding box from position, rotation, and size. - - Args: - position: Vector or dict with x, y, z coordinates - rotation: Vector or dict with roll, pitch, yaw angles - size: Dict with width, height, depth - - Returns: - 8x3 numpy array of corner coordinates - """ - # Convert position to numpy array - if hasattr(position, "x"): # Vector object - center = np.array([position.x, position.y, position.z]) - else: # Dictionary - center = np.array([position["x"], position["y"], position["z"]]) - - # Convert rotation (euler angles) to rotation matrix - if hasattr(rotation, "x"): # Vector object (roll, pitch, yaw) - roll, pitch, yaw = rotation.x, rotation.y, rotation.z - else: # Dictionary - roll, pitch, yaw = rotation["roll"], rotation["pitch"], rotation["yaw"] - - # Create rotation matrix from euler angles (ZYX order) - cos_r, sin_r = np.cos(roll), np.sin(roll) - cos_p, sin_p = np.cos(pitch), np.sin(pitch) - cos_y, sin_y = np.cos(yaw), np.sin(yaw) - - # Rotation matrix for ZYX euler angles - R = np.array( - [ - [ - cos_y * cos_p, - cos_y * sin_p * sin_r - sin_y * cos_r, - cos_y * sin_p * cos_r + sin_y * sin_r, - ], - [ - sin_y * cos_p, - sin_y * sin_p * sin_r + cos_y * cos_r, - sin_y * sin_p * cos_r - cos_y * sin_r, - ], - [-sin_p, cos_p * sin_r, cos_p * cos_r], - ] - ) - - # Get dimensions - width = size.get("width", 0.1) # type: ignore[attr-defined] - height = size.get("height", 0.1) # type: ignore[attr-defined] - depth = size.get("depth", 0.1) # type: ignore[attr-defined] - - # Create 8 corners of the bounding box (before rotation) - corners = np.array( - [ - [-width / 2, -height / 2, -depth / 2], # 0 - [width / 2, -height / 2, -depth / 2], # 1 - [width / 2, height / 2, -depth / 2], # 2 - [-width / 2, height / 2, -depth / 2], # 3 - [-width / 2, -height / 2, depth / 2], # 4 - [width / 2, -height / 2, depth / 2], # 5 - 
[width / 2, height / 2, depth / 2], # 6 - [-width / 2, height / 2, depth / 2], # 7 - ] - ) - - # Apply rotation and translation - rotated_corners = corners @ R.T + center - - return rotated_corners - - -def draw_3d_bounding_box_on_image(image, corners_2d, color, thickness: int = 2) -> None: # type: ignore[no-untyped-def] - """ - Draw a 3D bounding box on an image using projected 2D corners. - - Args: - image: Image to draw on - corners_2d: 8x2 array of 2D corner coordinates - color: RGB color tuple - thickness: Line thickness - """ - # Define the 12 edges of a cube (connecting corner indices) - edges = [ - (0, 1), - (1, 2), - (2, 3), - (3, 0), # Bottom face - (4, 5), - (5, 6), - (6, 7), - (7, 4), # Top face - (0, 4), - (1, 5), - (2, 6), - (3, 7), # Vertical edges - ] - - # Draw each edge - for start_idx, end_idx in edges: - start_point = tuple(corners_2d[start_idx].astype(int)) - end_point = tuple(corners_2d[end_idx].astype(int)) - cv2.line(image, start_point, end_point, color, thickness) - - -def extract_and_cluster_misc_points( - full_pcd: o3d.geometry.PointCloud, - all_objects: list[dict], # type: ignore[type-arg] - eps: float = 0.03, - min_points: int = 100, - enable_filtering: bool = True, - voxel_size: float = 0.02, -) -> tuple[list[o3d.geometry.PointCloud], o3d.geometry.VoxelGrid]: - """ - Extract miscellaneous/background points and cluster them using DBSCAN. 
- - Args: - full_pcd: Complete scene point cloud - all_objects: List of objects with point clouds to subtract - eps: DBSCAN epsilon parameter (max distance between points in cluster) - min_points: DBSCAN min_samples parameter (min points to form cluster) - enable_filtering: Whether to apply statistical and radius filtering - voxel_size: Size of voxels for voxel grid generation - - Returns: - Tuple of (clustered_point_clouds, voxel_grid) - """ - if full_pcd is None or len(np.asarray(full_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - if not all_objects: - # If no objects detected, cluster the full point cloud - clusters = _cluster_point_cloud_dbscan(full_pcd, eps, min_points) - voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) - return clusters, voxel_grid - - try: - # Start with a copy of the full point cloud - misc_pcd = o3d.geometry.PointCloud(full_pcd) - - # Remove object points by combining all object point clouds - all_object_points = [] - for obj in all_objects: - if "point_cloud" in obj and obj["point_cloud"] is not None: - obj_points = np.asarray(obj["point_cloud"].points) - if len(obj_points) > 0: - all_object_points.append(obj_points) - - if not all_object_points: - # No object points to remove, cluster full point cloud - clusters = _cluster_point_cloud_dbscan(misc_pcd, eps, min_points) - voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) - return clusters, voxel_grid - - # Combine all object points - combined_obj_points = np.vstack(all_object_points) - - # For efficiency, downsample both point clouds - misc_downsampled = misc_pcd.voxel_down_sample(voxel_size=0.005) - - # Create object point cloud for efficient operations - obj_pcd = o3d.geometry.PointCloud() - obj_pcd.points = o3d.utility.Vector3dVector(combined_obj_points) - obj_downsampled = obj_pcd.voxel_down_sample(voxel_size=0.005) - - misc_points = np.asarray(misc_downsampled.points) - obj_points_down = np.asarray(obj_downsampled.points) - - if 
len(misc_points) == 0 or len(obj_points_down) == 0: - clusters = _cluster_point_cloud_dbscan(misc_downsampled, eps, min_points) - voxel_grid = _create_voxel_grid_from_clusters(clusters, voxel_size) - return clusters, voxel_grid - - # Build tree for object points - obj_tree = cKDTree(obj_points_down) - - # Find distances from misc points to nearest object points - distances, _ = obj_tree.query(misc_points, k=1) - - # Keep points that are far enough from any object point - threshold = 0.015 # 1.5cm threshold - keep_mask = distances > threshold - - if not np.any(keep_mask): - return [], o3d.geometry.VoxelGrid() - - # Filter misc points - misc_indices = np.where(keep_mask)[0] - final_misc_pcd = misc_downsampled.select_by_index(misc_indices) - - if len(np.asarray(final_misc_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - # Apply additional filtering if enabled - if enable_filtering: - # Apply statistical outlier filtering - filtered_misc_pcd, _ = filter_point_cloud_statistical( - final_misc_pcd, nb_neighbors=30, std_ratio=2.0 - ) - - if len(np.asarray(filtered_misc_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - # Apply radius outlier filtering - final_filtered_misc_pcd, _ = filter_point_cloud_radius( - filtered_misc_pcd, - nb_points=20, - radius=0.03, # 3cm radius - ) - - if len(np.asarray(final_filtered_misc_pcd.points)) == 0: - return [], o3d.geometry.VoxelGrid() - - final_misc_pcd = final_filtered_misc_pcd - - # Cluster the misc points using DBSCAN - clusters = _cluster_point_cloud_dbscan(final_misc_pcd, eps, min_points) - - # Create voxel grid from all misc points (before clustering) - voxel_grid = _create_voxel_grid_from_point_cloud(final_misc_pcd, voxel_size) - - return clusters, voxel_grid - - except Exception as e: - print(f"Error in misc point extraction and clustering: {e}") - # Fallback: return downsampled full point cloud as single cluster - try: - downsampled = full_pcd.voxel_down_sample(voxel_size=0.02) - if 
len(np.asarray(downsampled.points)) > 0: - voxel_grid = _create_voxel_grid_from_point_cloud(downsampled, voxel_size) - return [downsampled], voxel_grid - else: - return [], o3d.geometry.VoxelGrid() - except: - return [], o3d.geometry.VoxelGrid() - - -def _create_voxel_grid_from_point_cloud( - pcd: o3d.geometry.PointCloud, voxel_size: float = 0.02 -) -> o3d.geometry.VoxelGrid: - """ - Create a voxel grid from a point cloud. - - Args: - pcd: Input point cloud - voxel_size: Size of each voxel - - Returns: - Open3D VoxelGrid object - """ - if len(np.asarray(pcd.points)) == 0: - return o3d.geometry.VoxelGrid() - - try: - # Create voxel grid from point cloud - voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size) - - # Color the voxels with a semi-transparent gray - for voxel in voxel_grid.get_voxels(): - voxel.color = [0.5, 0.5, 0.5] # Gray color - - print( - f"Created voxel grid with {len(voxel_grid.get_voxels())} voxels (voxel_size={voxel_size})" - ) - return voxel_grid - - except Exception as e: - print(f"Error creating voxel grid: {e}") - return o3d.geometry.VoxelGrid() - - -def _create_voxel_grid_from_clusters( - clusters: list[o3d.geometry.PointCloud], voxel_size: float = 0.02 -) -> o3d.geometry.VoxelGrid: - """ - Create a voxel grid from multiple clustered point clouds. 
- - Args: - clusters: List of clustered point clouds - voxel_size: Size of each voxel - - Returns: - Open3D VoxelGrid object - """ - if not clusters: - return o3d.geometry.VoxelGrid() - - # Combine all clusters into one point cloud - combined_points = [] - for cluster in clusters: - points = np.asarray(cluster.points) - if len(points) > 0: - combined_points.append(points) - - if not combined_points: - return o3d.geometry.VoxelGrid() - - # Create combined point cloud - all_points = np.vstack(combined_points) - combined_pcd = o3d.geometry.PointCloud() - combined_pcd.points = o3d.utility.Vector3dVector(all_points) - - return _create_voxel_grid_from_point_cloud(combined_pcd, voxel_size) - - -def _cluster_point_cloud_dbscan( - pcd: o3d.geometry.PointCloud, eps: float = 0.05, min_points: int = 50 -) -> list[o3d.geometry.PointCloud]: - """ - Cluster a point cloud using DBSCAN and return list of clustered point clouds. - - Args: - pcd: Point cloud to cluster - eps: DBSCAN epsilon parameter - min_points: DBSCAN min_samples parameter - - Returns: - List of point clouds, one for each cluster - """ - if len(np.asarray(pcd.points)) == 0: - return [] - - try: - # Apply DBSCAN clustering - labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_points)) - - # Get unique cluster labels (excluding noise points labeled as -1) - unique_labels = np.unique(labels) - cluster_pcds = [] - - for label in unique_labels: - if label == -1: # Skip noise points - continue - - # Get indices for this cluster - cluster_indices = np.where(labels == label)[0] - - if len(cluster_indices) > 0: - # Create point cloud for this cluster - cluster_pcd = pcd.select_by_index(cluster_indices) - - # Assign a random color to this cluster - cluster_color = np.random.rand(3) # Random RGB color - cluster_pcd.paint_uniform_color(cluster_color) - - cluster_pcds.append(cluster_pcd) - - print( - f"DBSCAN clustering found {len(cluster_pcds)} clusters from {len(np.asarray(pcd.points))} points" - ) - return 
cluster_pcds - - except Exception as e: - print(f"Error in DBSCAN clustering: {e}") - return [pcd] # Return original point cloud as fallback - - -def get_standard_coordinate_transform(): # type: ignore[no-untyped-def] - """ - Get a standard coordinate transformation matrix for consistent visualization. - - This transformation ensures that: - - X (red) axis points right - - Y (green) axis points up - - Z (blue) axis points toward viewer - - Returns: - 4x4 transformation matrix - """ - # Standard transformation matrix to ensure consistent coordinate frame orientation - transform = np.array( - [ - [1, 0, 0, 0], # X points right - [0, -1, 0, 0], # Y points up (flip from OpenCV to standard) - [0, 0, -1, 0], # Z points toward viewer (flip depth) - [0, 0, 0, 1], - ] - ) - return transform - - -def visualize_clustered_point_clouds( - clustered_pcds: list[o3d.geometry.PointCloud], - window_name: str = "Clustered Point Clouds", - point_size: float = 2.0, - show_coordinate_frame: bool = True, - coordinate_frame_size: float = 0.1, -) -> None: - """ - Visualize multiple clustered point clouds with different colors. 
- - Args: - clustered_pcds: List of point clouds (already colored) - window_name: Name of the visualization window - point_size: Size of points in the visualization - show_coordinate_frame: Whether to show coordinate frame - coordinate_frame_size: Size of the coordinate frame - """ - if not clustered_pcds: - print("Warning: No clustered point clouds to visualize") - return - - # Apply standard coordinate transformation - transform = get_standard_coordinate_transform() # type: ignore[no-untyped-call] - geometries = [] - for pcd in clustered_pcds: - pcd_copy = o3d.geometry.PointCloud(pcd) - pcd_copy.transform(transform) - geometries.append(pcd_copy) - - # Add coordinate frame - if show_coordinate_frame: - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=coordinate_frame_size - ) - coordinate_frame.transform(transform) - geometries.append(coordinate_frame) - - total_points = sum(len(np.asarray(pcd.points)) for pcd in clustered_pcds) - print(f"Visualizing {len(clustered_pcds)} clusters with {total_points} total points") - - try: - vis = o3d.visualization.Visualizer() - vis.create_window(window_name=window_name, width=1280, height=720) - for geom in geometries: - vis.add_geometry(geom) - render_option = vis.get_render_option() - render_option.point_size = point_size - vis.run() - vis.destroy_window() - except Exception as e: - print(f"Failed to create interactive visualization: {e}") - o3d.visualization.draw_geometries( - geometries, window_name=window_name, width=1280, height=720 - ) - - -def visualize_pcd( - pcd: o3d.geometry.PointCloud, - window_name: str = "Point Cloud Visualization", - point_size: float = 1.0, - show_coordinate_frame: bool = True, - coordinate_frame_size: float = 0.1, -) -> None: - """ - Visualize an Open3D point cloud using Open3D's visualization window. 
- - Args: - pcd: Open3D point cloud to visualize - window_name: Name of the visualization window - point_size: Size of points in the visualization - show_coordinate_frame: Whether to show coordinate frame - coordinate_frame_size: Size of the coordinate frame - """ - if pcd is None: - print("Warning: Point cloud is None, nothing to visualize") - return - - if len(np.asarray(pcd.points)) == 0: - print("Warning: Point cloud is empty, nothing to visualize") - return - - # Apply standard coordinate transformation - transform = get_standard_coordinate_transform() # type: ignore[no-untyped-call] - pcd_copy = o3d.geometry.PointCloud(pcd) - pcd_copy.transform(transform) - geometries = [pcd_copy] - - # Add coordinate frame - if show_coordinate_frame: - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=coordinate_frame_size - ) - coordinate_frame.transform(transform) - geometries.append(coordinate_frame) - - print(f"Visualizing point cloud with {len(np.asarray(pcd.points))} points") - - try: - vis = o3d.visualization.Visualizer() - vis.create_window(window_name=window_name, width=1280, height=720) - for geom in geometries: - vis.add_geometry(geom) - render_option = vis.get_render_option() - render_option.point_size = point_size - vis.run() - vis.destroy_window() - except Exception as e: - print(f"Failed to create interactive visualization: {e}") - o3d.visualization.draw_geometries( - geometries, window_name=window_name, width=1280, height=720 - ) - - -def visualize_voxel_grid( - voxel_grid: o3d.geometry.VoxelGrid, - window_name: str = "Voxel Grid Visualization", - show_coordinate_frame: bool = True, - coordinate_frame_size: float = 0.1, -) -> None: - """ - Visualize an Open3D voxel grid using Open3D's visualization window. 
- - Args: - voxel_grid: Open3D voxel grid to visualize - window_name: Name of the visualization window - show_coordinate_frame: Whether to show coordinate frame - coordinate_frame_size: Size of the coordinate frame - """ - if voxel_grid is None: - print("Warning: Voxel grid is None, nothing to visualize") - return - - if len(voxel_grid.get_voxels()) == 0: - print("Warning: Voxel grid is empty, nothing to visualize") - return - - # VoxelGrid doesn't support transform, so we need to transform the source points instead - # For now, just visualize as-is with transformed coordinate frame - geometries = [voxel_grid] - - # Add coordinate frame - if show_coordinate_frame: - coordinate_frame = o3d.geometry.TriangleMesh.create_coordinate_frame( - size=coordinate_frame_size - ) - coordinate_frame.transform(get_standard_coordinate_transform()) # type: ignore[no-untyped-call] - geometries.append(coordinate_frame) - - print(f"Visualizing voxel grid with {len(voxel_grid.get_voxels())} voxels") - - try: - vis = o3d.visualization.Visualizer() - vis.create_window(window_name=window_name, width=1280, height=720) - for geom in geometries: - vis.add_geometry(geom) - vis.run() - vis.destroy_window() - except Exception as e: - print(f"Failed to create interactive visualization: {e}") - o3d.visualization.draw_geometries( - geometries, window_name=window_name, width=1280, height=720 - ) - - -def combine_object_pointclouds( - point_clouds: list[np.ndarray] | list[o3d.geometry.PointCloud], # type: ignore[type-arg] - colors: list[np.ndarray] | None = None, # type: ignore[type-arg] -) -> o3d.geometry.PointCloud: - """ - Combine multiple point clouds into a single Open3D point cloud. 
- - Args: - point_clouds: List of point clouds as numpy arrays or Open3D point clouds - colors: List of colors as numpy arrays - Returns: - Combined Open3D point cloud - """ - all_points = [] - all_colors = [] - - for i, pcd in enumerate(point_clouds): - if isinstance(pcd, np.ndarray): - points = pcd[:, :3] - all_points.append(points) - if colors: - all_colors.append(colors[i]) - - elif isinstance(pcd, o3d.geometry.PointCloud): - points = np.asarray(pcd.points) - all_points.append(points) - if pcd.has_colors(): - colors = np.asarray(pcd.colors) # type: ignore[assignment] - all_colors.append(colors) # type: ignore[arg-type] - - if not all_points: - return o3d.geometry.PointCloud() - - combined_pcd = o3d.geometry.PointCloud() - combined_pcd.points = o3d.utility.Vector3dVector(np.vstack(all_points)) - - if all_colors: - combined_pcd.colors = o3d.utility.Vector3dVector(np.vstack(all_colors)) - - return combined_pcd - - -def extract_centroids_from_masks( - rgb_image: np.ndarray, # type: ignore[type-arg] - depth_image: np.ndarray, # type: ignore[type-arg] - masks: list[np.ndarray], # type: ignore[type-arg] - camera_intrinsics: list[float] | np.ndarray, # type: ignore[type-arg] -) -> list[dict[str, Any]]: - """ - Extract 3D centroids and orientations from segmentation masks. 
- - Args: - rgb_image: RGB image (H, W, 3) - depth_image: Depth image (H, W) in meters - masks: List of boolean masks (H, W) - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] or 3x3 matrix - - Returns: - List of dictionaries containing: - - centroid: 3D centroid position [x, y, z] in camera frame - - orientation: Normalized direction vector from camera to centroid - - num_points: Number of valid 3D points - - mask_idx: Index of the mask in the input list - """ - # Extract camera parameters - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - else: - fx = camera_intrinsics[0, 0] # type: ignore[call-overload] - fy = camera_intrinsics[1, 1] # type: ignore[call-overload] - cx = camera_intrinsics[0, 2] # type: ignore[call-overload] - cy = camera_intrinsics[1, 2] # type: ignore[call-overload] - - results = [] - - for mask_idx, mask in enumerate(masks): - if mask is None or mask.sum() == 0: - continue - - # Get pixel coordinates where mask is True - y_coords, x_coords = np.where(mask) - - # Get depth values at mask locations - depths = depth_image[y_coords, x_coords] - - # Convert to 3D points in camera frame - X = (x_coords - cx) * depths / fx - Y = (y_coords - cy) * depths / fy - Z = depths - - # Calculate centroid - centroid_x = np.mean(X) - centroid_y = np.mean(Y) - centroid_z = np.mean(Z) - centroid = np.array([centroid_x, centroid_y, centroid_z]) - - # Calculate orientation as normalized direction from camera origin to centroid - # Camera origin is at (0, 0, 0) - orientation = centroid / np.linalg.norm(centroid) - - results.append( - { - "centroid": centroid, - "orientation": orientation, - "num_points": int(mask.sum()), - "mask_idx": mask_idx, - } - ) - - return results diff --git a/dimos/perception/segmentation/__init__.py b/dimos/perception/segmentation/__init__.py deleted file mode 100644 index a48a76d6a4..0000000000 --- a/dimos/perception/segmentation/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ 
-from .sam_2d_seg import * -from .utils import * diff --git a/dimos/perception/segmentation/config/custom_tracker.yaml b/dimos/perception/segmentation/config/custom_tracker.yaml deleted file mode 100644 index 7a6748ebf6..0000000000 --- a/dimos/perception/segmentation/config/custom_tracker.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license - -# Default Ultralytics settings for BoT-SORT tracker when using mode="track" -# For documentation and examples see https://docs.ultralytics.com/modes/track/ -# For BoT-SORT source code see https://github.com/NirAharon/BoT-SORT - -tracker_type: botsort # tracker type, ['botsort', 'bytetrack'] -track_high_thresh: 0.4 # threshold for the first association -track_low_thresh: 0.2 # threshold for the second association -new_track_thresh: 0.5 # threshold for init new track if the detection does not match any tracks -track_buffer: 100 # buffer to calculate the time when to remove tracks -match_thresh: 0.4 # threshold for matching tracks -fuse_score: False # Whether to fuse confidence scores with the iou distances before matching -# min_box_area: 10 # threshold for min box areas(for tracker evaluation, not used for now) - -# BoT-SORT settings -gmc_method: sparseOptFlow # method of global motion compensation -# ReID model related thresh (not supported yet) -proximity_thresh: 0.6 -appearance_thresh: 0.35 -with_reid: False diff --git a/dimos/perception/segmentation/image_analyzer.py b/dimos/perception/segmentation/image_analyzer.py deleted file mode 100644 index 06db712ac7..0000000000 --- a/dimos/perception/segmentation/image_analyzer.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import base64 -import os - -import cv2 -from openai import OpenAI - -NORMAL_PROMPT = "What are in these images? Give a short word answer with at most two words, \ - if not sure, give a description of its shape or color like 'small tube', 'blue item'. \" \ - if does not look like an object, say 'unknown'. Export objects as a list of strings \ - in this exact format '['object 1', 'object 2', '...']'." - -RICH_PROMPT = ( - "What are in these images? Give a detailed description of each item, the first n images will be \ - cropped patches of the original image detected by the object detection model. \ - The last image will be the original image. Use the last image only for context, \ - do not describe objects in the last image. \ - Export the objects as a list of strings in this exact format, '['description of object 1', '...', '...']', \ - don't include anything else. " -) - - -class ImageAnalyzer: - def __init__(self) -> None: - """ - Initializes the ImageAnalyzer with OpenAI API credentials. - """ - self.client = OpenAI() - - def encode_image(self, image): # type: ignore[no-untyped-def] - """ - Encodes an image to Base64. - - Parameters: - image (numpy array): Image array (BGR format). - - Returns: - str: Base64 encoded string of the image. - """ - _, buffer = cv2.imencode(".jpg", image) - return base64.b64encode(buffer).decode("utf-8") - - def analyze_images(self, images, detail: str = "auto", prompt_type: str = "normal"): # type: ignore[no-untyped-def] - """ - Takes a list of cropped images and returns descriptions from OpenAI's Vision model. 
- - Parameters: - images (list of numpy arrays): Cropped images from the original frame. - detail (str): "low", "high", or "auto" to set image processing detail. - prompt_type (str): "normal" or "rich" to set the prompt type. - - Returns: - list of str: Descriptions of objects in each image. - """ - image_data = [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{self.encode_image(img)}", # type: ignore[no-untyped-call] - "detail": detail, - }, - } - for img in images - ] - - if prompt_type == "normal": - prompt = NORMAL_PROMPT - elif prompt_type == "rich": - prompt = RICH_PROMPT - else: - raise ValueError(f"Invalid prompt type: {prompt_type}") - - response = self.client.chat.completions.create( - model="gpt-4o-mini", - messages=[ - { # type: ignore[list-item, misc] - "role": "user", - "content": [{"type": "text", "text": prompt}, *image_data], - } - ], - max_tokens=300, - timeout=5, - ) - - # Accessing the content of the response using dot notation - return next(choice.message.content for choice in response.choices) - - -def main() -> None: - # Define the directory containing cropped images - cropped_images_dir = "cropped_images" - if not os.path.exists(cropped_images_dir): - print(f"Directory '{cropped_images_dir}' does not exist.") - return - - # Load all images from the directory - images = [] - for filename in os.listdir(cropped_images_dir): - if filename.endswith(".jpg") or filename.endswith(".png"): - image_path = os.path.join(cropped_images_dir, filename) - image = cv2.imread(image_path) - if image is not None: - images.append(image) - else: - print(f"Warning: Could not read image {image_path}") - - if not images: - print("No valid images found in the directory.") - return - - # Initialize ImageAnalyzer - analyzer = ImageAnalyzer() - - # Analyze images - results = analyzer.analyze_images(images) - - # Split results into a list of items - object_list = [item.strip()[2:] for item in results.split("\n")] - - # Overlay text on 
images and display them - for i, (img, obj) in enumerate(zip(images, object_list, strict=False)): - if obj: # Only process non-empty lines - # Add text to image - font = cv2.FONT_HERSHEY_SIMPLEX - font_scale = 0.5 - thickness = 2 - text = obj.strip() - - # Get text size - (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, thickness) - - # Position text at top of image - x = 10 - y = text_height + 10 - - # Add white background for text - cv2.rectangle( - img, (x - 5, y - text_height - 5), (x + text_width + 5, y + 5), (255, 255, 255), -1 - ) - # Add text - cv2.putText(img, text, (x, y), font, font_scale, (0, 0, 0), thickness) - - # Save or display the image - cv2.imwrite(f"annotated_image_{i}.jpg", img) - print(f"Detected object: {obj}") - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py deleted file mode 100644 index 741f71a9ab..0000000000 --- a/dimos/perception/segmentation/sam_2d_seg.py +++ /dev/null @@ -1,366 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from collections import deque -from collections.abc import Sequence -from concurrent.futures import ThreadPoolExecutor -import os -import time - -import cv2 -import onnxruntime # type: ignore[import-untyped] -import torch -from ultralytics import FastSAM # type: ignore[attr-defined, import-not-found] - -from dimos.perception.common.detection2d_tracker import get_tracked_results, target2dTracker -from dimos.perception.segmentation.image_analyzer import ImageAnalyzer -from dimos.perception.segmentation.utils import ( - crop_images_from_bboxes, - extract_masks_bboxes_probs_names, - filter_segmentation_results, - plot_results, -) -from dimos.utils.data import get_data -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class Sam2DSegmenter: - def __init__( - self, - model_path: str = "models_fastsam", - model_name: str = "FastSAM-s.onnx", - min_analysis_interval: float = 5.0, - use_tracker: bool = True, - use_analyzer: bool = True, - use_rich_labeling: bool = False, - use_filtering: bool = True, - ) -> None: - # Use GPU if available, otherwise fall back to CPU - if torch.cuda.is_available(): - logger.info("Using CUDA for SAM 2d segmenter") - if hasattr(onnxruntime, "preload_dlls"): # Handles CUDA 11 / onnxruntime-gpu<=1.18 - onnxruntime.preload_dlls(cuda=True, cudnn=True) - self.device = "cuda" - # MacOS Metal performance shaders - elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): - logger.info("Using Metal for SAM 2d segmenter") - self.device = "mps" - else: - logger.info("Using CPU for SAM 2d segmenter") - self.device = "cpu" - - # Core components - self.model = FastSAM(get_data(model_path) / model_name) - self.use_tracker = use_tracker - self.use_analyzer = use_analyzer - self.use_rich_labeling = use_rich_labeling - self.use_filtering = use_filtering - - module_dir = os.path.dirname(__file__) - self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml") - - # Initialize tracker if 
enabled - if self.use_tracker: - self.tracker = target2dTracker( - history_size=80, - score_threshold_start=0.7, - score_threshold_stop=0.05, - min_frame_count=10, - max_missed_frames=50, - min_area_ratio=0.05, - max_area_ratio=0.4, - texture_range=(0.0, 0.35), - border_safe_distance=100, - weights={"prob": 1.0, "temporal": 3.0, "texture": 2.0, "border": 3.0, "size": 1.0}, - ) - - # Initialize analyzer components if enabled - if self.use_analyzer: - self.image_analyzer = ImageAnalyzer() - self.min_analysis_interval = min_analysis_interval - self.last_analysis_time = 0 - self.to_be_analyzed = deque() # type: ignore[var-annotated] - self.object_names = {} # type: ignore[var-annotated] - self.analysis_executor = ThreadPoolExecutor(max_workers=1) - self.current_future = None - self.current_queue_ids = None - - def process_image(self, image): # type: ignore[no-untyped-def] - """Process an image and return segmentation results.""" - results = self.model.track( - source=image, - device=self.device, - retina_masks=True, - conf=0.3, - iou=0.5, - persist=True, - verbose=False, - ) - - if len(results) > 0: - # Get initial segmentation results - masks, bboxes, track_ids, probs, names, areas = extract_masks_bboxes_probs_names( - results[0] - ) - - # Filter results - if self.use_filtering: - ( - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - filtered_texture_values, - ) = filter_segmentation_results( - image, masks, bboxes, track_ids, probs, names, areas - ) - else: - # Use original results without filtering - filtered_masks = masks - filtered_bboxes = bboxes - filtered_track_ids = track_ids - filtered_probs = probs - filtered_names = names - filtered_texture_values = [] - - if self.use_tracker: - # Update tracker with filtered results - tracked_targets = self.tracker.update( - image, - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - filtered_texture_values, - ) - - # Get tracked 
results - tracked_masks, tracked_bboxes, tracked_target_ids, tracked_probs, tracked_names = ( - get_tracked_results(tracked_targets) # type: ignore[no-untyped-call] - ) - - if self.use_analyzer: - # Update analysis queue with tracked IDs - target_id_set = set(tracked_target_ids) - - # Remove untracked objects from object_names - all_target_ids = list(self.tracker.targets.keys()) - self.object_names = { - track_id: name - for track_id, name in self.object_names.items() - if track_id in all_target_ids - } - - # Remove untracked objects from queue and results - self.to_be_analyzed = deque( - [track_id for track_id in self.to_be_analyzed if track_id in target_id_set] - ) - - # Filter out any IDs being analyzed from the to_be_analyzed queue - if self.current_queue_ids: - self.to_be_analyzed = deque( - [ - tid - for tid in self.to_be_analyzed - if tid not in self.current_queue_ids - ] - ) - - # Add new track_ids to analysis queue - for track_id in tracked_target_ids: - if ( - track_id not in self.object_names - and track_id not in self.to_be_analyzed - ): - self.to_be_analyzed.append(track_id) - - return ( - tracked_masks, - tracked_bboxes, - tracked_target_ids, - tracked_probs, - tracked_names, - ) - else: - # When tracker disabled, just use the filtered results directly - if self.use_analyzer: - # Add unanalyzed IDs to the analysis queue - for track_id in filtered_track_ids: - if ( - track_id not in self.object_names - and track_id not in self.to_be_analyzed - ): - self.to_be_analyzed.append(track_id) - - # Simply return filtered results - return ( - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - ) - return [], [], [], [], [] - - def check_analysis_status(self, tracked_target_ids): # type: ignore[no-untyped-def] - """Check if analysis is complete and prepare new queue if needed.""" - if not self.use_analyzer: - return None, None - - current_time = time.time() - - # Check if current queue analysis is complete - if 
self.current_future and self.current_future.done(): - try: - results = self.current_future.result() - if results is not None: - # Map results to track IDs - object_list = eval(results) - for track_id, result in zip(self.current_queue_ids, object_list, strict=False): - self.object_names[track_id] = result - except Exception as e: - print(f"Queue analysis failed: {e}") - self.current_future = None - self.current_queue_ids = None - self.last_analysis_time = current_time - - # If enough time has passed and we have items to analyze, start new analysis - if ( - not self.current_future - and self.to_be_analyzed - and current_time - self.last_analysis_time >= self.min_analysis_interval - ): - queue_indices = [] - queue_ids = [] - - # Collect all valid track IDs from the queue - while self.to_be_analyzed: - track_id = self.to_be_analyzed[0] - if track_id in tracked_target_ids: - bbox_idx = tracked_target_ids.index(track_id) - queue_indices.append(bbox_idx) - queue_ids.append(track_id) - self.to_be_analyzed.popleft() - - if queue_indices: - return queue_indices, queue_ids - return None, None - - def run_analysis(self, frame, tracked_bboxes, tracked_target_ids) -> None: # type: ignore[no-untyped-def] - """Run queue image analysis in background.""" - if not self.use_analyzer: - return - - queue_indices, queue_ids = self.check_analysis_status(tracked_target_ids) # type: ignore[no-untyped-call] - if queue_indices: - selected_bboxes = [tracked_bboxes[i] for i in queue_indices] - cropped_images = crop_images_from_bboxes(frame, selected_bboxes) - if cropped_images: - self.current_queue_ids = queue_ids - print(f"Analyzing objects with track_ids: {queue_ids}") - - if self.use_rich_labeling: - prompt_type = "rich" - cropped_images.append(frame) - else: - prompt_type = "normal" - - self.current_future = self.analysis_executor.submit( # type: ignore[assignment] - self.image_analyzer.analyze_images, cropped_images, prompt_type=prompt_type - ) - - def get_object_names(self, track_ids, 
tracked_names: Sequence[str]): # type: ignore[no-untyped-def] - """Get object names for the given track IDs, falling back to tracked names.""" - if not self.use_analyzer: - return tracked_names - - return [ - self.object_names.get(track_id, tracked_name) - for track_id, tracked_name in zip(track_ids, tracked_names, strict=False) - ] - - def visualize_results( # type: ignore[no-untyped-def] - self, image, masks, bboxes, track_ids, probs: Sequence[float], names: Sequence[str] - ): - """Generate an overlay visualization with segmentation results and object names.""" - return plot_results(image, masks, bboxes, track_ids, probs, names) - - def cleanup(self) -> None: - """Cleanup resources.""" - if self.use_analyzer: - self.analysis_executor.shutdown() - - -def main() -> None: - # Example usage with different configurations - cap = cv2.VideoCapture(0) - - # Example 1: Full functionality with rich labeling - segmenter = Sam2DSegmenter( - min_analysis_interval=4.0, - use_tracker=True, - use_analyzer=True, - use_rich_labeling=True, # Enable rich labeling - ) - - # Example 2: Full functionality with normal labeling - # segmenter = Sam2DSegmenter(min_analysis_interval=4.0, use_tracker=True, use_analyzer=True) - - # Example 3: Tracker only (analyzer disabled) - # segmenter = Sam2DSegmenter(use_analyzer=False) - - # Example 4: Basic segmentation only (both tracker and analyzer disabled) - # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=False) - - # Example 5: Analyzer without tracker (new capability) - # segmenter = Sam2DSegmenter(use_tracker=False, use_analyzer=True) - - try: - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - break - - time.time() - - # Process image and get results - masks, bboxes, target_ids, probs, names = segmenter.process_image(frame) # type: ignore[no-untyped-call] - - # Run analysis if enabled - if segmenter.use_analyzer: - segmenter.run_analysis(frame, bboxes, target_ids) - names = segmenter.get_object_names(target_ids, 
names) - - # processing_time = time.time() - start_time - # print(f"Processing time: {processing_time:.2f}s") - - overlay = segmenter.visualize_results(frame, masks, bboxes, target_ids, probs, names) - - cv2.imshow("Segmentation", overlay) - key = cv2.waitKey(1) - if key & 0xFF == ord("q"): - break - - finally: - segmenter.cleanup() - cap.release() - cv2.destroyAllWindows() - - -if __name__ == "__main__": - main() diff --git a/dimos/perception/segmentation/test_sam_2d_seg.py b/dimos/perception/segmentation/test_sam_2d_seg.py deleted file mode 100644 index a9222ed2f2..0000000000 --- a/dimos/perception/segmentation/test_sam_2d_seg.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import time - -import numpy as np -import pytest -from reactivex import operators as ops - -from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter -from dimos.perception.segmentation.utils import extract_masks_bboxes_probs_names -from dimos.stream.video_provider import VideoProvider - - -@pytest.mark.heavy -class TestSam2DSegmenter: - def test_sam_segmenter_initialization(self) -> None: - """Test FastSAM segmenter initializes correctly with default model path.""" - try: - # Try to initialize with the default model path and existing device setting - segmenter = Sam2DSegmenter(use_analyzer=False) - assert segmenter is not None - assert segmenter.model is not None - except Exception as e: - # If the model file doesn't exist, the test should still pass with a warning - pytest.skip(f"Skipping test due to model initialization error: {e}") - - def test_sam_segmenter_process_image(self) -> None: - """Test FastSAM segmenter can process video frames and return segmentation masks.""" - # Import get data inside method to avoid pytest fixture confusion - from dimos.utils.data import get_data - - # Get test video path directly - video_path = get_data("assets") / "trimmed_video_office.mov" - try: - # Initialize segmenter without analyzer for faster testing - segmenter = Sam2DSegmenter(use_analyzer=False) - - # Note: conf and iou are parameters for process_image, not constructor - # We'll monkey patch the process_image method to use lower thresholds - - def patched_process_image(image): - results = segmenter.model.track( - source=image, - device=segmenter.device, - retina_masks=True, - conf=0.1, # Lower confidence threshold for testing - iou=0.5, # Lower IoU threshold - persist=True, - verbose=False, - tracker=segmenter.tracker_config - if hasattr(segmenter, "tracker_config") - else None, - ) - - if len(results) > 0: - masks, bboxes, track_ids, probs, names, _areas = ( - extract_masks_bboxes_probs_names(results[0]) - ) - return masks, bboxes, track_ids, 
probs, names - return [], [], [], [], [] - - # Replace the method - segmenter.process_image = patched_process_image - - # Create video provider and directly get a video stream observable - assert os.path.exists(video_path), f"Test video not found: {video_path}" - video_provider = VideoProvider(dev_name="test_video", video_source=video_path) - - video_stream = video_provider.capture_video_as_observable(realtime=False, fps=1) - - # Use ReactiveX operators to process the stream - def process_frame(frame): - try: - # Process frame with FastSAM - masks, bboxes, track_ids, probs, names = segmenter.process_image(frame) - print( - f"SAM results - masks: {len(masks)}, bboxes: {len(bboxes)}, track_ids: {len(track_ids)}, names: {len(names)}" - ) - - return { - "frame": frame, - "masks": masks, - "bboxes": bboxes, - "track_ids": track_ids, - "probs": probs, - "names": names, - } - except Exception as e: - print(f"Error in process_frame: {e}") - return {} - - # Create the segmentation stream using pipe and map operator - segmentation_stream = video_stream.pipe(ops.map(process_frame)) - - # Collect results from the stream - results = [] - frames_processed = 0 - target_frames = 5 - - def on_next(result) -> None: - nonlocal frames_processed, results - if not result: - return - - results.append(result) - frames_processed += 1 - - # Stop processing after target frames - if frames_processed >= target_frames: - subscription.dispose() - - def on_error(error) -> None: - pytest.fail(f"Error in segmentation stream: {error}") - - def on_completed() -> None: - pass - - # Subscribe and wait for results - subscription = segmentation_stream.subscribe( - on_next=on_next, on_error=on_error, on_completed=on_completed - ) - - # Wait for frames to be processed - timeout = 30.0 # seconds - start_time = time.time() - while frames_processed < target_frames and time.time() - start_time < timeout: - time.sleep(0.5) - - # Clean up subscription - subscription.dispose() - video_provider.dispose_all() - - # 
Check if we have results - if len(results) == 0: - pytest.skip( - "No segmentation results found, but test connection established correctly" - ) - return - - print(f"Processed {len(results)} frames with segmentation results") - - # Analyze the first result - result = results[0] - - # Check that we have a frame - assert "frame" in result, "Result doesn't contain a frame" - assert isinstance(result["frame"], np.ndarray), "Frame is not a numpy array" - - # Check that segmentation results are valid - assert isinstance(result["masks"], list) - assert isinstance(result["bboxes"], list) - assert isinstance(result["track_ids"], list) - assert isinstance(result["probs"], list) - assert isinstance(result["names"], list) - - # All result lists should be the same length - assert ( - len(result["masks"]) - == len(result["bboxes"]) - == len(result["track_ids"]) - == len(result["probs"]) - == len(result["names"]) - ) - - # If we have masks, check that they have valid shape - if result.get("masks") and len(result["masks"]) > 0: - assert result["masks"][0].shape == ( - result["frame"].shape[0], - result["frame"].shape[1], - ), "Mask shape should match image dimensions" - print(f"Found {len(result['masks'])} masks in first frame") - else: - print("No masks found in first frame, but test connection established correctly") - - # Test visualization function - if result["masks"]: - vis_frame = segmenter.visualize_results( - result["frame"], - result["masks"], - result["bboxes"], - result["track_ids"], - result["probs"], - result["names"], - ) - assert isinstance(vis_frame, np.ndarray), "Visualization output should be an image" - assert vis_frame.shape == result["frame"].shape, ( - "Visualization should have same dimensions as input frame" - ) - - # We've already tested visualization above, so no need for a duplicate test - - except Exception as e: - pytest.skip(f"Skipping test due to error: {e}") - - -if __name__ == "__main__": - pytest.main(["-v", __file__]) diff --git 
a/dimos/perception/segmentation/utils.py b/dimos/perception/segmentation/utils.py deleted file mode 100644 index a23a256ca2..0000000000 --- a/dimos/perception/segmentation/utils.py +++ /dev/null @@ -1,343 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Sequence - -import cv2 -import numpy as np -import torch - - -class SimpleTracker: - def __init__( - self, history_size: int = 100, min_count: int = 10, count_window: int = 20 - ) -> None: - """ - Simple temporal tracker that counts appearances in a fixed window. 
- :param history_size: Number of past frames to remember - :param min_count: Minimum number of appearances required - :param count_window: Number of latest frames to consider for counting - """ - self.history = [] # type: ignore[var-annotated] - self.history_size = history_size - self.min_count = min_count - self.count_window = count_window - self.total_counts = {} # type: ignore[var-annotated] - - def update(self, track_ids): # type: ignore[no-untyped-def] - # Add new frame's track IDs to history - self.history.append(track_ids) - if len(self.history) > self.history_size: - self.history.pop(0) - - # Consider only the latest `count_window` frames for counting - recent_history = self.history[-self.count_window :] - all_tracks = np.concatenate(recent_history) if recent_history else np.array([]) - - # Compute occurrences efficiently using numpy - unique_ids, counts = np.unique(all_tracks, return_counts=True) - id_counts = dict(zip(unique_ids, counts, strict=False)) - - # Update total counts but ensure it only contains IDs within the history size - total_tracked_ids = np.concatenate(self.history) if self.history else np.array([]) - unique_total_ids, total_counts = np.unique(total_tracked_ids, return_counts=True) - self.total_counts = dict(zip(unique_total_ids, total_counts, strict=False)) - - # Return IDs that appear often enough - return [track_id for track_id, count in id_counts.items() if count >= self.min_count] - - def get_total_counts(self): # type: ignore[no-untyped-def] - """Returns the total count of each tracking ID seen over time, limited to history size.""" - return self.total_counts - - -def extract_masks_bboxes_probs_names(result, max_size: float = 0.7): # type: ignore[no-untyped-def] - """ - Extracts masks, bounding boxes, probabilities, and class names from one Ultralytics result object. 
- - Parameters: - result: Ultralytics result object - max_size: float, maximum allowed size of object relative to image (0-1) - - Returns: - tuple: (masks, bboxes, track_ids, probs, names, areas) - """ - masks = [] # type: ignore[var-annotated] - bboxes = [] # type: ignore[var-annotated] - track_ids = [] # type: ignore[var-annotated] - probs = [] # type: ignore[var-annotated] - names = [] # type: ignore[var-annotated] - areas = [] # type: ignore[var-annotated] - - if result.masks is None: - return masks, bboxes, track_ids, probs, names, areas - - total_area = result.masks.orig_shape[0] * result.masks.orig_shape[1] - - for box, mask_data in zip(result.boxes, result.masks.data, strict=False): - mask_numpy = mask_data - - # Extract bounding box - x1, y1, x2, y2 = box.xyxy[0].tolist() - - # Extract track_id if available - track_id = -1 # default if no tracking - if hasattr(box, "id") and box.id is not None: - track_id = int(box.id[0].item()) - - # Extract probability and class index - conf = float(box.conf[0]) - cls_idx = int(box.cls[0]) - area = (x2 - x1) * (y2 - y1) - - if area / total_area > max_size: - continue - - masks.append(mask_numpy) - bboxes.append([x1, y1, x2, y2]) - track_ids.append(track_id) - probs.append(conf) - names.append(result.names[cls_idx]) - areas.append(area) - - return masks, bboxes, track_ids, probs, names, areas - - -def compute_texture_map(frame, blur_size: int = 3): # type: ignore[no-untyped-def] - """ - Compute texture map using gradient statistics. - Returns high values for textured regions and low values for smooth regions. 
- - Parameters: - frame: BGR image - blur_size: Size of Gaussian blur kernel for pre-processing - - Returns: - numpy array: Texture map with values normalized to [0,1] - """ - # Convert to grayscale - if len(frame.shape) == 3: - gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) - else: - gray = frame - - # Pre-process with slight blur to reduce noise - if blur_size > 0: - gray = cv2.GaussianBlur(gray, (blur_size, blur_size), 0) - - # Compute gradients in x and y directions - grad_x = cv2.Sobel(gray, cv2.CV_32F, 1, 0, ksize=3) - grad_y = cv2.Sobel(gray, cv2.CV_32F, 0, 1, ksize=3) - - # Compute gradient magnitude and direction - magnitude = np.sqrt(grad_x**2 + grad_y**2) - - # Compute local standard deviation of gradient magnitude - texture_map = cv2.GaussianBlur(magnitude, (15, 15), 0) - - # Normalize to [0,1] - texture_map = (texture_map - texture_map.min()) / (texture_map.max() - texture_map.min() + 1e-8) - - return texture_map - - -def filter_segmentation_results( # type: ignore[no-untyped-def] - frame, - masks, - bboxes, - track_ids, - probs: Sequence[float], - names: Sequence[str], - areas, - texture_threshold: float = 0.07, - size_filter: int = 800, -): - """ - Filters segmentation results using both overlap and saliency detection. - Uses mask_sum tensor for efficient overlap detection. 
- - Parameters: - masks: list of torch.Tensor containing mask data - bboxes: list of bounding boxes [x1, y1, x2, y2] - track_ids: list of tracking IDs - probs: list of confidence scores - names: list of class names - areas: list of object areas - frame: BGR image for computing saliency - texture_threshold: Average texture value required for mask to be kept - size_filter: Minimum size of the object to be kept - - Returns: - tuple: (filtered_masks, filtered_bboxes, filtered_track_ids, filtered_probs, filtered_names, filtered_texture_values, texture_map) - """ - if len(masks) <= 1: - return masks, bboxes, track_ids, probs, names, [] - - # Compute texture map once and convert to tensor - texture_map = compute_texture_map(frame) - - # Sort by area (smallest to largest) - sorted_indices = torch.tensor(areas).argsort(descending=False) - - device = masks[0].device # Get the device of the first mask - - # Create mask_sum tensor where each pixel stores the index of the mask that claims it - mask_sum = torch.zeros_like(masks[0], dtype=torch.int32) - - texture_map = torch.from_numpy(texture_map).to( - device - ) # Convert texture_map to tensor and move to device - - filtered_texture_values = [] # List to store texture values of filtered masks - - for i, idx in enumerate(sorted_indices): - mask = masks[idx] - # Compute average texture value within mask - texture_value = torch.mean(texture_map[mask > 0]) if torch.any(mask > 0) else 0 - - # Only claim pixels if mask passes texture threshold - if texture_value >= texture_threshold: - mask_sum[mask > 0] = i - filtered_texture_values.append( - texture_value.item() # type: ignore[union-attr] - ) # Store the texture value as a Python float - - # Get indices that appear in mask_sum (these are the masks we want to keep) - keep_indices, counts = torch.unique(mask_sum[mask_sum > 0], return_counts=True) - size_indices = counts > size_filter - keep_indices = keep_indices[size_indices] - - sorted_indices = sorted_indices.cpu() - keep_indices 
= keep_indices.cpu() - - # Map back to original indices and filter - final_indices = sorted_indices[keep_indices].tolist() - - filtered_masks = [masks[i] for i in final_indices] - filtered_bboxes = [bboxes[i] for i in final_indices] - filtered_track_ids = [track_ids[i] for i in final_indices] - filtered_probs = [probs[i] for i in final_indices] - filtered_names = [names[i] for i in final_indices] - - return ( - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - filtered_texture_values, - ) - - -def plot_results( # type: ignore[no-untyped-def] - image, - masks, - bboxes, - track_ids, - probs: Sequence[float], - names: Sequence[str], - alpha: float = 0.5, -): - """ - Draws bounding boxes, masks, and labels on the given image with enhanced visualization. - Includes object names in the overlay and improved text visibility. - """ - h, w = image.shape[:2] - overlay = image.copy() - - for mask, bbox, track_id, prob, name in zip( - masks, bboxes, track_ids, probs, names, strict=False - ): - # Convert mask tensor to numpy if needed - if isinstance(mask, torch.Tensor): - mask = mask.cpu().numpy() - - # Ensure mask is in proper format for OpenCV resize - if mask.dtype == bool: - mask = mask.astype(np.uint8) - elif mask.dtype != np.uint8 and mask.dtype != np.float32: - mask = mask.astype(np.float32) - - mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_LINEAR) - - # Generate consistent color based on track_id - if track_id != -1: - np.random.seed(track_id) - color = np.random.randint(0, 255, (3,), dtype=np.uint8) - np.random.seed(None) - else: - color = np.random.randint(0, 255, (3,), dtype=np.uint8) - - # Apply mask color - overlay[mask_resized > 0.5] = color - - # Draw bounding box - x1, y1, x2, y2 = map(int, bbox) - cv2.rectangle(overlay, (x1, y1), (x2, y2), color.tolist(), 2) - - # Prepare label text - label = f"ID:{track_id} {prob:.2f}" - if name: # Add object name if available - label += f" {name}" - - # Calculate 
text size for background rectangle - (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - - # Draw background rectangle for text - cv2.rectangle(overlay, (x1, y1 - text_h - 8), (x1 + text_w + 4, y1), color.tolist(), -1) - - # Draw text with white color for better visibility - cv2.putText( - overlay, - label, - (x1 + 2, y1 - 5), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (255, 255, 255), # White text - 1, - ) - - # Blend overlay with original image - result = cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0) - return result - - -def crop_images_from_bboxes(image, bboxes, buffer: int = 0): # type: ignore[no-untyped-def] - """ - Crops regions from an image based on bounding boxes with an optional buffer. - - Parameters: - image (numpy array): Input image. - bboxes (list of lists): List of bounding boxes [x1, y1, x2, y2]. - buffer (int): Number of pixels to expand each bounding box. - - Returns: - list of numpy arrays: Cropped image regions. - """ - height, width, _ = image.shape - cropped_images = [] - - for bbox in bboxes: - x1, y1, x2, y2 = bbox - - # Apply buffer - x1 = max(0, x1 - buffer) - y1 = max(0, y1 - buffer) - x2 = min(width, x2 + buffer) - y2 = min(height, y2 + buffer) - - cropped_image = image[int(y1) : int(y2), int(x1) : int(x2)] - cropped_images.append(cropped_image) - - return cropped_images diff --git a/dimos/robot/agilex/README.md b/dimos/robot/agilex/README.md deleted file mode 100644 index 8342a6045e..0000000000 --- a/dimos/robot/agilex/README.md +++ /dev/null @@ -1,371 +0,0 @@ -# DIMOS Manipulator Robot Development Guide - -This guide explains how to create robot classes, integrate agents, and use the DIMOS module system with LCM transport. - -## Table of Contents -1. [Robot Class Architecture](#robot-class-architecture) -2. [Module System & LCM Transport](#module-system--lcm-transport) -3. [Agent Integration](#agent-integration) -4. 
[Complete Example](#complete-example) - -## Robot Class Architecture - -### Basic Robot Class Structure - -A DIMOS robot class should follow this pattern: - -```python -from typing import Optional, List -from dimos import core -from dimos.types.robot_capabilities import RobotCapability - -class YourRobot: - """Your robot implementation.""" - - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - # Core components - self.dimos = None - self.modules = {} - self.skill_library = SkillLibrary() - - # Define capabilities - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - """Start the robot modules.""" - # Initialize DIMOS with worker count - self.dimos = core.start(2) # Number of workers needed - - # Deploy modules - # ... (see Module System section) - - def stop(self): - """Stop all modules and clean up.""" - # Stop modules - # Close DIMOS - if self.dimos: - self.dimos.close() -``` - -### Key Components Explained - -1. **Initialization**: Store references to modules, skills, and capabilities -2. **Async Start**: Modules must be deployed asynchronously -3. **Proper Cleanup**: Always stop modules before closing DIMOS - -## Module System & LCM Transport - -### Understanding DIMOS Modules - -Modules are the building blocks of DIMOS robots. 
They: -- Process data streams (inputs) -- Produce outputs -- Can be connected together -- Communicate via LCM (Lightweight Communications and Marshalling) - -### Deploying a Module - -```python -# Deploy a camera module -self.camera = self.dimos.deploy( - ZEDModule, # Module class - camera_id=0, # Module parameters - resolution="HD720", - depth_mode="NEURAL", - fps=30, - publish_rate=30.0, - frame_id="camera_frame" -) -``` - -### Setting Up LCM Transport - -LCM transport enables inter-module communication: - -```python -# Enable LCM auto-configuration -from dimos.protocol import pubsub -pubsub.lcm.autoconf() - -# Configure output transport -self.camera.color_image.transport = core.LCMTransport( - "/camera/color_image", # Topic name - Image # Message type -) -self.camera.depth_image.transport = core.LCMTransport( - "/camera/depth_image", - Image -) -``` - -### Connecting Modules - -Connect module outputs to inputs: - -```python -# Connect manipulation module to camera outputs -self.manipulation.rgb_image.connect(self.camera.color_image) -self.manipulation.depth_image.connect(self.camera.depth_image) -self.manipulation.camera_info.connect(self.camera.camera_info) -``` - -### Module Communication Pattern - -``` -┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐ -│ Camera │────────▶│ Manipulation │────────▶│ Visualization│ -│ Module │ Messages│ Module │ Messages│ Output │ -└──────────────┘ └────────────────┘ └──────────────┘ - ▲ ▲ - │ │ - └──────────────────────────┘ - Direct Connection via RPC call -``` - -## Agent Integration - -### Setting Up Agent with Robot - -The run file pattern for agent integration: - -```python -#!/usr/bin/env python3 -import asyncio -import reactivex as rx -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.web.robot_web_interface import RobotWebInterface - -def main(): - # 1. Create and start robot - robot = YourRobot() - asyncio.run(robot.start()) - - # 2. 
Set up skills - skills = robot.get_skills() - skills.add(YourSkill) - skills.create_instance("YourSkill", robot=robot) - - # 3. Set up reactive streams - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # 4. Create web interface - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream}, - audio_subject=rx.subject.Subject() - ) - - # 5. Create agent - agent = ClaudeAgent( - dev_name="your_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query="Your system prompt here", - model_name="claude-3-5-haiku-latest" - ) - - # 6. Connect agent responses - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - # 7. Run interface - web_interface.run() -``` - -### Key Integration Points - -1. **Reactive Streams**: Use RxPy for event-driven communication -2. **Web Interface**: Provides user input/output -3. **Agent**: Processes natural language and executes skills -4. 
**Skills**: Define robot capabilities as executable actions - -## Complete Example - -### Step 1: Create Robot Class (`my_robot.py`) - -```python -import asyncio -from typing import Optional, List -from dimos import core -from dimos.hardware.camera import CameraModule -from dimos.manipulation.module import ManipulationModule -from dimos.skills.skills import SkillLibrary -from dimos.types.robot_capabilities import RobotCapability -from dimos_lcm.sensor_msgs import Image, CameraInfo -from dimos.protocol import pubsub - -class MyRobot: - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - self.dimos = None - self.camera = None - self.manipulation = None - self.skill_library = SkillLibrary() - - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - # Start DIMOS - self.dimos = core.start(2) - - # Enable LCM - pubsub.lcm.autoconf() - - # Deploy camera - self.camera = self.dimos.deploy( - CameraModule, - camera_id=0, - fps=30 - ) - - # Configure camera LCM - self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image) - self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image) - self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo) - - # Deploy manipulation - self.manipulation = self.dimos.deploy(ManipulationModule) - - # Connect modules - self.manipulation.rgb_image.connect(self.camera.color_image) - self.manipulation.depth_image.connect(self.camera.depth_image) - self.manipulation.camera_info.connect(self.camera.camera_info) - - # Configure manipulation output - self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image) - - # Start modules - self.camera.start() - self.manipulation.start() - - await asyncio.sleep(2) # Allow initialization - - def get_skills(self): - return self.skill_library - - def stop(self): - if self.manipulation: - self.manipulation.stop() - if self.camera: 
- self.camera.stop() - if self.dimos: - self.dimos.close() -``` - -### Step 2: Create Run Script (`run.py`) - -```python -#!/usr/bin/env python3 -import asyncio -import os -from my_robot import MyRobot -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.skills.basic import BasicSkill -from dimos.web.robot_web_interface import RobotWebInterface -import reactivex as rx -import reactivex.operators as ops - -SYSTEM_PROMPT = """You are a helpful robot assistant.""" - -def main(): - # Check API key - if not os.getenv("ANTHROPIC_API_KEY"): - print("Please set ANTHROPIC_API_KEY") - return - - # Create robot - robot = MyRobot() - - try: - # Start robot - asyncio.run(robot.start()) - - # Set up skills - skills = robot.get_skills() - skills.add(BasicSkill) - skills.create_instance("BasicSkill", robot=robot) - - # Set up streams - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # Create web interface - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream} - ) - - # Create agent - agent = ClaudeAgent( - dev_name="my_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query=SYSTEM_PROMPT - ) - - # Connect responses - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - print("Robot ready at http://localhost:5555") - - # Run - web_interface.run() - - finally: - robot.stop() - -if __name__ == "__main__": - main() -``` - -### Step 3: Define Skills (`skills.py`) - -```python -from dimos.skills import Skill, skill - -@skill( - description="Perform a basic action", - parameters={ - "action": "The action to perform" - } -) -class BasicSkill(Skill): - def __init__(self, robot): - self.robot = robot - - def run(self, action: str): - # Implement skill logic - return f"Performed: {action}" -``` - -## Best Practices - -1. 
**Module Lifecycle**: Always start DIMOS before deploying modules -2. **LCM Topics**: Use descriptive topic names with namespaces -3. **Error Handling**: Wrap module operations in try-except blocks -4. **Resource Cleanup**: Ensure proper cleanup in stop() methods -5. **Async Operations**: Use asyncio for non-blocking operations -6. **Stream Management**: Use RxPy for reactive programming patterns - -## Debugging Tips - -1. **Check Module Status**: Print module.io().result() to see connections -2. **Monitor LCM**: Use Foxglove to visualize LCM messages -3. **Log Everything**: Use dimos.utils.logging_config.setup_logger() -4. **Test Modules Independently**: Deploy and test one module at a time - -## Common Issues - -1. **"Module not started"**: Ensure start() is called after deployment -2. **"No data received"**: Check LCM transport configuration -3. **"Connection failed"**: Verify input/output types match -4. **"Cleanup errors"**: Stop modules before closing DIMOS diff --git a/dimos/robot/agilex/README_CN.md b/dimos/robot/agilex/README_CN.md deleted file mode 100644 index a8d79ebec1..0000000000 --- a/dimos/robot/agilex/README_CN.md +++ /dev/null @@ -1,465 +0,0 @@ -# DIMOS 机械臂机器人开发指南 - -本指南介绍如何创建机器人类、集成智能体(Agent)以及使用 DIMOS 模块系统和 LCM 传输。 - -## 目录 -1. [机器人类架构](#机器人类架构) -2. [模块系统与 LCM 传输](#模块系统与-lcm-传输) -3. [智能体集成](#智能体集成) -4. [完整示例](#完整示例) - -## 机器人类架构 - -### 基本机器人类结构 - -DIMOS 机器人类应遵循以下模式: - -```python -from typing import Optional, List -from dimos import core -from dimos.types.robot_capabilities import RobotCapability - -class YourRobot: - """您的机器人实现。""" - - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - # 核心组件 - self.dimos = None - self.modules = {} - self.skill_library = SkillLibrary() - - # 定义能力 - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - """启动机器人模块。""" - # 初始化 DIMOS,指定工作线程数 - self.dimos = core.start(2) # 需要的工作线程数 - - # 部署模块 - # ... 
(参见模块系统章节) - - def stop(self): - """停止所有模块并清理资源。""" - # 停止模块 - # 关闭 DIMOS - if self.dimos: - self.dimos.close() -``` - -### 关键组件说明 - -1. **初始化**:存储模块、技能和能力的引用 -2. **异步启动**:模块必须异步部署 -3. **正确清理**:在关闭 DIMOS 之前始终停止模块 - -## 模块系统与 LCM 传输 - -### 理解 DIMOS 模块 - -模块是 DIMOS 机器人的构建块。它们: -- 处理数据流(输入) -- 产生输出 -- 可以相互连接 -- 通过 LCM(轻量级通信和编组)进行通信 - -### 部署模块 - -```python -# 部署相机模块 -self.camera = self.dimos.deploy( - ZEDModule, # 模块类 - camera_id=0, # 模块参数 - resolution="HD720", - depth_mode="NEURAL", - fps=30, - publish_rate=30.0, - frame_id="camera_frame" -) -``` - -### 设置 LCM 传输 - -LCM 传输实现模块间通信: - -```python -# 启用 LCM 自动配置 -from dimos.protocol import pubsub -pubsub.lcm.autoconf() - -# 配置输出传输 -self.camera.color_image.transport = core.LCMTransport( - "/camera/color_image", # 主题名称 - Image # 消息类型 -) -self.camera.depth_image.transport = core.LCMTransport( - "/camera/depth_image", - Image -) -``` - -### 连接模块 - -将模块输出连接到输入: - -```python -# 将操作模块连接到相机输出 -self.manipulation.rgb_image.connect(self.camera.color_image) # ROS set_callback -self.manipulation.depth_image.connect(self.camera.depth_image) -self.manipulation.camera_info.connect(self.camera.camera_info) -``` - -### 模块通信模式 - -``` -┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐ -│ 相机模块 │────────▶│ 操作模块 │────────▶│ 可视化输出 │ -│ │ 消息 │ │ 消息 │ │ -└──────────────┘ └────────────────┘ └──────────────┘ - ▲ ▲ - │ │ - └──────────────────────────┘ - 直接连接(RPC指令) -``` - -## 智能体集成 - -### 设置智能体与机器人 - -运行文件的智能体集成模式: - -```python -#!/usr/bin/env python3 -import asyncio -import reactivex as rx -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.web.robot_web_interface import RobotWebInterface - -def main(): - # 1. 创建并启动机器人 - robot = YourRobot() - asyncio.run(robot.start()) - - # 2. 设置技能 - skills = robot.get_skills() - skills.add(YourSkill) - skills.create_instance("YourSkill", robot=robot) - - # 3. 
设置响应式流 - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # 4. 创建 Web 界面 - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream}, - audio_subject=rx.subject.Subject() - ) - - # 5. 创建智能体 - agent = ClaudeAgent( - dev_name="your_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query="您的系统提示词", - model_name="claude-3-5-haiku-latest" - ) - - # 6. 连接智能体响应 - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - # 7. 运行界面 - web_interface.run() -``` - -### 关键集成点 - -1. **响应式流**:使用 RxPy 进行事件驱动通信 -2. **Web 界面**:提供用户输入/输出 -3. **智能体**:处理自然语言并执行技能 -4. **技能**:将机器人能力定义为可执行动作 - -## 完整示例 - -### 步骤 1:创建机器人类(`my_robot.py`) - -```python -import asyncio -from typing import Optional, List -from dimos import core -from dimos.hardware.camera import CameraModule -from dimos.manipulation.module import ManipulationModule -from dimos.skills.skills import SkillLibrary -from dimos.types.robot_capabilities import RobotCapability -from dimos_lcm.sensor_msgs import Image, CameraInfo -from dimos.protocol import pubsub - -class MyRobot: - def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): - self.dimos = None - self.camera = None - self.manipulation = None - self.skill_library = SkillLibrary() - - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self): - # 启动 DIMOS - self.dimos = core.start(2) - - # 启用 LCM - pubsub.lcm.autoconf() - - # 部署相机 - self.camera = self.dimos.deploy( - CameraModule, - camera_id=0, - fps=30 - ) - - # 配置相机 LCM - self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image) - self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image) - self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo) - - # 部署操作模块 - self.manipulation = 
self.dimos.deploy(ManipulationModule) - - # 连接模块 - self.manipulation.rgb_image.connect(self.camera.color_image) - self.manipulation.depth_image.connect(self.camera.depth_image) - self.manipulation.camera_info.connect(self.camera.camera_info) - - # 配置操作输出 - self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image) - - # 启动模块 - self.camera.start() - self.manipulation.start() - - await asyncio.sleep(2) # 允许初始化 - - def get_skills(self): - return self.skill_library - - def stop(self): - if self.manipulation: - self.manipulation.stop() - if self.camera: - self.camera.stop() - if self.dimos: - self.dimos.close() -``` - -### 步骤 2:创建运行脚本(`run.py`) - -```python -#!/usr/bin/env python3 -import asyncio -import os -from my_robot import MyRobot -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.skills.basic import BasicSkill -from dimos.web.robot_web_interface import RobotWebInterface -import reactivex as rx -import reactivex.operators as ops - -SYSTEM_PROMPT = """您是一个有用的机器人助手。""" - -def main(): - # 检查 API 密钥 - if not os.getenv("ANTHROPIC_API_KEY"): - print("请设置 ANTHROPIC_API_KEY") - return - - # 创建机器人 - robot = MyRobot() - - try: - # 启动机器人 - asyncio.run(robot.start()) - - # 设置技能 - skills = robot.get_skills() - skills.add(BasicSkill) - skills.create_instance("BasicSkill", robot=robot) - - # 设置流 - agent_response_subject = rx.subject.Subject() - agent_response_stream = agent_response_subject.pipe(ops.share()) - - # 创建 Web 界面 - web_interface = RobotWebInterface( - port=5555, - text_streams={"agent_responses": agent_response_stream} - ) - - # 创建智能体 - agent = ClaudeAgent( - dev_name="my_agent", - input_query_stream=web_interface.query_stream, - skills=skills, - system_query=SYSTEM_PROMPT - ) - - # 连接响应 - agent.get_response_observable().subscribe( - lambda x: agent_response_subject.on_next(x) - ) - - print("机器人就绪,访问 http://localhost:5555") - - # 运行 - web_interface.run() - - finally: - robot.stop() - -if __name__ == "__main__": - main() -``` 
- -### 步骤 3:定义技能(`skills.py`) - -```python -from dimos.skills import Skill, skill - -@skill( - description="执行一个基本动作", - parameters={ - "action": "要执行的动作" - } -) -class BasicSkill(Skill): - def __init__(self, robot): - self.robot = robot - - def run(self, action: str): - # 实现技能逻辑 - return f"已执行:{action}" -``` - -## 最佳实践 - -1. **模块生命周期**:在部署模块之前始终先启动 DIMOS -2. **LCM 主题**:使用带命名空间的描述性主题名称 -3. **错误处理**:用 try-except 块包装模块操作 -4. **资源清理**:确保在 stop() 方法中正确清理 -5. **异步操作**:使用 asyncio 进行非阻塞操作 -6. **流管理**:使用 RxPy 进行响应式编程模式 - -## 调试技巧 - -1. **检查模块状态**:打印 module.io().result() 查看连接 -2. **监控 LCM**:使用 Foxglove 可视化 LCM 消息 -3. **记录一切**:使用 dimos.utils.logging_config.setup_logger() -4. **独立测试模块**:一次部署和测试一个模块 - -## 常见问题 - -1. **"模块未启动"**:确保在部署后调用 start() -2. **"未收到数据"**:检查 LCM 传输配置 -3. **"连接失败"**:验证输入/输出类型是否匹配 -4. **"清理错误"**:在关闭 DIMOS 之前停止模块 - -## 高级主题 - -### 自定义模块开发 - -创建自定义模块的基本结构: - -```python -from dimos.core import Module, In, Out, rpc - -class CustomModule(Module): - # 定义输入 - input_data: In[DataType] - - # 定义输出 - output_data: Out[DataType] - - def __init__(self, param1, param2, **kwargs): - super().__init__(**kwargs) - self.param1 = param1 - self.param2 = param2 - - @rpc - def start(self): - """启动模块处理。""" - self.input_data.subscribe(self._process_data) - - def _process_data(self, data): - """处理输入数据。""" - # 处理逻辑 - result = self.process(data) - # 发布输出 - self.output_data.publish(result) - - @rpc - def stop(self): - """停止模块。""" - # 清理资源 - pass -``` - -### 技能开发指南 - -技能是机器人可执行的高级动作: - -```python -from dimos.skills import Skill, skill -from typing import Optional - -@skill( - description="复杂操作技能", - parameters={ - "target": "目标对象", - "location": "目标位置" - } -) -class ComplexSkill(Skill): - def __init__(self, robot, **kwargs): - super().__init__(**kwargs) - self.robot = robot - - def run(self, target: str, location: Optional[str] = None): - """执行技能逻辑。""" - try: - # 1. 感知阶段 - object_info = self.robot.detect_object(target) - - # 2. 
规划阶段 - if location: - plan = self.robot.plan_movement(object_info, location) - - # 3. 执行阶段 - result = self.robot.execute_plan(plan) - - return { - "success": True, - "message": f"成功移动 {target} 到 {location}" - } - - except Exception as e: - return { - "success": False, - "error": str(e) - } -``` - -### 性能优化 - -1. **并行处理**:使用多个工作线程处理不同模块 -2. **数据缓冲**:为高频数据流实现缓冲机制 -3. **延迟加载**:仅在需要时初始化重型模块 -4. **资源池化**:重用昂贵的资源(如神经网络模型) - -希望本指南能帮助您快速上手 DIMOS 机器人开发! diff --git a/dimos/robot/agilex/piper_arm.py b/dimos/robot/agilex/piper_arm.py deleted file mode 100644 index 29624b9a4c..0000000000 --- a/dimos/robot/agilex/piper_arm.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio - -# Import LCM message types -from dimos_lcm.sensor_msgs import CameraInfo - -from dimos import core -from dimos.hardware.sensors.camera.zed import ZEDModule -from dimos.manipulation.visual_servoing.manipulation_module import ManipulationModule -from dimos.msgs.sensor_msgs import Image -from dimos.protocol import pubsub -from dimos.robot.foxglove_bridge import FoxgloveBridge -from dimos.robot.robot import Robot -from dimos.skills.skills import SkillLibrary -from dimos.types.robot_capabilities import RobotCapability -from dimos.utils.logging_config import setup_logger - -logger = setup_logger() - - -class PiperArmRobot(Robot): - """Piper Arm robot with ZED camera and manipulation capabilities.""" - - def __init__(self, robot_capabilities: list[RobotCapability] | None = None) -> None: - super().__init__() - self.dimos = None - self.stereo_camera = None - self.manipulation_interface = None - self.skill_library = SkillLibrary() # type: ignore[assignment] - - # Initialize capabilities - self.capabilities = robot_capabilities or [ - RobotCapability.VISION, - RobotCapability.MANIPULATION, - ] - - async def start(self) -> None: - """Start the robot modules.""" - # Start Dimos - self.dimos = core.start(2) # type: ignore[assignment] # Need 2 workers for ZED and manipulation modules - self.foxglove_bridge = FoxgloveBridge() - - # Enable LCM auto-configuration - pubsub.lcm.autoconf() # type: ignore[attr-defined] - - # Deploy ZED module - logger.info("Deploying ZED module...") - self.stereo_camera = self.dimos.deploy( # type: ignore[attr-defined] - ZEDModule, - camera_id=0, - resolution="HD720", - depth_mode="NEURAL", - fps=30, - enable_tracking=False, # We don't need tracking for manipulation - publish_rate=30.0, - frame_id="zed_camera", - ) - - # Configure ZED LCM transports - self.stereo_camera.color_image.transport = core.LCMTransport("/zed/color_image", Image) # type: ignore[attr-defined] - self.stereo_camera.depth_image.transport = 
core.LCMTransport("/zed/depth_image", Image) # type: ignore[attr-defined] - self.stereo_camera.camera_info.transport = core.LCMTransport("/zed/camera_info", CameraInfo) # type: ignore[attr-defined] - - # Deploy manipulation module - logger.info("Deploying manipulation module...") - self.manipulation_interface = self.dimos.deploy(ManipulationModule) # type: ignore[attr-defined] - - # Connect manipulation inputs to ZED outputs - self.manipulation_interface.rgb_image.connect(self.stereo_camera.color_image) # type: ignore[attr-defined] - self.manipulation_interface.depth_image.connect(self.stereo_camera.depth_image) # type: ignore[attr-defined] - self.manipulation_interface.camera_info.connect(self.stereo_camera.camera_info) # type: ignore[attr-defined] - - # Configure manipulation output - self.manipulation_interface.viz_image.transport = core.LCMTransport( # type: ignore[attr-defined] - "/manipulation/viz", Image - ) - - # Print module info - logger.info("Modules configured:") - print("\nZED Module:") - print(self.stereo_camera.io()) # type: ignore[attr-defined] - print("\nManipulation Module:") - print(self.manipulation_interface.io()) # type: ignore[attr-defined] - - # Start modules - logger.info("Starting modules...") - self.foxglove_bridge.start() - self.stereo_camera.start() # type: ignore[attr-defined] - self.manipulation_interface.start() # type: ignore[attr-defined] - - # Give modules time to initialize - await asyncio.sleep(2) - - logger.info("PiperArmRobot initialized and started") - - def pick_and_place( # type: ignore[no-untyped-def] - self, pick_x: int, pick_y: int, place_x: int | None = None, place_y: int | None = None - ): - """Execute pick and place task. 
- - Args: - pick_x: X coordinate for pick location - pick_y: Y coordinate for pick location - place_x: X coordinate for place location (optional) - place_y: Y coordinate for place location (optional) - - Returns: - Result of the pick and place operation - """ - if self.manipulation_interface: - return self.manipulation_interface.pick_and_place(pick_x, pick_y, place_x, place_y) - else: - logger.error("Manipulation module not initialized") - return False - - def handle_keyboard_command(self, key: str): # type: ignore[no-untyped-def] - """Pass keyboard commands to manipulation module. - - Args: - key: Keyboard key pressed - - Returns: - Action taken or None - """ - if self.manipulation_interface: - return self.manipulation_interface.handle_keyboard_command(key) - else: - logger.error("Manipulation module not initialized") - return None - - def stop(self) -> None: - """Stop all modules and clean up.""" - logger.info("Stopping PiperArmRobot...") - - try: - if self.manipulation_interface: - self.manipulation_interface.stop() - - if self.stereo_camera: - self.stereo_camera.stop() - except Exception as e: - logger.warning(f"Error stopping modules: {e}") - - # Close dimos last to ensure workers are available for cleanup - if self.dimos: - self.dimos.close() - - logger.info("PiperArmRobot stopped") - - -async def run_piper_arm() -> None: - """Run the Piper Arm robot.""" - robot = PiperArmRobot() # type: ignore[abstract] - - await robot.start() - - # Keep the robot running - try: - while True: - await asyncio.sleep(1) - except KeyboardInterrupt: - logger.info("Keyboard interrupt received") - finally: - await robot.stop() # type: ignore[func-returns-value] - - -if __name__ == "__main__": - asyncio.run(run_piper_arm()) diff --git a/dimos/robot/agilex/run.py b/dimos/robot/agilex/run.py deleted file mode 100644 index 64e0ae5470..0000000000 --- a/dimos/robot/agilex/run.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025-2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Run script for Piper Arm robot with Claude agent integration. -Provides manipulation capabilities with natural language interface. -""" - -import asyncio -import os -import sys - -from dotenv import load_dotenv -import reactivex as rx -import reactivex.operators as ops - -from dimos.agents_deprecated.claude_agent import ClaudeAgent -from dimos.robot.agilex.piper_arm import PiperArmRobot -from dimos.skills.kill_skill import KillSkill -from dimos.skills.manipulation.pick_and_place import PickAndPlace -from dimos.stream.audio.pipelines import stt, tts -from dimos.utils.logging_config import setup_logger -from dimos.web.robot_web_interface import RobotWebInterface - -logger = setup_logger() - -# Load environment variables -load_dotenv() - -# System prompt for the Piper Arm manipulation agent -SYSTEM_PROMPT = """You are an intelligent robotic assistant controlling a Piper Arm robot with advanced manipulation capabilities. Your primary role is to help users with pick and place tasks using natural language understanding. - -## Your Capabilities: -1. **Visual Perception**: You have access to a ZED stereo camera that provides RGB and depth information -2. **Object Manipulation**: You can pick up and place objects using a 6-DOF robotic arm with a gripper -3. 
**Language Understanding**: You use the Qwen vision-language model to identify objects and locations from natural language descriptions - -## Available Skills: -- **PickAndPlace**: Execute pick and place operations based on object and location descriptions - - Pick only: "Pick up the red mug" - - Pick and place: "Move the book to the shelf" -- **KillSkill**: Stop any currently running skill - -## Guidelines: -1. **Safety First**: Always ensure safe operation. If unsure about an object's graspability or a placement location's stability, ask for clarification -2. **Clear Communication**: Explain what you're doing and ask for confirmation when needed -3. **Error Handling**: If a task fails, explain why and suggest alternatives -4. **Precision**: When users give specific object descriptions, use them exactly as provided to the vision model - -## Interaction Examples: -- User: "Pick up the coffee mug" - You: "I'll pick up the coffee mug for you." [Execute PickAndPlace with object_query="coffee mug"] - -- User: "Put the toy on the table" - You: "I'll place the toy on the table." [Execute PickAndPlace with object_query="toy", target_query="on the table"] - -- User: "What do you see?" - -Remember: You're here to assist with manipulation tasks. 
Be helpful, precise, and always prioritize safe operation of the robot.""" - - -def main(): # type: ignore[no-untyped-def] - """Main entry point.""" - print("\n" + "=" * 60) - print("Piper Arm Robot with Claude Agent") - print("=" * 60) - print("\nThis system integrates:") - print(" - Piper Arm 6-DOF robot") - print(" - ZED stereo camera") - print(" - Claude AI for natural language understanding") - print(" - Qwen VLM for visual object detection") - print(" - Web interface with text and voice input") - print(" - Foxglove visualization via LCM") - print("\nStarting system...\n") - - # Check for API key - if not os.getenv("ANTHROPIC_API_KEY"): - print("WARNING: ANTHROPIC_API_KEY not found in environment") - print("Please set your API key in .env file or environment") - sys.exit(1) - - logger.info("Starting Piper Arm Robot with Agent") - - # Create robot instance - robot = PiperArmRobot() # type: ignore[abstract] - - try: - # Start the robot (this is async, so we need asyncio.run) - logger.info("Initializing robot...") - asyncio.run(robot.start()) - logger.info("Robot initialized successfully") - - # Set up skill library - skills = robot.get_skills() # type: ignore[no-untyped-call] - skills.add(PickAndPlace) - skills.add(KillSkill) - - # Create skill instances - skills.create_instance("PickAndPlace", robot=robot) - skills.create_instance("KillSkill", robot=robot, skill_library=skills) - - logger.info(f"Skills registered: {[skill.__name__ for skill in skills.get_class_skills()]}") - - # Set up streams for agent and web interface - agent_response_subject = rx.subject.Subject() # type: ignore[var-annotated] - agent_response_stream = agent_response_subject.pipe(ops.share()) - audio_subject = rx.subject.Subject() # type: ignore[var-annotated] - - # Set up streams for web interface - streams = {} # type: ignore[var-annotated] - - text_streams = { - "agent_responses": agent_response_stream, - } - - # Create web interface first (needed for agent) - try: - web_interface = 
RobotWebInterface( - port=5555, text_streams=text_streams, audio_subject=audio_subject, **streams - ) - logger.info("Web interface created successfully") - except Exception as e: - logger.error(f"Failed to create web interface: {e}") - raise - - # Set up speech-to-text - stt_node = stt() # type: ignore[no-untyped-call] - stt_node.consume_audio(audio_subject.pipe(ops.share())) - - # Create Claude agent - agent = ClaudeAgent( - dev_name="piper_arm_agent", - input_query_stream=web_interface.query_stream, # Use text input from web interface - # input_query_stream=stt_node.emit_text(), # Uncomment to use voice input - skills=skills, - system_query=SYSTEM_PROMPT, - model_name="claude-3-5-haiku-latest", - thinking_budget_tokens=0, - max_output_tokens_per_request=4096, - ) - - # Subscribe to agent responses - agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x)) - - # Set up text-to-speech for agent responses - tts_node = tts() # type: ignore[no-untyped-call] - tts_node.consume_text(agent.get_response_observable()) - - logger.info("=" * 60) - logger.info("Piper Arm Agent Ready!") - logger.info("Web interface available at: http://localhost:5555") - logger.info("Foxglove visualization available at: ws://localhost:8765") - logger.info("You can:") - logger.info(" - Type commands in the web interface") - logger.info(" - Use voice commands") - logger.info(" - Ask the robot to pick up objects") - logger.info(" - Ask the robot to move objects to locations") - logger.info("=" * 60) - - # Run web interface (this blocks) - web_interface.run() - - except KeyboardInterrupt: - logger.info("Keyboard interrupt received") - except Exception as e: - logger.error(f"Error running robot: {e}") - import traceback - - traceback.print_exc() - finally: - logger.info("Shutting down...") - # Stop the robot (this is also async) - robot.stop() - logger.info("Robot stopped") - - -if __name__ == "__main__": - main() # type: ignore[no-untyped-call] diff --git 
a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 05658e11c2..6dfa077498 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -49,9 +49,10 @@ # Demo blueprints "demo-camera": "dimos.hardware.sensors.camera.module:demo_camera", "demo-osm": "dimos.mapping.osm.demo_osm:demo_osm", - "demo-skill": "dimos.agents.skills.demo_skill:demo_skill", - "demo-gps-nav": "dimos.agents.skills.demo_gps_nav:demo_gps_nav_skill", - "demo-google-maps-skill": "dimos.agents.skills.demo_google_maps_skill:demo_google_maps_skill", + "demo-skill": "dimos.agents2.skills.demo_skill:demo_skill", + "demo-gps-nav": "dimos.agents2.skills.demo_gps_nav:demo_gps_nav_skill", + "demo-google-maps-skill": "dimos.agents2.skills.demo_google_maps_skill:demo_google_maps_skill", + "demo-object-scene-registration": "dimos.perception.demo_object_scene_registration:demo_object_scene_registration", "demo-remapping": "dimos.robot.unitree_webrtc.demo_remapping:remapping", "demo-remapping-transport": "dimos.robot.unitree_webrtc.demo_remapping:remapping_and_transport", "demo-error-on-name-conflicts": "dimos.robot.unitree_webrtc.demo_error_on_name_conflicts:blueprint", diff --git a/docs/depth_camera_integration.md b/docs/depth_camera_integration.md new file mode 100644 index 0000000000..1c8760d5cb --- /dev/null +++ b/docs/depth_camera_integration.md @@ -0,0 +1,147 @@ +# Depth Camera Integration Guide + +This folder contains camera drivers and modules for RGB-D (depth) cameras such as RealSense and ZED. +Use this guide to add a new depth camera, wire TF correctly, and publish the required streams. + +## Add a New Depth Camera + +1) **Create a new driver module** + - Path: `dimos/dimos/hardware/sensors/camera//camera.py` + - Export a blueprint in `/__init__.py` (match the `realsense` / `zed` pattern). 
+ +2) **Define config** + - Inherit from `ModuleConfig` and `DepthCameraConfig`: + ```python + @dataclass + class MyDepthCameraConfig(ModuleConfig, DepthCameraConfig): + width: int = 1280 + height: int = 720 + fps: int = 15 + camera_name: str = "camera" + base_frame_id: str = "base_link" + base_transform: Transform | None = field(default_factory=default_base_transform) + align_depth_to_color: bool = True + enable_depth: bool = True + enable_pointcloud: bool = False + pointcloud_fps: float = 5.0 + camera_info_fps: float = 1.0 + ``` + +3) **Implement the module** + - Inherit from `DepthCameraHardware` and `Module` (see `RealSenseCamera` / `ZEDCamera`). + - Provide these outputs (matching `RealSenseCamera` / `ZEDCamera`): + - `color_image: Out[Image]` + - `depth_image: Out[Image]` + - `pointcloud: Out[PointCloud2]` (optional, can be disabled by config) + - `camera_info: Out[CameraInfo]` + - `depth_camera_info: Out[CameraInfo]` + - Implement RPCs: + - `start()` / `stop()` + - `get_color_camera_info()` / `get_depth_camera_info()` + - `get_depth_scale()` (meters per depth unit) + +4) **Publish frames** + - Color images: `Image(format=ImageFormat.RGB, frame_id=_color_optical_frame)` + - Depth images: + - If `align_depth_to_color`: use `_color_optical_frame` + - Else: use `_depth_optical_frame` + - CameraInfo frame_id must match the image frame_id you publish. + +5) **Publish camera info** + - Build `CameraInfo` from camera intrinsics. + - Publish at `camera_info_fps`. + +6) **Publish pointcloud (optional)** + - Use `PointCloud2.from_rgbd(color_image, depth_image, camera_info, depth_scale)`. + - Publish at `pointcloud_fps`. + +## TF: Required Frames and Transforms + +Frame names are defined by the abstract depth camera spec (`dimos/dimos/hardware/sensors/camera/spec.py`). 
+Use the properties below to ensure consistent naming: + +- `_camera_link`: base link for the camera module (usually `{camera_name}_link`) +- `_color_frame`: non-optical color frame +- `_color_optical_frame`: optical color frame +- `_depth_frame`: non-optical depth frame +- `_depth_optical_frame`: optical depth frame + +Recommended transform chain (publish every frame or at your preferred TF rate): + +1) **Mounting transform** (from config): + - `base_frame_id -> _camera_link` + - Use `config.base_transform` if provided + +2) **Depth frame** + - `_camera_link -> _depth_frame` (identity unless the camera provides extrinsics) + - `_depth_frame -> _depth_optical_frame` using `OPTICAL_ROTATION` + +3) **Color frame** + - `_camera_link -> _color_frame` (from extrinsics, or identity if unavailable) + - `_color_frame -> _color_optical_frame` using `OPTICAL_ROTATION` + +Notes: +- If you align depth to color, keep TFs the same but publish depth images in `_color_optical_frame`. +- Ensure `color_image.frame_id` and `camera_info.frame_id` match; same for depth. + +## Required Streams / Topics + +Use these stream names in your module and attach transports as needed. +Default LCM topics in `realsense` / `zed` demos are shown below. 
+ +| Stream name | Type | Suggested topic | Frame_id source | +|-------------------|--------------|-------------------------|-----------------| +| `color_image` | `Image` | `/camera/color` | `_color_optical_frame` | +| `depth_image` | `Image` | `/camera/depth` | `_depth_optical_frame` or `_color_optical_frame` | +| `pointcloud` | `PointCloud2`| `/camera/pointcloud` | (derived from CameraInfo) | +| `camera_info` | `CameraInfo` | `/camera/color_info` | matches `color_image` | +| `depth_camera_info` | `CameraInfo` | `/camera/depth_info` | matches `depth_image` | + +For `ObjectSceneRegistrationModule`, the required inputs are: +- `color_image` +- `depth_image` +- `camera_info` +- TF tree resolving `target_frame` to `color_image.frame_id` + +## Object Scene Registration (Brief Overview) + +`ObjectSceneRegistrationModule` consumes synchronized RGB + depth + camera intrinsics and produces: +- 2D detections (YOLO‑E) +- 3D detections (projected via depth + intrinsics + TF) +- Overlay annotations and aggregated pointclouds + +See: +- `dimos/dimos/perception/object_scene_registration.py` +- `dimos/dimos/perception/demo_object_scene_registration.py` + +Quick wiring example: + +```python +from dimos.core.blueprints import autoconnect +from dimos.hardware.sensors.camera.realsense import realsense_camera +from dimos.perception.object_scene_registration import object_scene_registration_module + +pipeline = autoconnect( + realsense_camera(enable_pointcloud=False), + object_scene_registration_module(target_frame="world"), +) +``` + +Run the demo via CLI: +```bash +dimos run demo-object-scene-registration +``` + +## Foxglove (Viewer) + +Install Foxglove from: +- https://foxglove.dev/download + +## Modules and Skills (Short Intro) + +- **Modules** are typed components with `In[...]` / `Out[...]` streams and `start()` / `stop()` lifecycles. +- **Skills** are callable methods (decorated with `@skill`) exposed by `SkillModule` for agents. 
+ +Reference: +- Modules overview: `dimos/docs/concepts/modules.md` +- TF fundamentals: `dimos/docs/api/transforms.md`