From cdfaaede46f8d1bca4b38e2175dd06489f2f7d4e Mon Sep 17 00:00:00 2001
From: Paul Nechifor <paul@nechifor.net>
Date: Thu, 9 Oct 2025 01:52:39 +0300
Subject: [PATCH 01/16] Reapply "Image upgrades! Impls for CUDA + numpy, along
 with an abstraction and full backwards compatibility"

This reverts commit fcd8f843aada01e0ab9751d9581acc4303cd399d.
---
 dimos/msgs/sensor_msgs/Image.py               | 823 ++++++++--------
 .../sensor_msgs/image_impls/AbstractImage.py  | 210 ++++
 .../msgs/sensor_msgs/image_impls/CudaImage.py | 895 ++++++++++++++++++
 .../sensor_msgs/image_impls/NumpyImage.py     | 214 +++++
 dimos/perception/common/utils.py              | 478 +++++++---
 pyproject.toml                                |   3 +-
 tests/test_image_backend_utils.py             | 279 ++++++
 tests/test_image_backends.py                  | 534 +++++++++++
 8 files changed, 2890 insertions(+), 546 deletions(-)
 create mode 100644 dimos/msgs/sensor_msgs/image_impls/AbstractImage.py
 create mode 100644 dimos/msgs/sensor_msgs/image_impls/CudaImage.py
 create mode 100644 dimos/msgs/sensor_msgs/image_impls/NumpyImage.py
 create mode 100644 tests/test_image_backend_utils.py
 create mode 100644 tests/test_image_backends.py

diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index 30c74fd243..7f77d71e70 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -12,24 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import base64
 import functools
 import time
-from dataclasses import dataclass, field
-from enum import Enum
-from typing import Literal, Optional, Tuple, TypedDict
+from typing import Optional
 
 import cv2
 import numpy as np
 import reactivex as rx
+from dimos.msgs.sensor_msgs.image_impls.AbstractImage import (
+    AbstractImage,
+    HAS_CUDA,
+    HAS_NVIMGCODEC,
+    ImageFormat,
+    NVIMGCODEC_LAST_USED,
+)
+from dimos.msgs.sensor_msgs.image_impls.CudaImage import CudaImage
+from dimos.msgs.sensor_msgs.image_impls.NumpyImage import NumpyImage
 from dimos_lcm.sensor_msgs.Image import Image as LCMImage
 from dimos_lcm.std_msgs.Header import Header
+from dimos.types.timestamped import TimestampedBufferCollection, to_human_readable
+from dimos.utils.reactive import quality_barrier
 from reactivex import operators as ops
 from reactivex.observable import Observable
 from reactivex.scheduler import ThreadPoolScheduler
 
-from dimos.types.timestamped import Timestamped, TimestampedBufferCollection, to_human_readable
-from dimos.utils.reactive import quality_barrier
+try:
+    import cupy as cp  # type: ignore
+except Exception:
+    cp = None  # type: ignore
 
 try:
     from sensor_msgs.msg import Image as ROSImage
@@ -37,470 +50,366 @@
     ROSImage = None
 
 
-class ImageFormat(Enum):
-    """Supported image formats for internal representation."""
-
-    BGR = "BGR"  # 8-bit Blue-Green-Red color
-    RGB = "RGB"  # 8-bit Red-Green-Blue color
-    RGBA = "RGBA"  # 8-bit RGB with Alpha
-    BGRA = "BGRA"  # 8-bit BGR with Alpha
-    GRAY = "GRAY"  # 8-bit Grayscale
-    GRAY16 = "GRAY16"  # 16-bit Grayscale
-    DEPTH = "DEPTH"  # 32-bit Float Depth
-    DEPTH16 = "DEPTH16"  # 16-bit Integer Depth (millimeters)
-
-
-class AgentImageMessage(TypedDict):
-    """Type definition for agent-compatible image representation."""
-
-    type: Literal["image"]
-    source_type: Literal["base64"]
-    mime_type: Literal["image/jpeg", "image/png"]
-    data: str  # Base64 encoded image data
-
-
-@dataclass
-class Image(Timestamped):
-    """Standardized image type with LCM integration."""
-
+class Image:
     msg_name = "sensor_msgs.Image"
-    data: np.ndarray
-    format: ImageFormat = field(default=ImageFormat.BGR)
-    frame_id: str = field(default="")
-    ts: float = field(default_factory=time.time)
-
-    def __str__(self):
-        return f"Image(shape={self.shape}, format={self.format}, dtype={self.dtype}, ts={to_human_readable(self.ts)})"
 
-    def __post_init__(self):
-        """Validate image data and format."""
-        if self.data is None:
-            raise ValueError("Image data cannot be None")
+    def __init__(
+        self,
+        impl: AbstractImage | None = None,
+        *,
+        data=None,
+        format: ImageFormat | None = None,
+        frame_id: str | None = None,
+        ts: float | None = None,
+    ):
+        """Construct an Image facade.
+
+        Usage:
+        - Image(impl=<AbstractImage>)
+        - Image(data=<ndarray | cupy.ndarray>, format=ImageFormat.RGB, frame_id=str, ts=float)
+
+        Notes:
+        - When constructed from `data`, uses CudaImage if `data` is a CuPy array and CUDA is available; otherwise NumpyImage.
+        - `format` defaults to ImageFormat.RGB; `frame_id` defaults to ""; `ts` defaults to `time.time()`.
+        """
+        # Disallow mixing impl with raw kwargs
+        if impl is not None and any(x is not None for x in (data, format, frame_id, ts)):
+            raise TypeError(
+                "Provide either 'impl' or ('data', 'format', 'frame_id', 'ts'), not both"
+            )
 
-        if not isinstance(self.data, np.ndarray):
-            raise ValueError("Image data must be a numpy array")
+        if impl is not None:
+            self._impl = impl
+            return
 
-        if len(self.data.shape) < 2:
-            raise ValueError("Image data must be at least 2D")
+        # Raw constructor path
+        if data is None:
+            raise TypeError("'data' is required when constructing Image without 'impl'")
+        fmt = format if format is not None else ImageFormat.RGB
+        fid = frame_id if frame_id is not None else ""
+        tstamp = ts if ts is not None else time.time()
 
-        # Ensure data is contiguous for efficient operations
-        if not self.data.flags["C_CONTIGUOUS"]:
-            self.data = np.ascontiguousarray(self.data)
+        # Detect CuPy array without a hard dependency
+        is_cu = False
+        try:
+            import cupy as _cp  # type: ignore
 
-    @property
-    def height(self) -> int:
-        """Get image height."""
-        return self.data.shape[0]
-
-    @property
-    def width(self) -> int:
-        """Get image width."""
-        return self.data.shape[1]
+            is_cu = isinstance(data, _cp.ndarray)
+        except Exception:
+            is_cu = False
 
-    @property
-    def channels(self) -> int:
-        """Get number of channels."""
-        if len(self.data.shape) == 2:
-            return 1
-        elif len(self.data.shape) == 3:
-            return self.data.shape[2]
+        if is_cu and HAS_CUDA:
+            self._impl = CudaImage(data, fmt, fid, tstamp)  # type: ignore
         else:
-            raise ValueError("Invalid image dimensions")
-
-    @property
-    def shape(self) -> Tuple[int, ...]:
-        """Get image shape."""
-        return self.data.shape
-
-    @property
-    def dtype(self) -> np.dtype:
-        """Get image data type."""
-        return self.data.dtype
+            self._impl = NumpyImage(np.asarray(data), fmt, fid, tstamp)
 
-    def copy(self) -> "Image":
-        """Create a deep copy of the image."""
-        return self.__class__(
-            data=self.data.copy(),
-            format=self.format,
-            frame_id=self.frame_id,
-            ts=self.ts,
+    def __str__(self) -> str:
+        dev = "cuda" if self.is_cuda else "cpu"
+        return (
+            f"Image(shape={self.shape}, format={self.format.value}, dtype={self.dtype}, "
+            f"dev={dev}, ts={to_human_readable(self.ts)})"
         )
 
     @classmethod
-    def from_opencv(
-        cls, cv_image: np.ndarray, format: ImageFormat = ImageFormat.BGR, **kwargs
-    ) -> "Image":
-        """Create Image from OpenCV image array."""
-        return cls(data=cv_image, format=format, **kwargs)
+    def from_impl(cls, impl: AbstractImage) -> "Image":
+        return cls(impl)
 
     @classmethod
     def from_numpy(
-        cls, np_image: np.ndarray, format: ImageFormat = ImageFormat.BGR, **kwargs
+        cls,
+        np_image: np.ndarray,
+        format: ImageFormat = ImageFormat.RGB,
+        to_cuda: bool = False,
+        **kwargs,
     ) -> "Image":
-        """Create Image from numpy array."""
-        return cls(data=np_image, format=format, **kwargs)
+        if kwargs.pop("to_gpu", False):
+            to_cuda = True
+        if to_cuda and HAS_CUDA:
+            return cls(
+                CudaImage(
+                    np_image if hasattr(np_image, "shape") else np.asarray(np_image),
+                    format,
+                    kwargs.get("frame_id", ""),
+                    kwargs.get("ts", time.time()),
+                )
+            )  # type: ignore
+        return cls(
+            NumpyImage(
+                np.asarray(np_image),
+                format,
+                kwargs.get("frame_id", ""),
+                kwargs.get("ts", time.time()),
+            )
+        )
 
     @classmethod
-    def from_file(cls, filepath: str, format: ImageFormat = ImageFormat.BGR) -> "Image":
-        """Load image from file."""
-        # OpenCV loads as BGR by default
-        cv_image = cv2.imread(filepath, cv2.IMREAD_UNCHANGED)
-        if cv_image is None:
+    def from_file(
+        cls, filepath: str, format: ImageFormat = ImageFormat.RGB, to_cuda: bool = False, **kwargs
+    ) -> "Image":
+        if kwargs.pop("to_gpu", False):
+            to_cuda = True
+        arr = cv2.imread(filepath, cv2.IMREAD_UNCHANGED)
+        if arr is None:
             raise ValueError(f"Could not load image from {filepath}")
-
-        # Detect format based on channels and data type
-        if len(cv_image.shape) == 2:
-            if cv_image.dtype == np.uint16:
-                detected_format = ImageFormat.GRAY16
-            else:
-                detected_format = ImageFormat.GRAY
-        elif cv_image.shape[2] == 3:
-            detected_format = ImageFormat.BGR  # OpenCV default
-        elif cv_image.shape[2] == 4:
-            detected_format = ImageFormat.BGRA
+        if arr.ndim == 2:
+            detected = ImageFormat.GRAY16 if arr.dtype == np.uint16 else ImageFormat.GRAY
+        elif arr.shape[2] == 3:
+            detected = ImageFormat.BGR  # OpenCV default
+        elif arr.shape[2] == 4:
+            detected = ImageFormat.BGRA  # OpenCV default
         else:
-            detected_format = format
-
-        return cls(data=cv_image, format=detected_format)
+            detected = format
+        return cls(CudaImage(arr, detected) if to_cuda and HAS_CUDA else NumpyImage(arr, detected))  # type: ignore
 
     @classmethod
-    def from_depth(cls, depth_data: np.ndarray, frame_id: str = "", ts: float = None) -> "Image":
-        """Create Image from depth data (float32 array)."""
-        if depth_data.dtype != np.float32:
-            depth_data = depth_data.astype(np.float32)
-
+    def from_opencv(
+        cls, cv_image: np.ndarray, format: ImageFormat = ImageFormat.BGR, **kwargs
+    ) -> "Image":
+        """Construct from an OpenCV image (NumPy array)."""
         return cls(
-            data=depth_data,
-            format=ImageFormat.DEPTH,
-            frame_id=frame_id,
-            ts=ts if ts is not None else time.time(),
+            NumpyImage(cv_image, format, kwargs.get("frame_id", ""), kwargs.get("ts", time.time()))
         )
 
-    def to_opencv(self) -> np.ndarray:
-        """Convert to OpenCV-compatible array (BGR format)."""
-        if self.format == ImageFormat.BGR:
-            return self.data
-        elif self.format == ImageFormat.RGB:
-            return cv2.cvtColor(self.data, cv2.COLOR_RGB2BGR)
-        elif self.format == ImageFormat.RGBA:
-            return cv2.cvtColor(self.data, cv2.COLOR_RGBA2BGR)
-        elif self.format == ImageFormat.BGRA:
-            return cv2.cvtColor(self.data, cv2.COLOR_BGRA2BGR)
-        elif self.format == ImageFormat.GRAY:
-            return self.data
-        elif self.format == ImageFormat.GRAY16:
-            return self.data
-        elif self.format == ImageFormat.DEPTH:
-            return self.data  # Depth images are already in the correct format
-        elif self.format == ImageFormat.DEPTH16:
-            return self.data  # 16-bit depth images are already in the correct format
+    @classmethod
+    def from_depth(
+        cls, depth_data, frame_id: str = "", ts: float = None, to_cuda: bool = False
+    ) -> "Image":
+        arr = np.asarray(depth_data)
+        if arr.dtype != np.float32:
+            arr = arr.astype(np.float32)
+        impl = (
+            CudaImage(arr, ImageFormat.DEPTH, frame_id, time.time() if ts is None else ts)
+            if to_cuda and HAS_CUDA
+            else NumpyImage(arr, ImageFormat.DEPTH, frame_id, time.time() if ts is None else ts)
+        )  # type: ignore
+        return cls(impl)
+
+    # Delegation
+    @property
+    def is_cuda(self) -> bool:
+        return self._impl.is_cuda
+
+    @property
+    def data(self):
+        return self._impl.data
+
+    @data.setter
+    def data(self, value) -> None:
+        # Preserve backend semantics: ensure array type matches implementation
+        if isinstance(self._impl, NumpyImage):
+            self._impl.data = np.asarray(value)
+        elif isinstance(self._impl, CudaImage):  # type: ignore
+            if cp is None:
+                raise RuntimeError("CuPy not available to set CUDA image data")
+            self._impl.data = cp.asarray(value)  # type: ignore
         else:
-            raise ValueError(f"Unsupported format conversion: {self.format}")
+            self._impl.data = value
 
-    def to_rgb(self) -> "Image":
-        """Convert image to RGB format."""
-        if self.format == ImageFormat.RGB:
-            return self.copy()
-        elif self.format == ImageFormat.BGR:
-            rgb_data = cv2.cvtColor(self.data, cv2.COLOR_BGR2RGB)
-        elif self.format == ImageFormat.RGBA:
-            return self.copy()  # Already RGB with alpha
-        elif self.format == ImageFormat.BGRA:
-            rgb_data = cv2.cvtColor(self.data, cv2.COLOR_BGRA2RGBA)
-        elif self.format == ImageFormat.GRAY:
-            rgb_data = cv2.cvtColor(self.data, cv2.COLOR_GRAY2RGB)
-        elif self.format == ImageFormat.GRAY16:
-            # Convert 16-bit grayscale to 8-bit then to RGB
-            gray8 = (self.data / 256).astype(np.uint8)
-            rgb_data = cv2.cvtColor(gray8, cv2.COLOR_GRAY2RGB)
+    @property
+    def format(self) -> ImageFormat:
+        return self._impl.format
+
+    @format.setter
+    def format(self, value) -> None:
+        if isinstance(value, ImageFormat):
+            self._impl.format = value
+        elif isinstance(value, str):
+            try:
+                self._impl.format = ImageFormat[value]
+            except KeyError as e:
+                raise ValueError(f"Invalid ImageFormat: {value}") from e
         else:
-            raise ValueError(f"Unsupported format conversion from {self.format} to RGB")
+            raise TypeError("format must be ImageFormat or str name")
 
-        return self.__class__(
-            data=rgb_data,
-            format=ImageFormat.RGB if self.format != ImageFormat.BGRA else ImageFormat.RGBA,
-            frame_id=self.frame_id,
-            ts=self.ts,
-        )
+    @property
+    def frame_id(self) -> str:
+        return self._impl.frame_id
 
-    def to_bgr(self) -> "Image":
-        """Convert image to BGR format."""
-        if self.format == ImageFormat.BGR:
-            return self.copy()
-        elif self.format == ImageFormat.RGB:
-            bgr_data = cv2.cvtColor(self.data, cv2.COLOR_RGB2BGR)
-        elif self.format == ImageFormat.RGBA:
-            bgr_data = cv2.cvtColor(self.data, cv2.COLOR_RGBA2BGR)
-        elif self.format == ImageFormat.BGRA:
-            bgr_data = cv2.cvtColor(self.data, cv2.COLOR_BGRA2BGR)
-        elif self.format == ImageFormat.GRAY:
-            bgr_data = cv2.cvtColor(self.data, cv2.COLOR_GRAY2BGR)
-        elif self.format == ImageFormat.GRAY16:
-            # Convert 16-bit grayscale to 8-bit then to BGR
-            gray8 = (self.data / 256).astype(np.uint8)
-            bgr_data = cv2.cvtColor(gray8, cv2.COLOR_GRAY2BGR)
-        else:
-            raise ValueError(f"Unsupported format conversion from {self.format} to BGR")
+    @frame_id.setter
+    def frame_id(self, value: str) -> None:
+        self._impl.frame_id = str(value)
 
-        return self.__class__(
-            data=bgr_data,
-            format=ImageFormat.BGR,
-            frame_id=self.frame_id,
-            ts=self.ts,
-        )
+    @property
+    def ts(self) -> float:
+        return self._impl.ts
 
-    def to_grayscale(self) -> "Image":
-        """Convert image to grayscale."""
-        if self.format == ImageFormat.GRAY:
-            return self.copy()
-        elif self.format == ImageFormat.GRAY16:
-            return self.copy()
-        elif self.format == ImageFormat.BGR:
-            gray_data = cv2.cvtColor(self.data, cv2.COLOR_BGR2GRAY)
-        elif self.format == ImageFormat.RGB:
-            gray_data = cv2.cvtColor(self.data, cv2.COLOR_RGB2GRAY)
-        elif self.format == ImageFormat.RGBA:
-            gray_data = cv2.cvtColor(self.data, cv2.COLOR_RGBA2GRAY)
-        elif self.format == ImageFormat.BGRA:
-            gray_data = cv2.cvtColor(self.data, cv2.COLOR_BGRA2GRAY)
-        else:
-            raise ValueError(f"Unsupported format conversion from {self.format} to grayscale")
+    @ts.setter
+    def ts(self, value: float) -> None:
+        self._impl.ts = float(value)
 
-        return self.__class__(
-            data=gray_data,
-            format=ImageFormat.GRAY,
-            frame_id=self.frame_id,
-            ts=self.ts,
-        )
+    @property
+    def height(self) -> int:
+        return self._impl.height
 
-    def resize(self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR) -> "Image":
-        """Resize the image to the specified dimensions."""
-        resized_data = cv2.resize(self.data, (width, height), interpolation=interpolation)
-
-        return self.__class__(
-            data=resized_data,
-            format=self.format,
-            frame_id=self.frame_id,
-            ts=self.ts,
-        )
+    @property
+    def width(self) -> int:
+        return self._impl.width
 
-    def crop(self, x: int, y: int, width: int, height: int) -> "Image":
-        """Crop the image to the specified region."""
-        # Ensure crop region is within image bounds
-        x = max(0, min(x, self.width))
-        y = max(0, min(y, self.height))
-        x2 = min(x + width, self.width)
-        y2 = min(y + height, self.height)
-
-        cropped_data = self.data[y:y2, x:x2]
-
-        return self.__class__(
-            data=cropped_data,
-            format=self.format,
-            frame_id=self.frame_id,
-            ts=self.ts,
-        )
+    @property
+    def channels(self) -> int:
+        return self._impl.channels
 
-    @functools.cached_property
-    def sharpness(self) -> float:
-        """
-        Compute the Tenengrad focus measure for an image.
-        Returns a normalized value between 0 and 1, where 1 is sharpest.
+    @property
+    def shape(self):
+        return self._impl.shape
 
-        Uses adaptive normalization based on image statistics for better
-        discrimination across different image types.
-        """
-        grayscale = self.to_grayscale()
-        # Sobel gradient computation in x and y directions
-        sx = cv2.Sobel(grayscale.data, cv2.CV_32F, 1, 0, ksize=5)
-        sy = cv2.Sobel(grayscale.data, cv2.CV_32F, 0, 1, ksize=5)
+    @property
+    def dtype(self):
+        return self._impl.dtype
+
+    def copy(self) -> "Image":
+        return Image(self._impl.copy())
 
-        # Compute gradient magnitude
-        magnitude = cv2.magnitude(sx, sy)
+    def to_cpu(self) -> "Image":
+        if isinstance(self._impl, NumpyImage):
+            return self.copy()
+        return Image(
+            NumpyImage(
+                np.asarray(self._impl.to_opencv()),
+                self._impl.format,
+                self._impl.frame_id,
+                self._impl.ts,
+            )
+        )
 
-        mean_mag = magnitude.mean()
+    def to_cupy(self) -> "Image":
+        if isinstance(self._impl, CudaImage):
+            return self.copy()
+        return Image(
+            CudaImage(
+                np.asarray(self._impl.data), self._impl.format, self._impl.frame_id, self._impl.ts
+            )
+        )  # type: ignore
 
-        # Use log-scale normalization for better discrimination
-        # This maps typical values more evenly across the 0-1 range:
-        # - Blurry images (mean ~50-150): 0.15-0.35
-        # - Medium sharp (mean ~150-500): 0.35-0.65
-        # - Sharp images (mean ~500-2000): 0.65-0.85
-        # - Very sharp (mean >2000): 0.85-1.0
+    def to_opencv(self) -> np.ndarray:
+        return self._impl.to_opencv()
 
-        if mean_mag <= 0:
-            return 0.0
+    def to_rgb(self) -> "Image":
+        return Image(self._impl.to_rgb())
 
-        # Log scale with offset to handle the full range
-        # log10(50) ≈ 1.7, log10(5000) ≈ 3.7
-        normalized = (np.log10(mean_mag + 1) - 1.7) / 2.0
+    def to_bgr(self) -> "Image":
+        return Image(self._impl.to_bgr())
 
-        return np.clip(normalized, 0.0, 1.0)
+    def to_grayscale(self) -> "Image":
+        return Image(self._impl.to_grayscale())
+
+    def resize(self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR) -> "Image":
+        return Image(self._impl.resize(width, height, interpolation))
+
+    def sharpness(self):
+        """Return sharpness score as a callable float for backward compatibility."""
+        return self._impl.sharpness()
 
     def save(self, filepath: str) -> bool:
-        """Save image to file."""
-        # Convert to OpenCV format for saving
-        cv_image = self.to_opencv()
-        return cv2.imwrite(filepath, cv_image)
+        return self._impl.save(filepath)
 
-    def to_base64(self, max_width: int = 640, max_height: int = 480) -> str:
-        """Encode image to base64 JPEG format for agent processing.
+    def to_base64(
+        self,
+        quality: int = 80,
+        *,
+        max_width: Optional[int] = None,
+        max_height: Optional[int] = None,
+    ) -> str:
+        """Encode the image as a base64 JPEG string.
 
         Args:
-            max_width: Maximum width for resizing (default 640)
-            max_height: Maximum height for resizing (default 480)
+            quality: JPEG quality (0-100).
+            max_width: Optional maximum width to constrain the encoded image.
+            max_height: Optional maximum height to constrain the encoded image.
 
         Returns:
-            Base64 encoded JPEG string suitable for LLM/agent consumption.
+            Base64-encoded JPEG representation of the image.
         """
-        bgr_image = self.to_bgr()
-
-        # Encode as JPEG
-        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 80]  # 80% quality
-        success, buffer = cv2.imencode(".jpg", bgr_image.data, encode_param)
-
+        bgr_image = self.to_bgr().to_opencv()
+        height, width = bgr_image.shape[:2]
+
+        scale = 1.0
+        if max_width is not None and width > max_width:
+            scale = min(scale, max_width / width)
+        if max_height is not None and height > max_height:
+            scale = min(scale, max_height / height)
+
+        if scale < 1.0:
+            new_width = max(1, int(round(width * scale)))
+            new_height = max(1, int(round(height * scale)))
+            bgr_image = cv2.resize(bgr_image, (new_width, new_height), interpolation=cv2.INTER_AREA)
+
+        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), int(np.clip(quality, 0, 100))]
+        success, buffer = cv2.imencode(".jpg", bgr_image, encode_param)
         if not success:
             raise ValueError("Failed to encode image as JPEG")
 
-        # Convert to base64
-
-        jpeg_bytes = buffer.tobytes()
-        base64_str = base64.b64encode(jpeg_bytes).decode("utf-8")
+        return base64.b64encode(buffer.tobytes()).decode("utf-8")
 
-        return base64_str
+    def agent_encode(self, quality: int = 80) -> str:
+        """Return a base64-encoded JPEG suitable for agent pipelines."""
+        return self.to_base64(quality=quality)
 
-    def agent_encode(self) -> AgentImageMessage:
-        return [
-            {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{self.to_base64()}"},
-            }
-        ]
-
-    def lcm_encode(self, frame_id: Optional[str] = None) -> LCMImage:
-        """Convert to LCM Image message."""
+    # LCM encode/decode
+    def lcm_encode(self, frame_id: Optional[str] = None) -> bytes:
         msg = LCMImage()
-
-        # Header
         msg.header = Header()
-        msg.header.seq = 0  # Initialize sequence number
+        msg.header.seq = 0
         msg.header.frame_id = frame_id or self.frame_id
-
-        # Set timestamp properly as Time object
         if self.ts is not None:
             msg.header.stamp.sec = int(self.ts)
             msg.header.stamp.nsec = int((self.ts - int(self.ts)) * 1e9)
         else:
-            current_time = time.time()
-            msg.header.stamp.sec = int(current_time)
-            msg.header.stamp.nsec = int((current_time - int(current_time)) * 1e9)
-
-        # Image properties
-        msg.height = self.height
-        msg.width = self.width
-        msg.encoding = self._get_lcm_encoding()  # Convert format to LCM encoding
-        msg.is_bigendian = False  # Use little endian
-        msg.step = self._get_row_step()
-
-        # Image data
-        image_bytes = self.data.tobytes()
-        msg.data_length = len(image_bytes)
-        msg.data = image_bytes
-
+            now = time.time()
+            msg.header.stamp.sec = int(now)
+            msg.header.stamp.nsec = int((now - int(now)) * 1e9)
+
+        arr = (
+            self.to_opencv()
+            if self.format in (ImageFormat.BGR, ImageFormat.RGB, ImageFormat.RGBA, ImageFormat.BGRA)
+            else self.to_opencv()
+        )
+        msg.height = int(arr.shape[0])
+        msg.width = int(arr.shape[1])
+        msg.encoding = _get_lcm_encoding(self.format, arr.dtype)
+        msg.is_bigendian = False
+        channels = 1 if arr.ndim == 2 else int(arr.shape[2])
+        msg.step = int(arr.shape[1] * arr.dtype.itemsize * channels)
+        img_bytes = arr.tobytes()
+        msg.data_length = len(img_bytes)
+        msg.data = img_bytes
         return msg.lcm_encode()
 
     @classmethod
     def lcm_decode(cls, data: bytes, **kwargs) -> "Image":
-        """Create Image from LCM Image message."""
-        # Parse encoding to determine format and data type
         msg = LCMImage.lcm_decode(data)
-        format_info = cls._parse_encoding(msg.encoding)
-
-        # Convert bytes back to numpy array
-        data = np.frombuffer(msg.data, dtype=format_info["dtype"])
-
-        # Reshape to image dimensions
-        if format_info["channels"] == 1:
-            data = data.reshape((msg.height, msg.width))
+        fmt, dtype, channels = _parse_lcm_encoding(msg.encoding)
+        arr = np.frombuffer(msg.data, dtype=dtype)
+        if channels == 1:
+            arr = arr.reshape((msg.height, msg.width))
         else:
-            data = data.reshape((msg.height, msg.width, format_info["channels"]))
-
+            arr = arr.reshape((msg.height, msg.width, channels))
         return cls(
-            data=data,
-            format=format_info["format"],
-            frame_id=msg.header.frame_id if hasattr(msg, "header") else "",
-            ts=msg.header.stamp.sec + msg.header.stamp.nsec / 1e9
-            if hasattr(msg, "header") and msg.header.stamp.sec > 0
-            else time.time(),
-            **kwargs,
+            NumpyImage(
+                arr,
+                fmt,
+                msg.header.frame_id if hasattr(msg, "header") else "",
+                msg.header.stamp.sec + msg.header.stamp.nsec / 1e9
+                if hasattr(msg, "header") and getattr(msg.header, "stamp", None)
+                else time.time(),
+            )
         )
 
-    def _get_row_step(self) -> int:
-        """Calculate row step (bytes per row)."""
-        bytes_per_pixel = self._get_bytes_per_pixel()
-        return self.width * bytes_per_pixel
-
-    def _get_bytes_per_pixel(self) -> int:
-        """Calculate bytes per pixel based on format and data type."""
-        bytes_per_element = self.data.dtype.itemsize
-        return self.channels * bytes_per_element
-
-    def _get_lcm_encoding(self) -> str:
-        """Get LCM encoding string from internal format and data type."""
-        # Map internal format to LCM encoding based on format and dtype
-        if self.format == ImageFormat.GRAY:
-            if self.dtype == np.uint8:
-                return "mono8"
-            elif self.dtype == np.uint16:
-                return "mono16"
-        elif self.format == ImageFormat.GRAY16:
-            return "mono16"
-        elif self.format == ImageFormat.RGB:
-            return "rgb8"
-        elif self.format == ImageFormat.RGBA:
-            return "rgba8"
-        elif self.format == ImageFormat.BGR:
-            return "bgr8"
-        elif self.format == ImageFormat.BGRA:
-            return "bgra8"
-        elif self.format == ImageFormat.DEPTH:
-            if self.dtype == np.float32:
-                return "32FC1"
-            elif self.dtype == np.float64:
-                return "64FC1"
-        elif self.format == ImageFormat.DEPTH16:
-            if self.dtype == np.uint16:
-                return "16UC1"  # 16-bit unsigned depth
-            elif self.dtype == np.int16:
-                return "16SC1"  # 16-bit signed depth
-
-        raise ValueError(
-            f"Cannot determine LCM encoding for format={self.format}, dtype={self.dtype}"
-        )
+    # PnP wrappers
+    def solve_pnp(self, *args, **kwargs):
+        return self._impl.solve_pnp(*args, **kwargs)  # type: ignore
 
-    @staticmethod
-    def _parse_encoding(encoding: str) -> dict:
-        """Parse LCM image encoding string to determine format and data type."""
-        # Standard encodings
-        encoding_map = {
-            "mono8": {"format": ImageFormat.GRAY, "dtype": np.uint8, "channels": 1},
-            "mono16": {"format": ImageFormat.GRAY16, "dtype": np.uint16, "channels": 1},
-            "rgb8": {"format": ImageFormat.RGB, "dtype": np.uint8, "channels": 3},
-            "rgba8": {"format": ImageFormat.RGBA, "dtype": np.uint8, "channels": 4},
-            "bgr8": {"format": ImageFormat.BGR, "dtype": np.uint8, "channels": 3},
-            "bgra8": {"format": ImageFormat.BGRA, "dtype": np.uint8, "channels": 4},
-            # Depth/float encodings
-            "32FC1": {"format": ImageFormat.DEPTH, "dtype": np.float32, "channels": 1},
-            "32FC3": {"format": ImageFormat.RGB, "dtype": np.float32, "channels": 3},
-            "64FC1": {"format": ImageFormat.DEPTH, "dtype": np.float64, "channels": 1},
-            # 16-bit depth encodings
-            "16UC1": {"format": ImageFormat.DEPTH16, "dtype": np.uint16, "channels": 1},
-            "16SC1": {"format": ImageFormat.DEPTH16, "dtype": np.int16, "channels": 1},
-        }
+    def solve_pnp_ransac(self, *args, **kwargs):
+        return self._impl.solve_pnp_ransac(*args, **kwargs)  # type: ignore
+
+    def solve_pnp_batch(self, *args, **kwargs):
+        return self._impl.solve_pnp_batch(*args, **kwargs)  # type: ignore
 
-        if encoding not in encoding_map:
-            raise ValueError(f"Unsupported encoding: {encoding}")
+    def create_csrt_tracker(self, *args, **kwargs):
+        return self._impl.create_csrt_tracker(*args, **kwargs)  # type: ignore
 
-        return encoding_map[encoding]
+    def csrt_update(self, *args, **kwargs):
+        return self._impl.csrt_update(*args, **kwargs)  # type: ignore
 
     @classmethod
     def from_ros_msg(cls, ros_msg: ROSImage) -> "Image":
@@ -563,33 +472,135 @@ def from_ros_msg(cls, ros_msg: ROSImage) -> "Image":
             ts=ts,
         )
 
+    @staticmethod
+    def _parse_encoding(encoding: str) -> dict:
+        """Translate ROS encoding strings into format metadata."""
+        encoding_map = {
+            "mono8": {"format": ImageFormat.GRAY, "dtype": np.uint8, "channels": 1},
+            "mono16": {"format": ImageFormat.GRAY16, "dtype": np.uint16, "channels": 1},
+            "rgb8": {"format": ImageFormat.RGB, "dtype": np.uint8, "channels": 3},
+            "rgba8": {"format": ImageFormat.RGBA, "dtype": np.uint8, "channels": 4},
+            "bgr8": {"format": ImageFormat.BGR, "dtype": np.uint8, "channels": 3},
+            "bgra8": {"format": ImageFormat.BGRA, "dtype": np.uint8, "channels": 4},
+            "32FC1": {"format": ImageFormat.DEPTH, "dtype": np.float32, "channels": 1},
+            "32FC3": {"format": ImageFormat.RGB, "dtype": np.float32, "channels": 3},
+            "64FC1": {"format": ImageFormat.DEPTH, "dtype": np.float64, "channels": 1},
+            "16UC1": {"format": ImageFormat.DEPTH16, "dtype": np.uint16, "channels": 1},
+            "16SC1": {"format": ImageFormat.DEPTH16, "dtype": np.int16, "channels": 1},
+        }
+
+        key = encoding.strip()
+        for candidate in (key, key.lower(), key.upper()):
+            if candidate in encoding_map:
+                return dict(encoding_map[candidate])
+
+        raise ValueError(f"Unsupported encoding: {encoding}")
+
     def __repr__(self) -> str:
-        """String representation."""
-        return (
-            f"Image(shape={self.shape}, format={self.format.value}, "
-            f"dtype={self.dtype}, frame_id='{self.frame_id}', ts={self.ts})"
-        )
+        dev = "cuda" if self.is_cuda else "cpu"
+        return f"Image(shape={self.shape}, format={self.format.value}, dtype={self.dtype}, dev={dev}, frame_id='{self.frame_id}', ts={self.ts})"
 
     def __eq__(self, other) -> bool:
-        """Check equality with another Image."""
         if not isinstance(other, Image):
             return False
-
         return (
-            np.array_equal(self.data, other.data)
+            np.array_equal(self.to_opencv(), other.to_opencv())
             and self.format == other.format
             and self.frame_id == other.frame_id
             and abs(self.ts - other.ts) < 1e-6
         )
 
     def __len__(self) -> int:
-        """Return total number of pixels."""
-        return self.height * self.width
+        return int(self.height * self.width)
+
+
+# Re-exports for tests
+HAS_CUDA = HAS_CUDA
+ImageFormat = ImageFormat
+NVIMGCODEC_LAST_USED = NVIMGCODEC_LAST_USED
+HAS_NVIMGCODEC = HAS_NVIMGCODEC
+__all__ = [
+    "HAS_CUDA",
+    "ImageFormat",
+    "NVIMGCODEC_LAST_USED",
+    "HAS_NVIMGCODEC",
+    "sharpness_window",
+    "sharpness_barrier",
+]
 
 
 def sharpness_window(target_frequency: float, source: Observable[Image]) -> Observable[Image]:
-    raise NotImplementedError("use sharpness_barrier instead")
+    """Emit the sharpest Image seen within each sliding time window."""
+    if target_frequency <= 0:
+        raise ValueError("target_frequency must be positive")
+
+    window = TimestampedBufferCollection(1.0 / target_frequency)
+    source.subscribe(window.add)
+
+    thread_scheduler = ThreadPoolScheduler(max_workers=1)
+
+    def find_best(*_args):
+        if not window._items:
+            return None
+        return max(window._items, key=lambda img: img.sharpness)
+
+    return rx.interval(1.0 / target_frequency).pipe(
+        ops.observe_on(thread_scheduler),
+        ops.map(find_best),
+        ops.filter(lambda img: img is not None),
+    )
 
 
 def sharpness_barrier(target_frequency: float):
-    return quality_barrier(lambda x: x.sharpness, target_frequency)
+    """Select the sharpest Image within each time window."""
+    if target_frequency <= 0:
+        raise ValueError("target_frequency must be positive")
+    return quality_barrier(lambda image: image.sharpness, target_frequency)
+
+
+def _get_lcm_encoding(fmt: ImageFormat, dtype: np.dtype) -> str:
+    if fmt == ImageFormat.GRAY:
+        if dtype == np.uint8:
+            return "mono8"
+        if dtype == np.uint16:
+            return "mono16"
+    if fmt == ImageFormat.GRAY16:
+        return "mono16"
+    if fmt == ImageFormat.RGB:
+        return "rgb8"
+    if fmt == ImageFormat.RGBA:
+        return "rgba8"
+    if fmt == ImageFormat.BGR:
+        return "bgr8"
+    if fmt == ImageFormat.BGRA:
+        return "bgra8"
+    if fmt == ImageFormat.DEPTH:
+        if dtype == np.float32:
+            return "32FC1"
+        if dtype == np.float64:
+            return "64FC1"
+    if fmt == ImageFormat.DEPTH16:
+        if dtype == np.uint16:
+            return "16UC1"
+        if dtype == np.int16:
+            return "16SC1"
+    raise ValueError(f"Unsupported LCM encoding for fmt={fmt}, dtype={dtype}")
+
+
+def _parse_lcm_encoding(enc: str):
+    m = {
+        "mono8": (ImageFormat.GRAY, np.uint8, 1),
+        "mono16": (ImageFormat.GRAY16, np.uint16, 1),
+        "rgb8": (ImageFormat.RGB, np.uint8, 3),
+        "rgba8": (ImageFormat.RGBA, np.uint8, 4),
+        "bgr8": (ImageFormat.BGR, np.uint8, 3),
+        "bgra8": (ImageFormat.BGRA, np.uint8, 4),
+        "32FC1": (ImageFormat.DEPTH, np.float32, 1),
+        "32FC3": (ImageFormat.RGB, np.float32, 3),
+        "64FC1": (ImageFormat.DEPTH, np.float64, 1),
+        "16UC1": (ImageFormat.DEPTH16, np.uint16, 1),
+        "16SC1": (ImageFormat.DEPTH16, np.int16, 1),
+    }
+    if enc not in m:
+        raise ValueError(f"Unsupported encoding: {enc}")
+    return m[enc]
diff --git a/dimos/msgs/sensor_msgs/image_impls/AbstractImage.py b/dimos/msgs/sensor_msgs/image_impls/AbstractImage.py
new file mode 100644
index 0000000000..2f7da1d0d9
--- /dev/null
+++ b/dimos/msgs/sensor_msgs/image_impls/AbstractImage.py
@@ -0,0 +1,210 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import base64
+import os
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any
+
+import cv2
+import numpy as np
+
+try:
+    import cupy as cp  # type: ignore
+
+    HAS_CUDA = True
+except Exception:  # pragma: no cover - optional dependency
+    cp = None  # type: ignore
+    HAS_CUDA = False
+
+# Optional nvImageCodec (preferred GPU codec)
+USE_NVIMGCODEC = os.environ.get("USE_NVIMGCODEC", "0") == "1"
+NVIMGCODEC_LAST_USED = False
+try:  # pragma: no cover - optional dependency
+    if HAS_CUDA and USE_NVIMGCODEC:
+        from nvidia import nvimgcodec  # type: ignore
+
+        try:
+            _enc_probe = nvimgcodec.Encoder()  # type: ignore[attr-defined]
+            HAS_NVIMGCODEC = True
+        except Exception:
+            nvimgcodec = None  # type: ignore
+            HAS_NVIMGCODEC = False
+    else:
+        nvimgcodec = None  # type: ignore
+        HAS_NVIMGCODEC = False
+except Exception:  # pragma: no cover - optional dependency
+    nvimgcodec = None  # type: ignore
+    HAS_NVIMGCODEC = False
+
+
+class ImageFormat(Enum):
+    BGR = "BGR"
+    RGB = "RGB"
+    RGBA = "RGBA"
+    BGRA = "BGRA"
+    GRAY = "GRAY"
+    GRAY16 = "GRAY16"
+    DEPTH = "DEPTH"
+    DEPTH16 = "DEPTH16"
+
+
+def _is_cu(x) -> bool:
+    return HAS_CUDA and cp is not None and isinstance(x, cp.ndarray)  # type: ignore
+
+
+def _ascontig(x):
+    if _is_cu(x):
+        return x if x.flags["C_CONTIGUOUS"] else cp.ascontiguousarray(x)  # type: ignore
+    return x if x.flags["C_CONTIGUOUS"] else np.ascontiguousarray(x)
+
+
+def _to_cpu(x):
+    return cp.asnumpy(x) if _is_cu(x) else x  # type: ignore
+
+
+def _to_cu(x):
+    if HAS_CUDA and cp is not None and isinstance(x, np.ndarray):  # type: ignore
+        return cp.asarray(x)  # type: ignore
+    return x
+
+
+def _encode_nvimgcodec_cuda(bgr_cu, quality: int = 80) -> bytes:  # pragma: no cover - optional
+    if not HAS_NVIMGCODEC or nvimgcodec is None:
+        raise RuntimeError("nvimgcodec not available")
+    if bgr_cu.ndim != 3 or bgr_cu.shape[2] != 3:
+        raise RuntimeError("nvimgcodec expects HxWx3 image")
+    if bgr_cu.dtype != cp.uint8:  # type: ignore[attr-defined]
+        raise RuntimeError("nvimgcodec requires uint8 input")
+    if not bgr_cu.flags["C_CONTIGUOUS"]:
+        bgr_cu = cp.ascontiguousarray(bgr_cu)  # type: ignore[attr-defined]
+    encoder = nvimgcodec.Encoder()  # type: ignore[attr-defined]
+    try:
+        img = nvimgcodec.Image(bgr_cu, nvimgcodec.PixelFormat.BGR)  # type: ignore[attr-defined]
+    except Exception:
+        img = nvimgcodec.Image(cp.asnumpy(bgr_cu), nvimgcodec.PixelFormat.BGR)  # type: ignore[attr-defined]
+    if hasattr(nvimgcodec, "EncodeParams"):
+        params = nvimgcodec.EncodeParams(quality=quality)  # type: ignore[attr-defined]
+        bitstreams = encoder.encode([img], [params])
+    else:
+        bitstreams = encoder.encode([img])
+    bs0 = bitstreams[0]
+    if hasattr(bs0, "buf"):
+        return bytes(bs0.buf)
+    return bytes(bs0)
+
+
+class AbstractImage(ABC):
+    data: Any
+    format: ImageFormat
+    frame_id: str
+    ts: float
+
+    @property
+    @abstractmethod
+    def is_cuda(self) -> bool:  # pragma: no cover - abstract
+        ...
+
+    @property
+    def height(self) -> int:
+        return int(self.data.shape[0])
+
+    @property
+    def width(self) -> int:
+        return int(self.data.shape[1])
+
+    @property
+    def channels(self) -> int:
+        if getattr(self.data, "ndim", 0) == 2:
+            return 1
+        if getattr(self.data, "ndim", 0) == 3:
+            return int(self.data.shape[2])
+        raise ValueError("Invalid image dimensions")
+
+    @property
+    def shape(self):
+        return tuple(self.data.shape)
+
+    @property
+    def dtype(self):
+        return self.data.dtype
+
+    @abstractmethod
+    def to_opencv(self) -> np.ndarray:  # pragma: no cover - abstract
+        ...
+
+    @abstractmethod
+    def to_rgb(self) -> "AbstractImage":  # pragma: no cover - abstract
+        ...
+
+    @abstractmethod
+    def to_bgr(self) -> "AbstractImage":  # pragma: no cover - abstract
+        ...
+
+    @abstractmethod
+    def to_grayscale(self) -> "AbstractImage":  # pragma: no cover - abstract
+        ...
+
+    @abstractmethod
+    def resize(
+        self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR
+    ) -> "AbstractImage":  # pragma: no cover - abstract
+        ...
+
+    @abstractmethod
+    def sharpness(self) -> float:  # pragma: no cover - abstract
+        ...
+
+    def copy(self) -> "AbstractImage":
+        return self.__class__(
+            data=self.data.copy(), format=self.format, frame_id=self.frame_id, ts=self.ts
+        )  # type: ignore
+
+    def save(self, filepath: str) -> bool:
+        global NVIMGCODEC_LAST_USED
+        if self.is_cuda and HAS_NVIMGCODEC and nvimgcodec is not None:
+            try:
+                bgr = self.to_bgr()
+                if _is_cu(bgr.data):
+                    jpeg = _encode_nvimgcodec_cuda(bgr.data)
+                    NVIMGCODEC_LAST_USED = True
+                    with open(filepath, "wb") as f:
+                        f.write(jpeg)
+                    return True
+            except Exception:
+                NVIMGCODEC_LAST_USED = False
+        arr = self.to_opencv()
+        return cv2.imwrite(filepath, arr)
+
+    def to_base64(self, quality: int = 80) -> str:
+        global NVIMGCODEC_LAST_USED
+        if self.is_cuda and HAS_NVIMGCODEC and nvimgcodec is not None:
+            try:
+                bgr = self.to_bgr()
+                if _is_cu(bgr.data):
+                    jpeg = _encode_nvimgcodec_cuda(bgr.data, quality=quality)
+                    NVIMGCODEC_LAST_USED = True
+                    return base64.b64encode(jpeg).decode("utf-8")
+            except Exception:
+                NVIMGCODEC_LAST_USED = False
+        bgr = self.to_bgr()
+        success, buffer = cv2.imencode(
+            ".jpg", _to_cpu(bgr.data), [int(cv2.IMWRITE_JPEG_QUALITY), int(quality)]
+        )
+        if not success:
+            raise ValueError("Failed to encode image as JPEG")
+        return base64.b64encode(buffer.tobytes()).decode("utf-8")
diff --git a/dimos/msgs/sensor_msgs/image_impls/CudaImage.py b/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
new file mode 100644
index 0000000000..fc8577f0ac
--- /dev/null
+++ b/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
@@ -0,0 +1,895 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass, field
+from typing import Optional, Tuple
+
+import cv2
+import numpy as np
+
+from dimos.msgs.sensor_msgs.image_impls.AbstractImage import (
+    AbstractImage,
+    ImageFormat,
+    HAS_CUDA,
+    _is_cu,
+    _to_cpu,
+    _ascontig,
+)
+from dimos.msgs.sensor_msgs.image_impls.NumpyImage import NumpyImage
+
+try:
+    import cupy as cp  # type: ignore
+    from cupyx.scipy import ndimage as cndimage  # type: ignore
+    from cupyx.scipy import signal as csignal  # type: ignore
+except Exception:  # pragma: no cover
+    cp = None  # type: ignore
+    cndimage = None  # type: ignore
+    csignal = None  # type: ignore
+
+
+_CUDA_SRC = r"""
+extern "C" {
+
+__device__ __forceinline__ void rodrigues_R(const float r[3], float R[9]){
+  float theta = sqrtf(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
+  if(theta < 1e-8f){
+    R[0]=1.f; R[1]=0.f; R[2]=0.f;
+    R[3]=0.f; R[4]=1.f; R[5]=0.f;
+    R[6]=0.f; R[7]=0.f; R[8]=1.f;
+    return;
+  }
+  float kx=r[0]/theta, ky=r[1]/theta, kz=r[2]/theta;
+  float c=cosf(theta), s=sinf(theta), v=1.f-c;
+  R[0]=kx*kx*v + c;      R[1]=kx*ky*v - kz*s; R[2]=kx*kz*v + ky*s;
+  R[3]=ky*kx*v + kz*s;   R[4]=ky*ky*v + c;    R[5]=ky*kz*v - kx*s;
+  R[6]=kz*kx*v - ky*s;   R[7]=kz*ky*v + kx*s; R[8]=kz*kz*v + c;
+}
+
+__device__ __forceinline__ void mat3x3_vec3(const float R[9], const float x[3], float y[3]){
+  y[0] = R[0]*x[0] + R[1]*x[1] + R[2]*x[2];
+  y[1] = R[3]*x[0] + R[4]*x[1] + R[5]*x[2];
+  y[2] = R[6]*x[0] + R[7]*x[1] + R[8]*x[2];
+}
+
+__device__ __forceinline__ void cross_mat(const float v[3], float S[9]){
+  S[0]=0.f;     S[1]=-v[2]; S[2]= v[1];
+  S[3]= v[2];   S[4]=0.f;   S[5]=-v[0];
+  S[6]=-v[1];   S[7]= v[0]; S[8]=0.f;
+}
+
+// Solve a 6x6 system (JTJ * x = JTr) with Gauss–Jordan; JTJ is SPD after damping.
+__device__ void solve6_gauss_jordan(float A[36], float b[6], float x[6]){
+  float M[6][7];
+  #pragma unroll
+  for(int r=0;r<6;++r){
+    #pragma unroll
+    for(int c=0;c<6;++c) M[r][c] = A[r*6 + c];
+    M[r][6] = b[r];
+  }
+  for(int piv=0;piv<6;++piv){
+    float invd = 1.f / M[piv][piv];
+    for(int c=piv;c<7;++c) M[piv][c] *= invd;
+    for(int r=0;r<6;++r){
+      if(r==piv) continue;
+      float f = M[r][piv];
+      if(fabsf(f) < 1e-20f) continue;
+      for(int c=piv;c<7;++c) M[r][c] -= f * M[piv][c];
+    }
+  }
+  #pragma unroll
+  for(int r=0;r<6;++r) x[r] = M[r][6];
+}
+
+// One block solves one pose; dynamic shared memory holds per-thread accumulators.
+__global__ void pnp_gn_batch(
+    const float* __restrict__ obj,   // (B,N,3)
+    const float* __restrict__ img,   // (B,N,2)
+    const int N,
+    const float* __restrict__ intr,  // (B,4) -> fx, fy, cx, cy
+    const int max_iters,
+    const float damping,
+    float* __restrict__ rvec_out,    // (B,3)
+    float* __restrict__ tvec_out     // (B,3)
+){
+  if(N <= 0) return;
+  int b = blockIdx.x;
+  const float* obj_b = obj + b * N * 3;
+  const float* img_b = img + b * N * 2;
+  float fx = intr[4*b + 0];
+  float fy = intr[4*b + 1];
+  float cx = intr[4*b + 2];
+  float cy = intr[4*b + 3];
+
+  __shared__ float s_R[9];
+  __shared__ float s_rvec[3];
+  __shared__ float s_tvec[3];
+  __shared__ float s_JTJ[36];
+  __shared__ float s_JTr[6];
+  __shared__ int   s_done;
+
+  extern __shared__ float scratch[];
+  float* sh_JTJ = scratch;
+  float* sh_JTr = scratch + 36 * blockDim.x;
+
+  if(threadIdx.x==0){
+    s_rvec[0]=0.f; s_rvec[1]=0.f; s_rvec[2]=0.f;
+    s_tvec[0]=0.f; s_tvec[1]=0.f; s_tvec[2]=2.f;
+  }
+  __syncthreads();
+
+  for(int it=0; it<max_iters; ++it){
+    if(threadIdx.x==0){
+      rodrigues_R(s_rvec, s_R);
+      s_done = 0;
+    }
+    __syncthreads();
+
+    float lJTJ[36];
+    float lJTr[6];
+    #pragma unroll
+    for(int k=0;k<36;++k) lJTJ[k]=0.f;
+    #pragma unroll
+    for(int k=0;k<6;++k) lJTr[k]=0.f;
+
+    for(int i=threadIdx.x; i<N; i+=blockDim.x){
+      float Xi[3] = {obj_b[3*i+0], obj_b[3*i+1], obj_b[3*i+2]};
+      float mi_u  = img_b[2*i+0];
+      float mi_v  = img_b[2*i+1];
+
+      float RX[3];
+      mat3x3_vec3(s_R, Xi, RX);
+      float Xc[3];
+      Xc[0]=RX[0]+s_tvec[0];
+      Xc[1]=RX[1]+s_tvec[1];
+      Xc[2]=RX[2]+s_tvec[2];
+
+      float invZ = 1.f / fmaxf(Xc[2], 1e-6f);
+      float uhat = fx * Xc[0]*invZ + cx;
+      float vhat = fy * Xc[1]*invZ + cy;
+
+      float ru = mi_u - uhat;
+      float rv = mi_v - vhat;
+
+      float du_dX = fx * invZ;
+      float du_dY = 0.f;
+      float du_dZ = -fx * Xc[0] * invZ * invZ;
+
+      float dv_dX = 0.f;
+      float dv_dY = fy * invZ;
+      float dv_dZ = -fy * Xc[1] * invZ * invZ;
+
+      float S[9];
+      cross_mat(Xi, S);
+      float M[9];
+      M[0]=-(s_R[0]*S[0] + s_R[1]*S[3] + s_R[2]*S[6]);
+      M[1]=-(s_R[0]*S[1] + s_R[1]*S[4] + s_R[2]*S[7]);
+      M[2]=-(s_R[0]*S[2] + s_R[1]*S[5] + s_R[2]*S[8]);
+      M[3]=-(s_R[3]*S[0] + s_R[4]*S[3] + s_R[5]*S[6]);
+      M[4]=-(s_R[3]*S[1] + s_R[4]*S[4] + s_R[5]*S[7]);
+      M[5]=-(s_R[3]*S[2] + s_R[4]*S[5] + s_R[5]*S[8]);
+      M[6]=-(s_R[6]*S[0] + s_R[7]*S[3] + s_R[8]*S[6]);
+      M[7]=-(s_R[6]*S[1] + s_R[7]*S[4] + s_R[8]*S[7]);
+      M[8]=-(s_R[6]*S[2] + s_R[7]*S[5] + s_R[8]*S[8]);
+
+      float Ju_r = du_dX*M[0] + du_dY*M[3] + du_dZ*M[6];
+      float Ju_p = du_dX*M[1] + du_dY*M[4] + du_dZ*M[7];
+      float Ju_y = du_dX*M[2] + du_dY*M[5] + du_dZ*M[8];
+
+      float Jv_r = dv_dX*M[0] + dv_dY*M[3] + dv_dZ*M[6];
+      float Jv_p = dv_dX*M[1] + dv_dY*M[4] + dv_dZ*M[7];
+      float Jv_y = dv_dX*M[2] + dv_dY*M[5] + dv_dZ*M[8];
+
+      float Ju_tx = du_dX, Ju_ty = du_dY, Ju_tz = du_dZ;
+      float Jv_tx = dv_dX, Jv_ty = dv_dY, Jv_tz = dv_dZ;
+
+      float J0[6] = {Ju_r, Ju_p, Ju_y, Ju_tx, Ju_ty, Ju_tz};
+      float J1[6] = {Jv_r, Jv_p, Jv_y, Jv_tx, Jv_ty, Jv_tz};
+
+      int idx = 0;
+      for(int a=0;a<6;++a){
+        lJTr[a] += J0[a]*ru + J1[a]*rv;
+        for(int b=0;b<6;++b){
+          lJTJ[idx++] += J0[a]*J0[b] + J1[a]*J1[b];
+        }
+      }
+    }
+
+    for(int k=0;k<36;++k){
+      sh_JTJ[k*blockDim.x + threadIdx.x] = lJTJ[k];
+    }
+    for(int k=0;k<6;++k){
+      sh_JTr[k*blockDim.x + threadIdx.x] = lJTr[k];
+    }
+    __syncthreads();
+
+    if(threadIdx.x==0){
+      for(int k=0;k<36;++k){
+        float sum = 0.f;
+        for(int t=0;t<blockDim.x;++t){
+          sum += sh_JTJ[k*blockDim.x + t];
+        }
+        s_JTJ[k] = sum;
+      }
+      for(int k=0;k<6;++k){
+        float sum = 0.f;
+        for(int t=0;t<blockDim.x;++t){
+          sum += sh_JTr[k*blockDim.x + t];
+        }
+        s_JTr[k] = sum;
+      }
+      for(int d=0; d<6; ++d){
+        s_JTJ[d*6 + d] += damping;
+      }
+
+      float delta[6];
+      solve6_gauss_jordan(s_JTJ, s_JTr, delta);
+
+      s_rvec[0] += delta[0]; s_rvec[1] += delta[1]; s_rvec[2] += delta[2];
+      s_tvec[0] += delta[3]; s_tvec[1] += delta[4]; s_tvec[2] += delta[5];
+
+      float nrm = fabsf(delta[0])+fabsf(delta[1])+fabsf(delta[2])
+                + fabsf(delta[3])+fabsf(delta[4])+fabsf(delta[5]);
+      s_done = (nrm < 1e-6f) ? 1 : 0;
+    }
+    __syncthreads();
+    if(s_done) break;
+  }
+
+  if(threadIdx.x==0){
+    rvec_out[3*b + 0]=s_rvec[0];
+    rvec_out[3*b + 1]=s_rvec[1];
+    rvec_out[3*b + 2]=s_rvec[2];
+    tvec_out[3*b + 0]=s_tvec[0];
+    tvec_out[3*b + 1]=s_tvec[1];
+    tvec_out[3*b + 2]=s_tvec[2];
+  }
+}
+
+} // extern "C"
+"""
+
+if cp is not None:
+    _mod = cp.RawModule(code=_CUDA_SRC, options=("-std=c++14",), name_expressions=("pnp_gn_batch",))
+    _pnp_kernel = _mod.get_function("pnp_gn_batch")
+
+
+def _solve_pnp_cuda_kernel(obj, img, K, iterations=15, damping=1e-6):
+    if cp is None:
+        raise RuntimeError("CuPy/CUDA not available")
+
+    obj_cu = cp.asarray(obj, dtype=cp.float32)
+    if obj_cu.ndim == 2:
+        obj_cu = obj_cu[None, ...]
+    if obj_cu.ndim != 3 or obj_cu.shape[2] != 3:
+        raise ValueError("object_points must have shape (..., 3)")
+    B, N, _ = obj_cu.shape
+    if N <= 0:
+        raise ValueError("object_points must contain at least one correspondence")
+
+    img_cu = cp.asarray(img, dtype=cp.float32)
+    if img_cu.ndim == 2:
+        img_cu = img_cu[None, ...]
+    if img_cu.ndim != 3 or img_cu.shape[2] != 2:
+        raise ValueError("image_points must have shape (..., 2)")
+    if img_cu.shape[0] != B or img_cu.shape[1] != N:
+        raise ValueError("object and image batches must align")
+
+    obj_cu = cp.ascontiguousarray(obj_cu)
+    img_cu = cp.ascontiguousarray(img_cu)
+
+    K_np = np.asarray(_to_cpu(K), dtype=np.float32)
+    np_intri = np.empty((B, 4), dtype=np.float32)
+    if K_np.ndim == 2:
+        if K_np.shape != (3, 3):
+            raise ValueError("camera_matrix must be 3x3 or batched 3x3")
+        fx, fy = K_np[0, 0], K_np[1, 1]
+        cx, cy = K_np[0, 2], K_np[1, 2]
+        np_intri[:] = (fx, fy, cx, cy)
+    elif K_np.ndim == 3:
+        if K_np.shape[0] != B or K_np.shape[1:] != (3, 3):
+            raise ValueError("batched camera_matrix must match batch size and be 3x3")
+        np_intri[:, 0] = K_np[:, 0, 0]
+        np_intri[:, 1] = K_np[:, 1, 1]
+        np_intri[:, 2] = K_np[:, 0, 2]
+        np_intri[:, 3] = K_np[:, 1, 2]
+    else:
+        raise ValueError("camera_matrix must be 3x3 or batched 3x3")
+
+    intr_cu = cp.asarray(np_intri, dtype=cp.float32)
+    r_out = cp.empty((B, 3), dtype=cp.float32)
+    t_out = cp.empty((B, 3), dtype=cp.float32)
+
+    threads = 256
+    if N < threads:
+        threads = 32
+        while threads < N and threads < 256:
+            threads <<= 1
+        threads = max(threads, 32)
+    shared_mem = (36 + 6) * threads * np.dtype(np.float32).itemsize
+
+    _pnp_kernel(
+        (B,),
+        (threads,),
+        (
+            obj_cu,
+            img_cu,
+            np.int32(int(N)),
+            intr_cu,
+            np.int32(int(iterations)),
+            np.float32(damping),
+            r_out,
+            t_out,
+        ),
+        shared_mem=shared_mem,
+    )
+
+    r_host = cp.asnumpy(r_out).reshape(B, 3, 1).astype(np.float64, copy=False)
+    t_host = cp.asnumpy(t_out).reshape(B, 3, 1).astype(np.float64, copy=False)
+    if B == 1:
+        return r_host[0], t_host[0]
+    return r_host, t_host
+
+
+def _bgr_to_rgb_cuda(img):
+    return img[..., ::-1]
+
+
+def _rgb_to_bgr_cuda(img):
+    return img[..., ::-1]
+
+
+def _bgra_to_rgba_cuda(img):
+    out = img.copy()
+    out[..., 0], out[..., 2] = img[..., 2], img[..., 0]
+    return out
+
+
+def _rgba_to_bgra_cuda(img):
+    out = img.copy()
+    out[..., 0], out[..., 2] = img[..., 2], img[..., 0]
+    return out
+
+
+def _gray_to_rgb_cuda(gray):
+    return cp.stack([gray, gray, gray], axis=-1)  # type: ignore
+
+
+def _rgb_to_gray_cuda(rgb):
+    r = rgb[..., 0].astype(cp.float32)  # type: ignore
+    g = rgb[..., 1].astype(cp.float32)  # type: ignore
+    b = rgb[..., 2].astype(cp.float32)  # type: ignore
+    # These come from the Rec.601 conversion for YUV. R = 0.299, G = 0.587, B = 0.114
+    y = 0.299 * r + 0.587 * g + 0.114 * b
+    if rgb.dtype == cp.uint8:  # type: ignore
+        y = cp.clip(y, 0, 255).astype(cp.uint8)  # type: ignore
+    return y
+
+
+def _resize_bilinear_hwc_cuda(img, out_h: int, out_w: int):
+    if cp is None or cndimage is None:
+        raise RuntimeError("CuPy/CUDA not available")
+    if img.ndim not in (2, 3):
+        raise ValueError("Expected HxW or HxWxC array")
+
+    work = img[..., None] if img.ndim == 2 else img
+    squeezed = work is not img
+    in_h, in_w = work.shape[:2]
+    if (in_h, in_w) == (out_h, out_w):
+        return img.copy()
+
+    zoom = (out_h / in_h, out_w / in_w, 1.0)
+    out = cndimage.zoom(
+        work.astype(cp.float32, copy=False),
+        zoom=zoom,
+        order=1,
+        mode="nearest",
+        prefilter=False,
+        grid_mode=True,
+    )
+
+    if squeezed:
+        out = out[..., 0]
+    if img.dtype == cp.uint8:
+        out = cp.clip(out, 0, 255).astype(cp.uint8, copy=False)
+    elif out.dtype != img.dtype:
+        out = out.astype(img.dtype, copy=False)
+    return out
+
+
+def _rodrigues(x, inverse: bool = False):
+    """Unified Rodrigues transform (vector<->matrix) for NumPy/CuPy arrays."""
+
+    if cp is not None and (
+        isinstance(x, cp.ndarray)  # type: ignore[arg-type]
+        or getattr(x, "__cuda_array_interface__", None) is not None
+    ):
+        xp = cp
+    else:
+        xp = np
+    arr = xp.asarray(x, dtype=xp.float64)
+
+    if not inverse and arr.ndim >= 2 and arr.shape[-2:] == (3, 3):
+        inverse = True
+
+    if not inverse:
+        vec = arr
+        if vec.ndim >= 2 and vec.shape[-1] == 1:
+            vec = vec[..., 0]
+        if vec.shape[-1] != 3:
+            raise ValueError("Rodrigues expects vectors of shape (..., 3)")
+        orig_shape = vec.shape[:-1]
+        vec = vec.reshape(-1, 3)
+        n = vec.shape[0]
+        theta = xp.linalg.norm(vec, axis=1)
+        small = theta < 1e-12
+
+        def _skew(v):
+            vx, vy, vz = v[:, 0], v[:, 1], v[:, 2]
+            O = xp.zeros_like(vx)
+            return xp.stack(
+                [
+                    xp.stack([O, -vz, vy], axis=-1),
+                    xp.stack([vz, O, -vx], axis=-1),
+                    xp.stack([-vy, vx, O], axis=-1),
+                ],
+                axis=-2,
+            )
+
+        K = _skew(vec)
+        theta2 = theta * theta
+        theta4 = theta2 * theta2
+        theta_safe = xp.where(small, 1.0, theta)
+        theta2_safe = xp.where(small, 1.0, theta2)
+        A = xp.where(small, 1.0 - theta2 / 6.0 + theta4 / 120.0, xp.sin(theta) / theta_safe)[
+            :, None, None
+        ]
+        B = xp.where(
+            small,
+            0.5 - theta2 / 24.0 + theta4 / 720.0,
+            (1.0 - xp.cos(theta)) / theta2_safe,
+        )[:, None, None]
+        I = xp.eye(3, dtype=arr.dtype)
+        I = I[None, :, :] if n == 1 else xp.broadcast_to(I, (n, 3, 3))
+        KK = xp.matmul(K, K)
+        out = I + A * K + B * KK
+        return out.reshape(orig_shape + (3, 3)) if orig_shape else out[0]
+
+    mat = arr
+    if mat.shape[-2:] != (3, 3):
+        raise ValueError("Rodrigues expects rotation matrices of shape (..., 3, 3)")
+    orig_shape = mat.shape[:-2]
+    mat = mat.reshape(-1, 3, 3)
+    trace = xp.trace(mat, axis1=1, axis2=2)
+    trace = xp.clip((trace - 1.0) / 2.0, -1.0, 1.0)
+    theta = xp.arccos(trace)
+    v = xp.stack(
+        [
+            mat[:, 2, 1] - mat[:, 1, 2],
+            mat[:, 0, 2] - mat[:, 2, 0],
+            mat[:, 1, 0] - mat[:, 0, 1],
+        ],
+        axis=1,
+    )
+    norm_v = xp.linalg.norm(v, axis=1)
+    small = theta < 1e-7
+    eps = 1e-8
+    norm_safe = xp.where(norm_v < eps, 1.0, norm_v)
+    r_general = theta[:, None] * v / norm_safe[:, None]
+    r_small = 0.5 * v
+    r = xp.where(small[:, None], r_small, r_general)
+    pi_mask = xp.abs(theta - xp.pi) < 1e-4
+    if np.any(pi_mask) if xp is np else bool(cp.asnumpy(pi_mask).any()):
+        diag = xp.diagonal(mat, axis1=1, axis2=2)
+        axis_candidates = xp.clip((diag + 1.0) / 2.0, 0.0, None)
+        axis = xp.sqrt(axis_candidates)
+        signs = xp.sign(v)
+        axis = xp.where(signs == 0, axis, xp.copysign(axis, signs))
+        axis_norm = xp.linalg.norm(axis, axis=1)
+        axis_norm = xp.where(axis_norm < eps, 1.0, axis_norm)
+        axis = axis / axis_norm[:, None]
+        r_pi = theta[:, None] * axis
+        r = xp.where(pi_mask[:, None], r_pi, r)
+    out = r.reshape(orig_shape + (3,)) if orig_shape else r[0]
+    return out
+
+
+def _undistort_points_cuda(
+    img_px: "cp.ndarray", K: "cp.ndarray", dist: "cp.ndarray", iterations: int = 8
+) -> "cp.ndarray":
+    """Iteratively undistort pixel coordinates on device (Brown–Conrady).
+
+    Returns pixel coordinates after undistortion (fx*xu+cx, fy*yu+cy).
+    """
+    N = img_px.shape[0]
+    ones = cp.ones((N, 1), dtype=cp.float64)
+    uv1 = cp.concatenate([img_px.astype(cp.float64), ones], axis=1)
+    Kinv = cp.linalg.inv(K)
+    xdyd1 = uv1 @ Kinv.T
+    xd = xdyd1[:, 0]
+    yd = xdyd1[:, 1]
+    xu = xd.copy()
+    yu = yd.copy()
+    k1 = dist[0]
+    k2 = dist[1] if dist.size > 1 else 0.0
+    p1 = dist[2] if dist.size > 2 else 0.0
+    p2 = dist[3] if dist.size > 3 else 0.0
+    k3 = dist[4] if dist.size > 4 else 0.0
+    for _ in range(iterations):
+        r2 = xu * xu + yu * yu
+        r4 = r2 * r2
+        r6 = r4 * r2
+        radial = 1.0 + k1 * r2 + k2 * r4 + k3 * r6
+        delta_x = 2.0 * p1 * xu * yu + p2 * (r2 + 2.0 * xu * xu)
+        delta_y = p1 * (r2 + 2.0 * yu * yu) + 2.0 * p2 * xu * yu
+        xu = (xd - delta_x) / radial
+        yu = (yd - delta_y) / radial
+    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
+    return cp.stack([fx * xu + cx, fy * yu + cy], axis=1)
+
+
+@dataclass
+class CudaImage(AbstractImage):
+    data: any  # cupy.ndarray
+    format: ImageFormat = field(default=ImageFormat.BGR)
+    frame_id: str = field(default="")
+    ts: float = field(default_factory=time.time)
+
+    def __post_init__(self):
+        if not HAS_CUDA or cp is None:
+            raise RuntimeError("CuPy/CUDA not available")
+        if not _is_cu(self.data):
+            # Accept NumPy arrays and move to device automatically
+            try:
+                self.data = cp.asarray(self.data)
+            except Exception as e:
+                raise ValueError("CudaImage requires a CuPy array") from e
+        if self.data.ndim < 2:
+            raise ValueError("Image data must be at least 2D")
+        self.data = _ascontig(self.data)
+
+    @property
+    def is_cuda(self) -> bool:
+        return True
+
+    def to_opencv(self) -> np.ndarray:
+        if self.format in (ImageFormat.BGR, ImageFormat.RGB, ImageFormat.RGBA, ImageFormat.BGRA):
+            return _to_cpu(self.to_bgr().data)
+        return _to_cpu(self.data)
+
+    def to_rgb(self) -> "CudaImage":
+        if self.format == ImageFormat.RGB:
+            return self.copy()  # type: ignore
+        if self.format == ImageFormat.BGR:
+            return CudaImage(_bgr_to_rgb_cuda(self.data), ImageFormat.RGB, self.frame_id, self.ts)
+        if self.format == ImageFormat.RGBA:
+            return self.copy()  # type: ignore
+        if self.format == ImageFormat.BGRA:
+            return CudaImage(
+                _bgra_to_rgba_cuda(self.data), ImageFormat.RGBA, self.frame_id, self.ts
+            )
+        if self.format == ImageFormat.GRAY:
+            return CudaImage(_gray_to_rgb_cuda(self.data), ImageFormat.RGB, self.frame_id, self.ts)
+        if self.format in (ImageFormat.GRAY16, ImageFormat.DEPTH16):
+            gray8 = (self.data.astype(cp.float32) / 256.0).clip(0, 255).astype(cp.uint8)  # type: ignore
+            return CudaImage(_gray_to_rgb_cuda(gray8), ImageFormat.RGB, self.frame_id, self.ts)
+        return self.copy()  # type: ignore
+
+    def to_bgr(self) -> "CudaImage":
+        if self.format == ImageFormat.BGR:
+            return self.copy()  # type: ignore
+        if self.format == ImageFormat.RGB:
+            return CudaImage(_rgb_to_bgr_cuda(self.data), ImageFormat.BGR, self.frame_id, self.ts)
+        if self.format == ImageFormat.RGBA:
+            return CudaImage(
+                _rgba_to_bgra_cuda(self.data)[..., :3], ImageFormat.BGR, self.frame_id, self.ts
+            )
+        if self.format == ImageFormat.BGRA:
+            return CudaImage(self.data[..., :3], ImageFormat.BGR, self.frame_id, self.ts)
+        if self.format in (ImageFormat.GRAY, ImageFormat.DEPTH):
+            return CudaImage(
+                _rgb_to_bgr_cuda(_gray_to_rgb_cuda(self.data)),
+                ImageFormat.BGR,
+                self.frame_id,
+                self.ts,
+            )
+        if self.format in (ImageFormat.GRAY16, ImageFormat.DEPTH16):
+            gray8 = (self.data.astype(cp.float32) / 256.0).clip(0, 255).astype(cp.uint8)  # type: ignore
+            return CudaImage(
+                _rgb_to_bgr_cuda(_gray_to_rgb_cuda(gray8)), ImageFormat.BGR, self.frame_id, self.ts
+            )
+        return self.copy()  # type: ignore
+
+    def to_grayscale(self) -> "CudaImage":
+        if self.format in (ImageFormat.GRAY, ImageFormat.GRAY16, ImageFormat.DEPTH):
+            return self.copy()  # type: ignore
+        if self.format == ImageFormat.BGR:
+            return CudaImage(
+                _rgb_to_gray_cuda(_bgr_to_rgb_cuda(self.data)),
+                ImageFormat.GRAY,
+                self.frame_id,
+                self.ts,
+            )
+        if self.format == ImageFormat.RGB:
+            return CudaImage(_rgb_to_gray_cuda(self.data), ImageFormat.GRAY, self.frame_id, self.ts)
+        if self.format in (ImageFormat.RGBA, ImageFormat.BGRA):
+            rgb = (
+                self.data[..., :3]
+                if self.format == ImageFormat.RGBA
+                else _bgra_to_rgba_cuda(self.data)[..., :3]
+            )
+            return CudaImage(_rgb_to_gray_cuda(rgb), ImageFormat.GRAY, self.frame_id, self.ts)
+        raise ValueError(f"Unsupported format: {self.format}")
+
+    def resize(self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR) -> "CudaImage":
+        return CudaImage(
+            _resize_bilinear_hwc_cuda(self.data, height, width), self.format, self.frame_id, self.ts
+        )
+
+    def sharpness(self) -> float:
+        if cp is None:
+            return 0.0
+        try:
+            from cupyx.scipy import ndimage as cndimage  # type: ignore
+
+            gray = self.to_grayscale().data.astype(cp.float32)
+            deriv5 = cp.asarray([1, 2, 0, -2, -1], dtype=cp.float32)
+            smooth5 = cp.asarray([1, 4, 6, 4, 1], dtype=cp.float32)
+            gx = cndimage.convolve1d(gray, deriv5, axis=1, mode="reflect")  # type: ignore
+            gx = cndimage.convolve1d(gx, smooth5, axis=0, mode="reflect")  # type: ignore
+            gy = cndimage.convolve1d(gray, deriv5, axis=0, mode="reflect")  # type: ignore
+            gy = cndimage.convolve1d(gy, smooth5, axis=1, mode="reflect")  # type: ignore
+            magnitude = cp.hypot(gx, gy)  # type: ignore
+            mean_mag = float(cp.asnumpy(magnitude.mean()))  # type: ignore
+        except Exception:
+            return 0.0
+        if mean_mag <= 0:
+            return 0.0
+        return float(np.clip((np.log10(mean_mag + 1) - 1.7) / 2.0, 0.0, 1.0))
+
+    # CUDA tracker (template NCC with small scale pyramid)
+    @dataclass
+    class BBox:
+        x: int
+        y: int
+        w: int
+        h: int
+
+    def create_csrt_tracker(self, bbox: BBox):
+        if csignal is None:
+            raise RuntimeError("cupyx.scipy.signal not available for CUDA tracker")
+        x, y, w, h = map(int, bbox)
+        gray = self.to_grayscale().data.astype(cp.float32)
+        tmpl = gray[y : y + h, x : x + w]
+        if tmpl.size == 0:
+            raise ValueError("Invalid bbox for CUDA tracker")
+        return _CudaTemplateTracker(tmpl, x0=x, y0=y)
+
+    def csrt_update(self, tracker) -> Tuple[bool, Tuple[int, int, int, int]]:
+        if not isinstance(tracker, _CudaTemplateTracker):
+            raise TypeError("Expected CUDA tracker instance")
+        gray = self.to_grayscale().data.astype(cp.float32)
+        x, y, w, h = tracker.update(gray)
+        return True, (int(x), int(y), int(w), int(h))
+
+    # PnP – Gauss–Newton (no distortion in batch), iterative per-instance
+    def solve_pnp(
+        self,
+        object_points: np.ndarray,
+        image_points: np.ndarray,
+        camera_matrix: np.ndarray,
+        dist_coeffs: Optional[np.ndarray] = None,
+        flags: int = cv2.SOLVEPNP_ITERATIVE,
+    ) -> Tuple[bool, np.ndarray, np.ndarray]:
+        if not HAS_CUDA or cp is None or (dist_coeffs is not None and np.any(dist_coeffs)):
+            obj = np.asarray(object_points, dtype=np.float32).reshape(-1, 3)
+            img = np.asarray(image_points, dtype=np.float32).reshape(-1, 2)
+            K = np.asarray(camera_matrix, dtype=np.float64)
+            dist = None if dist_coeffs is None else np.asarray(dist_coeffs, dtype=np.float64)
+            ok, rvec, tvec = cv2.solvePnP(obj, img, K, dist, flags=flags)
+            return bool(ok), rvec.astype(np.float64), tvec.astype(np.float64)
+
+        rvec, tvec = _solve_pnp_cuda_kernel(object_points, image_points, camera_matrix)
+        ok = np.isfinite(rvec).all() and np.isfinite(tvec).all()
+        return ok, rvec, tvec
+
+    def solve_pnp_batch(
+        self,
+        object_points_batch: np.ndarray,
+        image_points_batch: np.ndarray,
+        camera_matrix: np.ndarray,
+        dist_coeffs: Optional[np.ndarray] = None,
+        iterations: int = 15,
+        damping: float = 1e-6,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """Batched PnP (each block = one instance)."""
+        if not HAS_CUDA or cp is None or (dist_coeffs is not None and np.any(dist_coeffs)):
+            obj = np.asarray(object_points_batch, dtype=np.float32)
+            img = np.asarray(image_points_batch, dtype=np.float32)
+            if obj.ndim != 3 or img.ndim != 3 or obj.shape[:2] != img.shape[:2]:
+                raise ValueError(
+                    "Batched object/image arrays must be shaped (B,N,...) with matching sizes"
+                )
+            K = np.asarray(camera_matrix, dtype=np.float64)
+            dist = None if dist_coeffs is None else np.asarray(dist_coeffs, dtype=np.float64)
+            B = obj.shape[0]
+            r_list = np.empty((B, 3, 1), dtype=np.float64)
+            t_list = np.empty((B, 3, 1), dtype=np.float64)
+            for b in range(B):
+                K_b = K if K.ndim == 2 else K[b]
+                dist_b = None
+                if dist is not None:
+                    if dist.ndim == 1:
+                        dist_b = dist
+                    elif dist.ndim == 2:
+                        dist_b = dist[b]
+                    else:
+                        raise ValueError("dist_coeffs must be 1D or batched 2D")
+                ok, rvec, tvec = cv2.solvePnP(
+                    obj[b], img[b], K_b, dist_b, flags=cv2.SOLVEPNP_ITERATIVE
+                )
+                if not ok:
+                    raise RuntimeError(f"cv2.solvePnP failed for batch index {b}")
+                r_list[b] = rvec.astype(np.float64)
+                t_list[b] = tvec.astype(np.float64)
+            return r_list, t_list
+
+        return _solve_pnp_cuda_kernel(
+            object_points_batch,
+            image_points_batch,
+            camera_matrix,
+            iterations=iterations,
+            damping=damping,
+        )
+
+    def solve_pnp_ransac(
+        self,
+        object_points: np.ndarray,
+        image_points: np.ndarray,
+        camera_matrix: np.ndarray,
+        dist_coeffs: Optional[np.ndarray] = None,
+        iterations_count: int = 100,
+        reprojection_error: float = 3.0,
+        confidence: float = 0.99,
+        min_sample: int = 6,
+    ) -> Tuple[bool, np.ndarray, np.ndarray, np.ndarray]:
+        """RANSAC with CUDA PnP solver."""
+        if not HAS_CUDA or cp is None or (dist_coeffs is not None and np.any(dist_coeffs)):
+            obj = np.asarray(object_points, dtype=np.float32)
+            img = np.asarray(image_points, dtype=np.float32)
+            K = np.asarray(camera_matrix, dtype=np.float64)
+            dist = None if dist_coeffs is None else np.asarray(dist_coeffs, dtype=np.float64)
+            ok, rvec, tvec, mask = cv2.solvePnPRansac(
+                obj,
+                img,
+                K,
+                dist,
+                iterationsCount=int(iterations_count),
+                reprojectionError=float(reprojection_error),
+                confidence=float(confidence),
+                flags=cv2.SOLVEPNP_ITERATIVE,
+            )
+            mask_flat = np.zeros((obj.shape[0],), dtype=np.uint8)
+            if mask is not None and len(mask) > 0:
+                mask_flat[mask.flatten()] = 1
+            return bool(ok), rvec.astype(np.float64), tvec.astype(np.float64), mask_flat
+
+        obj = cp.asarray(object_points, dtype=cp.float32)
+        img = cp.asarray(image_points, dtype=cp.float32)
+        camera_matrix_np = np.asarray(_to_cpu(camera_matrix), dtype=np.float32)
+        fx = float(camera_matrix_np[0, 0])
+        fy = float(camera_matrix_np[1, 1])
+        cx = float(camera_matrix_np[0, 2])
+        cy = float(camera_matrix_np[1, 2])
+        N = obj.shape[0]
+        rng = cp.random.RandomState(1234)
+        best_inliers = -1
+        best_r, best_t, best_mask = None, None, None
+
+        for _ in range(iterations_count):
+            idx = rng.choice(N, size=min_sample, replace=False)
+            rvec, tvec = _solve_pnp_cuda_kernel(obj[idx], img[idx], camera_matrix_np)
+            R = _rodrigues(cp.asarray(rvec.flatten()))
+            Xc = obj @ R.T + cp.asarray(tvec.flatten())
+            invZ = 1.0 / cp.clip(Xc[:, 2], 1e-6, None)
+            u_hat = fx * Xc[:, 0] * invZ + cx
+            v_hat = fy * Xc[:, 1] * invZ + cy
+            err = cp.sqrt((img[:, 0] - u_hat) ** 2 + (img[:, 1] - v_hat) ** 2)
+            mask = (err < reprojection_error).astype(cp.uint8)
+            inliers = int(mask.sum())
+            if inliers > best_inliers:
+                best_inliers, best_r, best_t, best_mask = inliers, rvec, tvec, mask
+                if inliers >= int(confidence * N):
+                    break
+
+        if best_inliers <= 0:
+            return False, np.zeros((3, 1)), np.zeros((3, 1)), np.zeros((N,), dtype=np.uint8)
+        in_idx = cp.nonzero(best_mask)[0]
+        rvec, tvec = _solve_pnp_cuda_kernel(obj[in_idx], img[in_idx], camera_matrix_np)
+        return True, rvec, tvec, cp.asnumpy(best_mask)
+
+
+class _CudaTemplateTracker:
+    def __init__(
+        self,
+        tmpl: "cp.ndarray",
+        scale_step: float = 1.05,
+        lr: float = 0.1,
+        search_radius: int = 16,
+        x0: int = 0,
+        y0: int = 0,
+    ):
+        self.tmpl = tmpl.astype(cp.float32)
+        self.h, self.w = int(tmpl.shape[0]), int(tmpl.shape[1])
+        self.scale_step = float(scale_step)
+        self.lr = float(lr)
+        self.search_radius = int(search_radius)
+        # Cosine window
+        wy = cp.hanning(self.h).astype(cp.float32)
+        wx = cp.hanning(self.w).astype(cp.float32)
+        self.window = wy[:, None] * wx[None, :]
+        self.tmpl = self.tmpl * self.window
+        self.y = int(y0)
+        self.x = int(x0)
+
+    def update(self, gray: "cp.ndarray"):
+        H, W = int(gray.shape[0]), int(gray.shape[1])
+        r = self.search_radius
+        x0 = max(0, self.x - r)
+        y0 = max(0, self.y - r)
+        x1 = min(W, self.x + self.w + r)
+        y1 = min(H, self.y + self.h + r)
+        search = gray[y0:y1, x0:x1]
+        if search.shape[0] < self.h or search.shape[1] < self.w:
+            search = gray
+            x0 = y0 = 0
+        best = (self.x, self.y, self.w, self.h)
+        best_score = -1e9
+        for s in (1.0 / self.scale_step, 1.0, self.scale_step):
+            th = max(1, int(round(self.h * s)))
+            tw = max(1, int(round(self.w * s)))
+            tmpl_s = _resize_bilinear_hwc_cuda(self.tmpl, th, tw)
+            if tmpl_s.ndim == 3:
+                tmpl_s = tmpl_s[..., 0]
+            tmpl_s = tmpl_s.astype(cp.float32)
+            tmpl_zm = tmpl_s - tmpl_s.mean()
+            tmpl_energy = cp.sqrt(cp.sum(tmpl_zm * tmpl_zm)) + 1e-6
+            # NCC via correlate2d and local std
+            ones = cp.ones((th, tw), dtype=cp.float32)
+            num = csignal.correlate2d(search, tmpl_zm, mode="valid")  # type: ignore
+            sumS = csignal.correlate2d(search, ones, mode="valid")  # type: ignore
+            sumS2 = csignal.correlate2d(search * search, ones, mode="valid")  # type: ignore
+            n = float(th * tw)
+            meanS = sumS / n
+            varS = cp.clip(sumS2 - n * meanS * meanS, 0.0, None)
+            stdS = cp.sqrt(varS) + 1e-6
+            res = num / (stdS * tmpl_energy)
+            ij = cp.unravel_index(cp.argmax(res), res.shape)
+            dy, dx = int(ij[0].get()), int(ij[1].get())  # type: ignore
+            score = float(res[ij].get())  # type: ignore
+            if score > best_score:
+                best_score = score
+                best = (x0 + dx, y0 + dy, tw, th)
+        x, y, w, h = best
+        patch = gray[y : y + h, x : x + w]
+        if patch.shape[0] != self.h or patch.shape[1] != self.w:
+            patch = _resize_bilinear_hwc_cuda(patch, self.h, self.w)
+            if patch.ndim == 3:
+                patch = patch[..., 0]
+        patch = patch.astype(cp.float32) * self.window
+        self.tmpl = (1.0 - self.lr) * self.tmpl + self.lr * patch
+        self.x, self.y, self.w, self.h = x, y, w, h
+        return x, y, w, h
diff --git a/dimos/msgs/sensor_msgs/image_impls/NumpyImage.py b/dimos/msgs/sensor_msgs/image_impls/NumpyImage.py
new file mode 100644
index 0000000000..da04005d4f
--- /dev/null
+++ b/dimos/msgs/sensor_msgs/image_impls/NumpyImage.py
@@ -0,0 +1,214 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass, field
+from typing import Optional, Tuple
+
+import cv2
+import numpy as np
+
+from dimos.msgs.sensor_msgs.image_impls.AbstractImage import (
+    AbstractImage,
+    ImageFormat,
+)
+
+
+@dataclass
+class NumpyImage(AbstractImage):
+    data: np.ndarray
+    format: ImageFormat = field(default=ImageFormat.BGR)
+    frame_id: str = field(default="")
+    ts: float = field(default_factory=time.time)
+
+    def __post_init__(self):
+        if not isinstance(self.data, np.ndarray) or self.data.ndim < 2:
+            raise ValueError("NumpyImage requires a 2D/3D NumPy array")
+
+    @property
+    def is_cuda(self) -> bool:
+        return False
+
+    def to_opencv(self) -> np.ndarray:
+        arr = self.data
+        if self.format == ImageFormat.BGR:
+            return arr
+        if self.format == ImageFormat.RGB:
+            return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
+        if self.format == ImageFormat.RGBA:
+            return cv2.cvtColor(arr, cv2.COLOR_RGBA2BGR)
+        if self.format == ImageFormat.BGRA:
+            return cv2.cvtColor(arr, cv2.COLOR_BGRA2BGR)
+        if self.format in (
+            ImageFormat.GRAY,
+            ImageFormat.GRAY16,
+            ImageFormat.DEPTH,
+            ImageFormat.DEPTH16,
+        ):
+            return arr
+        raise ValueError(f"Unsupported format: {self.format}")
+
+    def to_rgb(self) -> "NumpyImage":
+        if self.format == ImageFormat.RGB:
+            return self.copy()  # type: ignore
+        arr = self.data
+        if self.format == ImageFormat.BGR:
+            return NumpyImage(
+                cv2.cvtColor(arr, cv2.COLOR_BGR2RGB), ImageFormat.RGB, self.frame_id, self.ts
+            )
+        if self.format == ImageFormat.RGBA:
+            return self.copy()  # RGBA contains RGB + alpha
+        if self.format == ImageFormat.BGRA:
+            rgba = cv2.cvtColor(arr, cv2.COLOR_BGRA2RGBA)
+            return NumpyImage(rgba, ImageFormat.RGBA, self.frame_id, self.ts)
+        if self.format in (ImageFormat.GRAY, ImageFormat.GRAY16, ImageFormat.DEPTH16):
+            gray8 = (arr / 256).astype(np.uint8) if self.format != ImageFormat.GRAY else arr
+            rgb = cv2.cvtColor(gray8, cv2.COLOR_GRAY2RGB)
+            return NumpyImage(rgb, ImageFormat.RGB, self.frame_id, self.ts)
+        return self.copy()  # type: ignore
+
+    def to_bgr(self) -> "NumpyImage":
+        if self.format == ImageFormat.BGR:
+            return self.copy()  # type: ignore
+        arr = self.data
+        if self.format == ImageFormat.RGB:
+            return NumpyImage(
+                cv2.cvtColor(arr, cv2.COLOR_RGB2BGR), ImageFormat.BGR, self.frame_id, self.ts
+            )
+        if self.format == ImageFormat.RGBA:
+            return NumpyImage(
+                cv2.cvtColor(arr, cv2.COLOR_RGBA2BGR), ImageFormat.BGR, self.frame_id, self.ts
+            )
+        if self.format == ImageFormat.BGRA:
+            return NumpyImage(
+                cv2.cvtColor(arr, cv2.COLOR_BGRA2BGR), ImageFormat.BGR, self.frame_id, self.ts
+            )
+        if self.format in (ImageFormat.GRAY, ImageFormat.GRAY16, ImageFormat.DEPTH16):
+            gray8 = (arr / 256).astype(np.uint8) if self.format != ImageFormat.GRAY else arr
+            return NumpyImage(
+                cv2.cvtColor(gray8, cv2.COLOR_GRAY2BGR), ImageFormat.BGR, self.frame_id, self.ts
+            )
+        return self.copy()  # type: ignore
+
+    def to_grayscale(self) -> "NumpyImage":
+        if self.format in (ImageFormat.GRAY, ImageFormat.GRAY16, ImageFormat.DEPTH):
+            return self.copy()  # type: ignore
+        if self.format == ImageFormat.BGR:
+            return NumpyImage(
+                cv2.cvtColor(self.data, cv2.COLOR_BGR2GRAY),
+                ImageFormat.GRAY,
+                self.frame_id,
+                self.ts,
+            )
+        if self.format == ImageFormat.RGB:
+            return NumpyImage(
+                cv2.cvtColor(self.data, cv2.COLOR_RGB2GRAY),
+                ImageFormat.GRAY,
+                self.frame_id,
+                self.ts,
+            )
+        if self.format in (ImageFormat.RGBA, ImageFormat.BGRA):
+            code = cv2.COLOR_RGBA2GRAY if self.format == ImageFormat.RGBA else cv2.COLOR_BGRA2GRAY
+            return NumpyImage(
+                cv2.cvtColor(self.data, code), ImageFormat.GRAY, self.frame_id, self.ts
+            )
+        raise ValueError(f"Unsupported format: {self.format}")
+
+    def resize(
+        self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR
+    ) -> "NumpyImage":
+        return NumpyImage(
+            cv2.resize(self.data, (width, height), interpolation=interpolation),
+            self.format,
+            self.frame_id,
+            self.ts,
+        )
+
+    def sharpness(self) -> float:
+        gray = self.to_grayscale()
+        sx = cv2.Sobel(gray.data, cv2.CV_32F, 1, 0, ksize=5)
+        sy = cv2.Sobel(gray.data, cv2.CV_32F, 0, 1, ksize=5)
+        magnitude = cv2.magnitude(sx, sy)
+        mean_mag = float(magnitude.mean())
+        if mean_mag <= 0:
+            return 0.0
+        return float(np.clip((np.log10(mean_mag + 1) - 1.7) / 2.0, 0.0, 1.0))
+
+    # PnP wrappers
+    def solve_pnp(
+        self,
+        object_points: np.ndarray,
+        image_points: np.ndarray,
+        camera_matrix: np.ndarray,
+        dist_coeffs: Optional[np.ndarray] = None,
+        flags: int = cv2.SOLVEPNP_ITERATIVE,
+    ) -> Tuple[bool, np.ndarray, np.ndarray]:
+        obj = np.asarray(object_points, dtype=np.float32).reshape(-1, 3)
+        img = np.asarray(image_points, dtype=np.float32).reshape(-1, 2)
+        K = np.asarray(camera_matrix, dtype=np.float64)
+        dist = None if dist_coeffs is None else np.asarray(dist_coeffs, dtype=np.float64)
+        ok, rvec, tvec = cv2.solvePnP(obj, img, K, dist, flags=flags)
+        return bool(ok), rvec.astype(np.float64), tvec.astype(np.float64)
+
+    def create_csrt_tracker(self, bbox: Tuple[int, int, int, int]):
+        tracker = None
+        if hasattr(cv2, "legacy") and hasattr(cv2.legacy, "TrackerCSRT_create"):
+            tracker = cv2.legacy.TrackerCSRT_create()
+        elif hasattr(cv2, "TrackerCSRT_create"):
+            tracker = cv2.TrackerCSRT_create()
+        else:
+            raise RuntimeError("OpenCV CSRT tracker not available")
+        ok = tracker.init(self.to_bgr().to_opencv(), tuple(map(int, bbox)))
+        if not ok:
+            raise RuntimeError("Failed to initialize CSRT tracker")
+        return tracker
+
+    def csrt_update(self, tracker) -> Tuple[bool, Tuple[int, int, int, int]]:
+        ok, box = tracker.update(self.to_bgr().to_opencv())
+        if not ok:
+            return False, (0, 0, 0, 0)
+        x, y, w, h = map(int, box)
+        return True, (x, y, w, h)
+
+    def solve_pnp_ransac(
+        self,
+        object_points: np.ndarray,
+        image_points: np.ndarray,
+        camera_matrix: np.ndarray,
+        dist_coeffs: Optional[np.ndarray] = None,
+        iterations_count: int = 100,
+        reprojection_error: float = 3.0,
+        confidence: float = 0.99,
+        min_sample: int = 6,
+    ) -> Tuple[bool, np.ndarray, np.ndarray, np.ndarray]:
+        obj = np.asarray(object_points, dtype=np.float32).reshape(-1, 3)
+        img = np.asarray(image_points, dtype=np.float32).reshape(-1, 2)
+        K = np.asarray(camera_matrix, dtype=np.float64)
+        dist = None if dist_coeffs is None else np.asarray(dist_coeffs, dtype=np.float64)
+        ok, rvec, tvec, inliers = cv2.solvePnPRansac(
+            obj,
+            img,
+            K,
+            dist,
+            iterationsCount=int(iterations_count),
+            reprojectionError=float(reprojection_error),
+            confidence=float(confidence),
+            flags=cv2.SOLVEPNP_ITERATIVE,
+        )
+        mask = np.zeros((obj.shape[0],), dtype=np.uint8)
+        if inliers is not None and len(inliers) > 0:
+            mask[inliers.flatten()] = 1
+        return bool(ok), rvec.astype(np.float64), tvec.astype(np.float64), mask
diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py
index f4bad0645e..1ce3931c2f 100644
--- a/dimos/perception/common/utils.py
+++ b/dimos/perception/common/utils.py
@@ -28,6 +28,32 @@
 
 logger = setup_logger("dimos.perception.common.utils")
 
+# Optional CuPy support
+try:  # pragma: no cover - optional dependency
+    import cupy as cp  # type: ignore
+
+    _HAS_CUDA = True
+except Exception:  # pragma: no cover - optional dependency
+    cp = None  # type: ignore
+    _HAS_CUDA = False
+
+
+def _is_cu_array(x) -> bool:
+    return _HAS_CUDA and cp is not None and isinstance(x, cp.ndarray)  # type: ignore
+
+
+def _to_numpy(x):
+    return cp.asnumpy(x) if _is_cu_array(x) else x  # type: ignore
+
+
+def _to_cupy(x):
+    if _HAS_CUDA and cp is not None and isinstance(x, np.ndarray):  # type: ignore
+        try:
+            return cp.asarray(x)  # type: ignore
+        except Exception:
+            return x
+    return x
+
 
 def load_camera_info(yaml_path: str, frame_id: str = "camera_link") -> CameraInfo:
     """
@@ -115,6 +141,135 @@ def load_camera_info_opencv(yaml_path: str) -> Tuple[np.ndarray, np.ndarray]:
     return K, dist
 
 
+def rectify_image_cpu(image: Image, camera_matrix: np.ndarray, dist_coeffs: np.ndarray) -> Image:
+    """CPU rectification using OpenCV. Preserves backend by caller.
+
+    Returns an Image with numpy or cupy data depending on caller choice.
+    """
+    src = _to_numpy(image.data)
+    rect = cv2.undistort(src, camera_matrix, dist_coeffs)
+    # Caller decides whether to convert back to GPU.
+    return Image(data=rect, format=image.format, frame_id=image.frame_id, ts=image.ts)
+
+
+def rectify_image_cuda(image: Image, camera_matrix: np.ndarray, dist_coeffs: np.ndarray) -> Image:
+    """GPU rectification using CuPy bilinear sampling.
+
+    Generates an undistorted output grid and samples from the distorted source.
+    Falls back to CPU if CUDA not available.
+    """
+    if not _HAS_CUDA or cp is None or not image.is_cuda:  # type: ignore
+        return rectify_image_cpu(image, camera_matrix, dist_coeffs)
+
+    xp = cp  # type: ignore
+
+    # Source (distorted) image on device
+    src = image.data
+    if src.ndim not in (2, 3):
+        raise ValueError("Unsupported image rank for rectification")
+    H, W = int(src.shape[0]), int(src.shape[1])
+
+    # Extract intrinsics and distortion as float64
+    K = xp.asarray(camera_matrix, dtype=xp.float64)
+    dist = xp.asarray(dist_coeffs, dtype=xp.float64).reshape(-1)
+    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
+    k1 = dist[0] if dist.size > 0 else 0.0
+    k2 = dist[1] if dist.size > 1 else 0.0
+    p1 = dist[2] if dist.size > 2 else 0.0
+    p2 = dist[3] if dist.size > 3 else 0.0
+    k3 = dist[4] if dist.size > 4 else 0.0
+
+    # Build undistorted target grid (pixel coords)
+    u = xp.arange(W, dtype=xp.float64)
+    v = xp.arange(H, dtype=xp.float64)
+    uu, vv = xp.meshgrid(u, v, indexing="xy")
+
+    # Convert to normalized undistorted coords
+    xu = (uu - cx) / fx
+    yu = (vv - cy) / fy
+
+    # Apply forward distortion model to get distorted normalized coords
+    r2 = xu * xu + yu * yu
+    r4 = r2 * r2
+    r6 = r4 * r2
+    radial = 1.0 + k1 * r2 + k2 * r4 + k3 * r6
+    delta_x = 2.0 * p1 * xu * yu + p2 * (r2 + 2.0 * xu * xu)
+    delta_y = p1 * (r2 + 2.0 * yu * yu) + 2.0 * p2 * xu * yu
+    xd = xu * radial + delta_x
+    yd = yu * radial + delta_y
+
+    # Back to pixel coordinates in the source (distorted) image
+    us = fx * xd + cx
+    vs = fy * yd + cy
+
+    # Bilinear sample from src at (vs, us)
+    def _bilinear_sample_cuda(img, x_src, y_src):
+        h, w = int(img.shape[0]), int(img.shape[1])
+        # Base integer corners (not clamped)
+        x0i = xp.floor(x_src).astype(xp.int32)
+        y0i = xp.floor(y_src).astype(xp.int32)
+        x1i = x0i + 1
+        y1i = y0i + 1
+
+        # Masks for in-bounds neighbors (BORDER_CONSTANT behavior)
+        m00 = (x0i >= 0) & (x0i < w) & (y0i >= 0) & (y0i < h)
+        m10 = (x1i >= 0) & (x1i < w) & (y0i >= 0) & (y0i < h)
+        m01 = (x0i >= 0) & (x0i < w) & (y1i >= 0) & (y1i < h)
+        m11 = (x1i >= 0) & (x1i < w) & (y1i >= 0) & (y1i < h)
+
+        # Clamp indices for safe gather, but multiply contributions by masks
+        x0 = xp.clip(x0i, 0, w - 1)
+        y0 = xp.clip(y0i, 0, h - 1)
+        x1 = xp.clip(x1i, 0, w - 1)
+        y1 = xp.clip(y1i, 0, h - 1)
+
+        # Weights
+        wx = (x_src - x0i).astype(xp.float64)
+        wy = (y_src - y0i).astype(xp.float64)
+        w00 = (1.0 - wx) * (1.0 - wy)
+        w10 = wx * (1.0 - wy)
+        w01 = (1.0 - wx) * wy
+        w11 = wx * wy
+
+        # Cast masks for arithmetic
+        m00f = m00.astype(xp.float64)
+        m10f = m10.astype(xp.float64)
+        m01f = m01.astype(xp.float64)
+        m11f = m11.astype(xp.float64)
+
+        if img.ndim == 2:
+            Ia = img[y0, x0].astype(xp.float64)
+            Ib = img[y0, x1].astype(xp.float64)
+            Ic = img[y1, x0].astype(xp.float64)
+            Id = img[y1, x1].astype(xp.float64)
+            out = w00 * m00f * Ia + w10 * m10f * Ib + w01 * m01f * Ic + w11 * m11f * Id
+        else:
+            Ia = img[y0, x0].astype(xp.float64)
+            Ib = img[y0, x1].astype(xp.float64)
+            Ic = img[y1, x0].astype(xp.float64)
+            Id = img[y1, x1].astype(xp.float64)
+            # Expand weights and masks for channel broadcasting
+            w00e = (w00 * m00f)[..., None]
+            w10e = (w10 * m10f)[..., None]
+            w01e = (w01 * m01f)[..., None]
+            w11e = (w11 * m11f)[..., None]
+            out = w00e * Ia + w10e * Ib + w01e * Ic + w11e * Id
+
+        # Cast back to original dtype with clipping for integers
+        if img.dtype == xp.uint8:
+            out = xp.clip(xp.rint(out), 0, 255).astype(xp.uint8)
+        elif img.dtype == xp.uint16:
+            out = xp.clip(xp.rint(out), 0, 65535).astype(xp.uint16)
+        elif img.dtype == xp.int16:
+            out = xp.clip(xp.rint(out), -32768, 32767).astype(xp.int16)
+        else:
+            out = out.astype(img.dtype, copy=False)
+        return out
+
+    rect = _bilinear_sample_cuda(src, us, vs)
+    return Image(data=rect, format=image.format, frame_id=image.frame_id, ts=image.ts)
+
+
 def rectify_image(image: Image, camera_matrix: np.ndarray, dist_coeffs: np.ndarray) -> Image:
     """
     Rectify (undistort) an image using camera calibration parameters.
@@ -127,16 +282,52 @@ def rectify_image(image: Image, camera_matrix: np.ndarray, dist_coeffs: np.ndarr
     Returns:
         Image: Rectified Image object with same format and metadata
     """
-    # Apply undistortion using OpenCV
-    rectified_data = cv2.undistort(image.data, camera_matrix, dist_coeffs)
-
-    # Create new Image object with rectified data, preserving all other properties
-    return Image(data=rectified_data, format=image.format, frame_id=image.frame_id, ts=image.ts)
+    if image.is_cuda and _HAS_CUDA:
+        return rectify_image_cuda(image, camera_matrix, dist_coeffs)
+    return rectify_image_cpu(image, camera_matrix, dist_coeffs)
+
+
+def project_3d_points_to_2d_cuda(
+    points_3d: "cp.ndarray", camera_intrinsics: Union[List[float], "cp.ndarray"]
+) -> "cp.ndarray":
+    xp = cp  # type: ignore
+    pts = points_3d.astype(xp.float64, copy=False)
+    mask = pts[:, 2] > 0
+    if not bool(xp.any(mask)):
+        return xp.zeros((0, 2), dtype=xp.int32)
+    valid = pts[mask]
+    if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
+        fx, fy, cx, cy = [xp.asarray(v, dtype=xp.float64) for v in camera_intrinsics]
+    else:
+        K = camera_intrinsics.astype(xp.float64, copy=False)
+        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
+    u = (valid[:, 0] * fx / valid[:, 2]) + cx
+    v = (valid[:, 1] * fy / valid[:, 2]) + cy
+    return xp.stack([u, v], axis=1).astype(xp.int32)
 
 
-def project_3d_points_to_2d(
+def project_3d_points_to_2d_cpu(
     points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray]
 ) -> np.ndarray:
+    pts = np.asarray(points_3d, dtype=np.float64)
+    valid_mask = pts[:, 2] > 0
+    if not np.any(valid_mask):
+        return np.zeros((0, 2), dtype=np.int32)
+    valid_points = pts[valid_mask]
+    if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
+        fx, fy, cx, cy = [float(v) for v in camera_intrinsics]
+    else:
+        K = np.array(camera_intrinsics, dtype=np.float64)
+        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
+    u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx
+    v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy
+    return np.column_stack([u, v]).astype(np.int32)
+
+
+def project_3d_points_to_2d(
+    points_3d: Union[np.ndarray, "cp.ndarray"],
+    camera_intrinsics: Union[List[float], np.ndarray, "cp.ndarray"],
+) -> Union[np.ndarray, "cp.ndarray"]:
     """
     Project 3D points to 2D image coordinates using camera intrinsics.
 
@@ -148,40 +339,75 @@ def project_3d_points_to_2d(
         Nx2 array of 2D image coordinates (u, v)
     """
     if len(points_3d) == 0:
-        return np.zeros((0, 2), dtype=np.int32)
+        return (
+            cp.zeros((0, 2), dtype=cp.int32)
+            if _is_cu_array(points_3d)
+            else np.zeros((0, 2), dtype=np.int32)
+        )
 
     # Filter out points with zero or negative depth
-    valid_mask = points_3d[:, 2] > 0
-    if not np.any(valid_mask):
-        return np.zeros((0, 2), dtype=np.int32)
+    if _is_cu_array(points_3d) or _is_cu_array(camera_intrinsics):
+        xp = cp  # type: ignore
+        pts = points_3d if _is_cu_array(points_3d) else xp.asarray(points_3d)
+        K = camera_intrinsics if _is_cu_array(camera_intrinsics) else camera_intrinsics
+        return project_3d_points_to_2d_cuda(pts, K)  # type: ignore[arg-type]
+    return project_3d_points_to_2d_cpu(np.asarray(points_3d), np.asarray(camera_intrinsics))
+
+
+def project_2d_points_to_3d_cuda(
+    points_2d: "cp.ndarray",
+    depth_values: "cp.ndarray",
+    camera_intrinsics: Union[List[float], "cp.ndarray"],
+) -> "cp.ndarray":
+    xp = cp  # type: ignore
+    pts = points_2d.astype(xp.float64, copy=False)
+    depths = depth_values.astype(xp.float64, copy=False)
+    valid = depths > 0
+    if not bool(xp.any(valid)):
+        return xp.zeros((0, 3), dtype=xp.float32)
+    uv = pts[valid]
+    Z = depths[valid]
+    if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
+        fx, fy, cx, cy = [xp.asarray(v, dtype=xp.float64) for v in camera_intrinsics]
+    else:
+        K = camera_intrinsics.astype(xp.float64, copy=False)
+        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
+    X = (uv[:, 0] - cx) * Z / fx
+    Y = (uv[:, 1] - cy) * Z / fy
+    return xp.stack([X, Y, Z], axis=1).astype(xp.float32)
 
-    valid_points = points_3d[valid_mask]
 
-    # Extract camera parameters
+def project_2d_points_to_3d_cpu(
+    points_2d: np.ndarray,
+    depth_values: np.ndarray,
+    camera_intrinsics: Union[List[float], np.ndarray],
+) -> np.ndarray:
+    pts = np.asarray(points_2d, dtype=np.float64)
+    depths = np.asarray(depth_values, dtype=np.float64)
+    valid_mask = depths > 0
+    if not np.any(valid_mask):
+        return np.zeros((0, 3), dtype=np.float32)
+    valid_points_2d = pts[valid_mask]
+    valid_depths = depths[valid_mask]
     if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
-        fx, fy, cx, cy = camera_intrinsics
+        fx, fy, cx, cy = [float(v) for v in camera_intrinsics]
     else:
-        camera_matrix = np.array(camera_intrinsics)
+        camera_matrix = np.array(camera_intrinsics, dtype=np.float64)
         fx = camera_matrix[0, 0]
         fy = camera_matrix[1, 1]
         cx = camera_matrix[0, 2]
         cy = camera_matrix[1, 2]
-
-    # Project to image coordinates
-    u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx
-    v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy
-
-    # Round to integer pixel coordinates
-    points_2d = np.column_stack([u, v]).astype(np.int32)
-
-    return points_2d
+    X = (valid_points_2d[:, 0] - cx) * valid_depths / fx
+    Y = (valid_points_2d[:, 1] - cy) * valid_depths / fy
+    Z = valid_depths
+    return np.column_stack([X, Y, Z]).astype(np.float32)
 
 
 def project_2d_points_to_3d(
-    points_2d: np.ndarray,
-    depth_values: np.ndarray,
-    camera_intrinsics: Union[List[float], np.ndarray],
-) -> np.ndarray:
+    points_2d: Union[np.ndarray, "cp.ndarray"],
+    depth_values: Union[np.ndarray, "cp.ndarray"],
+    camera_intrinsics: Union[List[float], np.ndarray, "cp.ndarray"],
+) -> Union[np.ndarray, "cp.ndarray"]:
     """
     Project 2D image points to 3D coordinates using depth values and camera intrinsics.
 
@@ -194,46 +420,27 @@ def project_2d_points_to_3d(
         Nx3 array of 3D points (X, Y, Z)
     """
     if len(points_2d) == 0:
-        return np.zeros((0, 3), dtype=np.float32)
+        return (
+            cp.zeros((0, 3), dtype=cp.float32)
+            if _is_cu_array(points_2d)
+            else np.zeros((0, 3), dtype=np.float32)
+        )
 
     # Ensure depth_values is a numpy array
-    depth_values = np.asarray(depth_values)
-
-    # Filter out points with zero or negative depth
-    valid_mask = depth_values > 0
-    if not np.any(valid_mask):
-        return np.zeros((0, 3), dtype=np.float32)
-
-    valid_points_2d = points_2d[valid_mask]
-    valid_depths = depth_values[valid_mask]
-
-    # Extract camera parameters
-    if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
-        fx, fy, cx, cy = camera_intrinsics
-    else:
-        camera_matrix = np.array(camera_intrinsics)
-        fx = camera_matrix[0, 0]
-        fy = camera_matrix[1, 1]
-        cx = camera_matrix[0, 2]
-        cy = camera_matrix[1, 2]
-
-    # Back-project to 3D coordinates
-    # X = (u - cx) * Z / fx
-    # Y = (v - cy) * Z / fy
-    # Z = depth
-    X = (valid_points_2d[:, 0] - cx) * valid_depths / fx
-    Y = (valid_points_2d[:, 1] - cy) * valid_depths / fy
-    Z = valid_depths
-
-    # Stack into 3D points
-    points_3d = np.column_stack([X, Y, Z]).astype(np.float32)
-
-    return points_3d
+    if _is_cu_array(points_2d) or _is_cu_array(depth_values) or _is_cu_array(camera_intrinsics):
+        xp = cp  # type: ignore
+        pts = points_2d if _is_cu_array(points_2d) else xp.asarray(points_2d)
+        depths = depth_values if _is_cu_array(depth_values) else xp.asarray(depth_values)
+        K = camera_intrinsics if _is_cu_array(camera_intrinsics) else camera_intrinsics
+        return project_2d_points_to_3d_cuda(pts, depths, K)  # type: ignore[arg-type]
+    return project_2d_points_to_3d_cpu(
+        np.asarray(points_2d), np.asarray(depth_values), np.asarray(camera_intrinsics)
+    )
 
 
 def colorize_depth(
-    depth_img: np.ndarray, max_depth: float = 5.0, overlay_stats: bool = True
-) -> Optional[np.ndarray]:
+    depth_img: Union[np.ndarray, "cp.ndarray"], max_depth: float = 5.0, overlay_stats: bool = True
+) -> Optional[Union[np.ndarray, "cp.ndarray"]]:
     """
     Normalize and colorize depth image using COLORMAP_JET with optional statistics overlay.
 
@@ -248,57 +455,57 @@ def colorize_depth(
     if depth_img is None:
         return None
 
-    valid_mask = np.isfinite(depth_img) & (depth_img > 0)
-    depth_norm = np.zeros_like(depth_img)
-    depth_norm[valid_mask] = np.clip(depth_img[valid_mask] / max_depth, 0, 1)
-    depth_colored = cv2.applyColorMap((depth_norm * 255).astype(np.uint8), cv2.COLORMAP_JET)
-    depth_rgb = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
-
-    # Make the depth image less bright by scaling down the values
-    depth_rgb = (depth_rgb * 0.6).astype(np.uint8)
-
-    if overlay_stats and valid_mask.any():
-        # Calculate statistics
-        valid_depths = depth_img[valid_mask]
-        min_depth = np.min(valid_depths)
-        max_depth_actual = np.max(valid_depths)
-
-        # Get center depth
-        h, w = depth_img.shape
+    was_cu = _is_cu_array(depth_img)
+    xp = cp if was_cu else np  # type: ignore
+    depth = depth_img if was_cu else np.asarray(depth_img)
+
+    valid_mask = xp.isfinite(depth) & (depth > 0)
+    depth_norm = xp.zeros_like(depth, dtype=xp.float32)
+    if bool(valid_mask.any() if not was_cu else xp.any(valid_mask)):
+        depth_norm = xp.where(valid_mask, xp.clip(depth / max_depth, 0, 1), depth_norm)
+
+    # Use CPU for colormap/text; convert back to GPU if needed
+    depth_norm_np = _to_numpy(depth_norm)
+    depth_colored = cv2.applyColorMap((depth_norm_np * 255).astype(np.uint8), cv2.COLORMAP_JET)
+    depth_rgb_np = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
+    depth_rgb_np = (depth_rgb_np * 0.6).astype(np.uint8)
+
+    if overlay_stats and (np.any(_to_numpy(valid_mask))):
+        valid_depths = _to_numpy(depth)[_to_numpy(valid_mask)]
+        min_depth = float(np.min(valid_depths))
+        max_depth_actual = float(np.max(valid_depths))
+        h, w = depth_rgb_np.shape[:2]
         center_y, center_x = h // 2, w // 2
-        # Sample a small region around center for robustness
-        center_region = depth_img[
+        center_region = _to_numpy(depth)[
             max(0, center_y - 2) : min(h, center_y + 3), max(0, center_x - 2) : min(w, center_x + 3)
         ]
         center_mask = np.isfinite(center_region) & (center_region > 0)
         if center_mask.any():
-            center_depth = np.median(center_region[center_mask])
+            center_depth = float(np.median(center_region[center_mask]))
         else:
-            center_depth = depth_img[center_y, center_x] if valid_mask[center_y, center_x] else 0.0
+            depth_np = _to_numpy(depth)
+            vm_np = _to_numpy(valid_mask)
+            center_depth = float(depth_np[center_y, center_x]) if vm_np[center_y, center_x] else 0.0
 
-        # Prepare text overlays
         font = cv2.FONT_HERSHEY_SIMPLEX
         font_scale = 0.6
         thickness = 1
         line_type = cv2.LINE_AA
-
-        # Text properties
-        text_color = (255, 255, 255)  # White
-        bg_color = (0, 0, 0)  # Black background
+        text_color = (255, 255, 255)
+        bg_color = (0, 0, 0)
         padding = 5
 
-        # Min depth (top-left)
         min_text = f"Min: {min_depth:.2f}m"
         (text_w, text_h), _ = cv2.getTextSize(min_text, font, font_scale, thickness)
         cv2.rectangle(
-            depth_rgb,
+            depth_rgb_np,
             (padding, padding),
             (padding + text_w + 4, padding + text_h + 6),
             bg_color,
             -1,
         )
         cv2.putText(
-            depth_rgb,
+            depth_rgb_np,
             min_text,
             (padding + 2, padding + text_h + 2),
             font,
@@ -308,18 +515,17 @@ def colorize_depth(
             line_type,
         )
 
-        # Max depth (top-right)
         max_text = f"Max: {max_depth_actual:.2f}m"
         (text_w, text_h), _ = cv2.getTextSize(max_text, font, font_scale, thickness)
         cv2.rectangle(
-            depth_rgb,
+            depth_rgb_np,
             (w - padding - text_w - 4, padding),
             (w - padding, padding + text_h + 6),
             bg_color,
             -1,
         )
         cv2.putText(
-            depth_rgb,
+            depth_rgb_np,
             max_text,
             (w - padding - text_w - 2, padding + text_h + 2),
             font,
@@ -329,41 +535,36 @@ def colorize_depth(
             line_type,
         )
 
-        # Center depth (center)
         if center_depth > 0:
             center_text = f"{center_depth:.2f}m"
             (text_w, text_h), _ = cv2.getTextSize(center_text, font, font_scale, thickness)
             center_text_x = center_x - text_w // 2
             center_text_y = center_y + text_h // 2
-
-            # Draw crosshair
             cross_size = 10
             cross_color = (255, 255, 255)
             cv2.line(
-                depth_rgb,
+                depth_rgb_np,
                 (center_x - cross_size, center_y),
                 (center_x + cross_size, center_y),
                 cross_color,
                 1,
             )
             cv2.line(
-                depth_rgb,
+                depth_rgb_np,
                 (center_x, center_y - cross_size),
                 (center_x, center_y + cross_size),
                 cross_color,
                 1,
             )
-
-            # Draw center depth text with background
             cv2.rectangle(
-                depth_rgb,
+                depth_rgb_np,
                 (center_text_x - 2, center_text_y - text_h - 2),
                 (center_text_x + text_w + 2, center_text_y + 2),
                 bg_color,
                 -1,
             )
             cv2.putText(
-                depth_rgb,
+                depth_rgb_np,
                 center_text,
                 (center_text_x, center_text_y),
                 font,
@@ -373,11 +574,11 @@ def colorize_depth(
                 line_type,
             )
 
-    return depth_rgb
+    return _to_cupy(depth_rgb_np) if was_cu else depth_rgb_np
 
 
 def draw_bounding_box(
-    image: np.ndarray,
+    image: Union[np.ndarray, "cp.ndarray"],
     bbox: List[float],
     color: Tuple[int, int, int] = (0, 255, 0),
     thickness: int = 2,
@@ -385,7 +586,7 @@ def draw_bounding_box(
     confidence: Optional[float] = None,
     object_id: Optional[int] = None,
     font_scale: float = 0.6,
-) -> np.ndarray:
+) -> Union[np.ndarray, "cp.ndarray"]:
     """
     Draw a bounding box with optional label on an image.
 
@@ -402,10 +603,10 @@ def draw_bounding_box(
     Returns:
         Image with bounding box drawn
     """
+    was_cu = _is_cu_array(image)
+    img_np = _to_numpy(image)
     x1, y1, x2, y2 = map(int, bbox)
-
-    # Draw bounding box
-    cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)
+    cv2.rectangle(img_np, (x1, y1), (x2, y2), color, thickness)
 
     # Create label text
     text_parts = []
@@ -422,7 +623,7 @@ def draw_bounding_box(
         # Draw text background
         text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1)[0]
         cv2.rectangle(
-            image,
+            img_np,
             (x1, y1 - text_size[1] - 5),
             (x1 + text_size[0], y1),
             (0, 0, 0),
@@ -431,7 +632,7 @@ def draw_bounding_box(
 
         # Draw text
         cv2.putText(
-            image,
+            img_np,
             text,
             (x1, y1 - 5),
             cv2.FONT_HERSHEY_SIMPLEX,
@@ -440,17 +641,17 @@ def draw_bounding_box(
             1,
         )
 
-    return image
+    return _to_cupy(img_np) if was_cu else img_np
 
 
 def draw_segmentation_mask(
-    image: np.ndarray,
-    mask: np.ndarray,
+    image: Union[np.ndarray, "cp.ndarray"],
+    mask: Union[np.ndarray, "cp.ndarray"],
     color: Tuple[int, int, int] = (0, 200, 200),
     alpha: float = 0.5,
     draw_contours: bool = True,
     contour_thickness: int = 2,
-) -> np.ndarray:
+) -> Union[np.ndarray, "cp.ndarray"]:
     """
     Draw segmentation mask overlay on an image.
 
@@ -468,39 +669,35 @@ def draw_segmentation_mask(
     if mask is None:
         return image
 
-    try:
-        # Ensure mask is uint8
-        mask = mask.astype(np.uint8)
+    was_cu = _is_cu_array(image)
+    img_np = _to_numpy(image)
+    mask_np = _to_numpy(mask)
 
-        # Create colored mask overlay
-        colored_mask = np.zeros_like(image)
-        colored_mask[mask > 0] = color
-
-        # Apply the mask with transparency
-        mask_area = mask > 0
-        image[mask_area] = cv2.addWeighted(
-            image[mask_area], 1 - alpha, colored_mask[mask_area], alpha, 0
+    try:
+        mask_np = mask_np.astype(np.uint8)
+        colored_mask = np.zeros_like(img_np)
+        colored_mask[mask_np > 0] = color
+        mask_area = mask_np > 0
+        img_np[mask_area] = cv2.addWeighted(
+            img_np[mask_area], 1 - alpha, colored_mask[mask_area], alpha, 0
         )
-
-        # Draw mask contours if requested
         if draw_contours:
-            contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-            cv2.drawContours(image, contours, -1, color, contour_thickness)
-
+            contours, _ = cv2.findContours(mask_np, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            cv2.drawContours(img_np, contours, -1, color, contour_thickness)
     except Exception as e:
         logger.warning(f"Error drawing segmentation mask: {e}")
 
-    return image
+    return _to_cupy(img_np) if was_cu else img_np
 
 
 def draw_object_detection_visualization(
-    image: np.ndarray,
+    image: Union[np.ndarray, "cp.ndarray"],
     objects: List[ObjectData],
     draw_masks: bool = False,
     bbox_color: Tuple[int, int, int] = (0, 255, 0),
     mask_color: Tuple[int, int, int] = (0, 200, 200),
     font_scale: float = 0.6,
-) -> np.ndarray:
+) -> Union[np.ndarray, "cp.ndarray"]:
     """
     Create object detection visualization with bounding boxes and optional masks.
 
@@ -515,7 +712,8 @@ def draw_object_detection_visualization(
     Returns:
         Image with detection visualization
     """
-    viz_image = image.copy()
+    was_cu = _is_cu_array(image)
+    viz_image = _to_numpy(image).copy()
 
     for obj in objects:
         try:
@@ -549,7 +747,7 @@ def draw_object_detection_visualization(
         except Exception as e:
             logger.warning(f"Error drawing object visualization: {e}")
 
-    return viz_image
+    return _to_cupy(viz_image) if was_cu else viz_image
 
 
 def detection_results_to_object_data(
@@ -635,11 +833,12 @@ def combine_object_data(
 
         # Check mask overlap
         mask2 = obj2.get("segmentation_mask")
-        if mask2 is None or np.sum(mask2 > 0) == 0:
+        m2 = _to_numpy(mask2) if mask2 is not None else None
+        if m2 is None or np.sum(m2 > 0) == 0:
             combined.append(obj_copy)
             continue
 
-        mask2_area = np.sum(mask2 > 0)
+        mask2_area = np.sum(m2 > 0)
         is_duplicate = False
 
         for obj1 in list1:
@@ -647,7 +846,8 @@ def combine_object_data(
             if mask1 is None:
                 continue
 
-            intersection = np.sum((mask1 > 0) & (mask2 > 0))
+            m1 = _to_numpy(mask1)
+            intersection = np.sum((m1 > 0) & (m2 > 0))
             if intersection / mask2_area >= overlap_threshold:
                 is_duplicate = True
                 break
diff --git a/pyproject.toml b/pyproject.toml
index 670f98a0b7..f6dc613fd2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -148,7 +148,8 @@ cpu = [
 ]
 
 cuda = [
-    "pycuda",
+    "cupy-cuda12x==13.6.0",
+    "nvidia-nvimgcodec-cu12[all]",
     "onnxruntime-gpu>=1.17.1", # Only versions supporting both cuda11 and cuda12
     "ctransformers[cuda]==0.2.27",
     "mmengine>=0.10.3",
diff --git a/tests/test_image_backend_utils.py b/tests/test_image_backend_utils.py
new file mode 100644
index 0000000000..3adde5b631
--- /dev/null
+++ b/tests/test_image_backend_utils.py
@@ -0,0 +1,279 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pytest
+
+from dimos.msgs.sensor_msgs import Image, ImageFormat
+from dimos.perception.common.utils import (
+    rectify_image,
+    project_3d_points_to_2d,
+    project_2d_points_to_3d,
+    colorize_depth,
+    draw_bounding_box,
+    draw_segmentation_mask,
+    draw_object_detection_visualization,
+)
+
+
+def _has_cupy() -> bool:
+    try:
+        import cupy as cp  # type: ignore
+
+        try:
+            ndev = cp.cuda.runtime.getDeviceCount()  # type: ignore[attr-defined]
+            if ndev <= 0:
+                return False
+            x = cp.array([1, 2, 3])
+            _ = int(x.sum().get())
+            return True
+        except Exception:
+            return False
+    except Exception:
+        return False
+
+
+@pytest.mark.parametrize(
+    "shape,fmt", [((64, 64, 3), ImageFormat.BGR), ((64, 64), ImageFormat.GRAY)]
+)
+def test_rectify_image_cpu(shape, fmt):
+    arr = (np.random.rand(*shape) * (255 if fmt != ImageFormat.GRAY else 65535)).astype(
+        np.uint8 if fmt != ImageFormat.GRAY else np.uint16
+    )
+    img = Image(data=arr, format=fmt, frame_id="cam", ts=123.456)
+    K = np.array(
+        [[100.0, 0, arr.shape[1] / 2], [0, 100.0, arr.shape[0] / 2], [0, 0, 1]], dtype=np.float64
+    )
+    D = np.zeros(5, dtype=np.float64)
+    out = rectify_image(img, K, D)
+    assert out.shape[:2] == arr.shape[:2]
+    assert out.format == fmt
+    assert out.frame_id == "cam"
+    assert abs(out.ts - 123.456) < 1e-9
+    # With zero distortion, pixels should match
+    np.testing.assert_array_equal(out.data, arr)
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+@pytest.mark.parametrize(
+    "shape,fmt", [((32, 32, 3), ImageFormat.BGR), ((32, 32), ImageFormat.GRAY)]
+)
+def test_rectify_image_gpu_parity(shape, fmt):
+    import cupy as cp  # type: ignore
+
+    arr_np = (np.random.rand(*shape) * (255 if fmt != ImageFormat.GRAY else 65535)).astype(
+        np.uint8 if fmt != ImageFormat.GRAY else np.uint16
+    )
+    arr_cu = cp.asarray(arr_np)
+    img = Image(data=arr_cu, format=fmt, frame_id="cam", ts=1.23)
+    K = np.array(
+        [[80.0, 0, arr_np.shape[1] / 2], [0, 80.0, arr_np.shape[0] / 2], [0, 0, 1.0]],
+        dtype=np.float64,
+    )
+    D = np.zeros(5, dtype=np.float64)
+    out = rectify_image(img, K, D)
+    # Zero distortion parity and backend preservation
+    assert out.format == fmt
+    assert out.frame_id == "cam"
+    assert abs(out.ts - 1.23) < 1e-9
+    assert out.data.__class__.__module__.startswith("cupy")
+    np.testing.assert_array_equal(cp.asnumpy(out.data), arr_np)
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+def test_rectify_image_gpu_nonzero_dist_close():
+    import cupy as cp  # type: ignore
+
+    H, W = 64, 96
+    # Structured pattern to make interpolation deterministic enough
+    x = np.linspace(0, 255, W, dtype=np.float32)
+    y = np.linspace(0, 255, H, dtype=np.float32)
+    xv, yv = np.meshgrid(x, y)
+    arr_np = np.stack(
+        [
+            xv.astype(np.uint8),
+            yv.astype(np.uint8),
+            ((xv + yv) / 2).astype(np.uint8),
+        ],
+        axis=2,
+    )
+    img_cpu = Image(data=arr_np, format=ImageFormat.BGR, frame_id="cam", ts=0.5)
+    img_gpu = Image(data=cp.asarray(arr_np), format=ImageFormat.BGR, frame_id="cam", ts=0.5)
+
+    fx, fy = 120.0, 125.0
+    cx, cy = W / 2.0, H / 2.0
+    K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1.0]], dtype=np.float64)
+    D = np.array([0.05, -0.02, 0.001, -0.001, 0.0], dtype=np.float64)
+
+    out_cpu = rectify_image(img_cpu, K, D)
+    out_gpu = rectify_image(img_gpu, K, D)
+    # Compare within a small tolerance
+    # Small numeric differences may remain due to model and casting; keep tight tolerance
+    np.testing.assert_allclose(
+        cp.asnumpy(out_gpu.data).astype(np.int16), out_cpu.data.astype(np.int16), atol=4
+    )
+
+
+def test_project_roundtrip_cpu():
+    pts3d = np.array([[0.1, 0.2, 1.0], [0.0, 0.0, 2.0], [0.5, -0.3, 3.0]], dtype=np.float32)
+    fx, fy, cx, cy = 200.0, 220.0, 64.0, 48.0
+    K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1.0]], dtype=np.float64)
+    uv = project_3d_points_to_2d(pts3d, K)
+    assert uv.shape == (3, 2)
+    Z = pts3d[:, 2]
+    pts3d_back = project_2d_points_to_3d(uv.astype(np.float32), Z.astype(np.float32), K)
+    # Allow small rounding differences due to int rounding in 2D
+    assert pts3d_back.shape == (3, 3)
+    assert np.all(pts3d_back[:, 2] > 0)
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+def test_project_parity_gpu_cpu():
+    import cupy as cp  # type: ignore
+
+    pts3d_np = np.array([[0.1, 0.2, 1.0], [0.0, 0.0, 2.0], [0.5, -0.3, 3.0]], dtype=np.float32)
+    fx, fy, cx, cy = 200.0, 220.0, 64.0, 48.0
+    K_np = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1.0]], dtype=np.float64)
+    uv_cpu = project_3d_points_to_2d(pts3d_np, K_np)
+    uv_gpu = project_3d_points_to_2d(cp.asarray(pts3d_np), cp.asarray(K_np))
+    np.testing.assert_array_equal(cp.asnumpy(uv_gpu), uv_cpu)
+
+    Z_np = pts3d_np[:, 2]
+    pts3d_cpu = project_2d_points_to_3d(uv_cpu.astype(np.float32), Z_np.astype(np.float32), K_np)
+    pts3d_gpu = project_2d_points_to_3d(
+        cp.asarray(uv_cpu.astype(np.float32)), cp.asarray(Z_np.astype(np.float32)), cp.asarray(K_np)
+    )
+    assert pts3d_cpu.shape == cp.asnumpy(pts3d_gpu).shape
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+def test_project_parity_gpu_cpu_random():
+    import cupy as cp  # type: ignore
+
+    rng = np.random.RandomState(0)
+    N = 1000
+    Z = rng.uniform(0.1, 5.0, size=(N, 1)).astype(np.float32)
+    XY = rng.uniform(-1.0, 1.0, size=(N, 2)).astype(np.float32)
+    pts3d_np = np.concatenate([XY, Z], axis=1)
+
+    fx, fy = 300.0, 320.0
+    cx, cy = 128.0, 96.0
+    K_np = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1.0]], dtype=np.float64)
+
+    uv_cpu = project_3d_points_to_2d(pts3d_np, K_np)
+    uv_gpu = project_3d_points_to_2d(cp.asarray(pts3d_np), cp.asarray(K_np))
+    np.testing.assert_array_equal(cp.asnumpy(uv_gpu), uv_cpu)
+
+    # Roundtrip
+    Z_flat = pts3d_np[:, 2]
+    pts3d_cpu = project_2d_points_to_3d(uv_cpu.astype(np.float32), Z_flat.astype(np.float32), K_np)
+    pts3d_gpu = project_2d_points_to_3d(
+        cp.asarray(uv_cpu.astype(np.float32)),
+        cp.asarray(Z_flat.astype(np.float32)),
+        cp.asarray(K_np),
+    )
+    assert pts3d_cpu.shape == cp.asnumpy(pts3d_gpu).shape
+
+
+def test_colorize_depth_cpu():
+    depth = np.zeros((32, 48), dtype=np.float32)
+    depth[8:16, 12:24] = 1.5
+    out = colorize_depth(depth, max_depth=3.0, overlay_stats=False)
+    assert isinstance(out, np.ndarray)
+    assert out.shape == (32, 48, 3)
+    assert out.dtype == np.uint8
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+def test_colorize_depth_gpu_parity():
+    import cupy as cp  # type: ignore
+
+    depth_np = np.zeros((16, 20), dtype=np.float32)
+    depth_np[4:8, 5:15] = 2.0
+    out_cpu = colorize_depth(depth_np, max_depth=4.0, overlay_stats=False)
+    out_gpu = colorize_depth(cp.asarray(depth_np), max_depth=4.0, overlay_stats=False)
+    np.testing.assert_array_equal(cp.asnumpy(out_gpu), out_cpu)
+
+
+def test_draw_bounding_box_cpu():
+    img = np.zeros((20, 30, 3), dtype=np.uint8)
+    out = draw_bounding_box(img, [2, 3, 10, 12], color=(255, 0, 0), thickness=1)
+    assert isinstance(out, np.ndarray)
+    assert out.shape == img.shape
+    assert out.dtype == img.dtype
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+def test_draw_bounding_box_gpu_parity():
+    import cupy as cp  # type: ignore
+
+    img_np = np.zeros((20, 30, 3), dtype=np.uint8)
+    out_cpu = draw_bounding_box(img_np.copy(), [2, 3, 10, 12], color=(0, 255, 0), thickness=2)
+    img_cu = cp.asarray(img_np)
+    out_gpu = draw_bounding_box(img_cu, [2, 3, 10, 12], color=(0, 255, 0), thickness=2)
+    np.testing.assert_array_equal(cp.asnumpy(out_gpu), out_cpu)
+
+
+def test_draw_segmentation_mask_cpu():
+    img = np.zeros((20, 30, 3), dtype=np.uint8)
+    mask = np.zeros((20, 30), dtype=np.uint8)
+    mask[5:10, 8:15] = 1
+    out = draw_segmentation_mask(img, mask, color=(0, 200, 200), alpha=0.5)
+    assert out.shape == img.shape
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+def test_draw_segmentation_mask_gpu_parity():
+    import cupy as cp  # type: ignore
+
+    img_np = np.zeros((20, 30, 3), dtype=np.uint8)
+    mask_np = np.zeros((20, 30), dtype=np.uint8)
+    mask_np[2:12, 3:20] = 1
+    out_cpu = draw_segmentation_mask(img_np.copy(), mask_np, color=(100, 50, 200), alpha=0.4)
+    out_gpu = draw_segmentation_mask(
+        cp.asarray(img_np), cp.asarray(mask_np), color=(100, 50, 200), alpha=0.4
+    )
+    np.testing.assert_array_equal(cp.asnumpy(out_gpu), out_cpu)
+
+
+def test_draw_object_detection_visualization_cpu():
+    img = np.zeros((30, 40, 3), dtype=np.uint8)
+    objects = [
+        {
+            "object_id": 1,
+            "bbox": [5, 6, 20, 25],
+            "label": "box",
+            "confidence": 0.9,
+        }
+    ]
+    out = draw_object_detection_visualization(img, objects)
+    assert out.shape == img.shape
+
+
+@pytest.mark.skipif(not _has_cupy(), reason="CuPy/CUDA not available")
+def test_draw_object_detection_visualization_gpu_parity():
+    import cupy as cp  # type: ignore
+
+    img_np = np.zeros((30, 40, 3), dtype=np.uint8)
+    objects = [
+        {
+            "object_id": 1,
+            "bbox": [5, 6, 20, 25],
+            "label": "box",
+            "confidence": 0.9,
+        }
+    ]
+    out_cpu = draw_object_detection_visualization(img_np.copy(), objects)
+    out_gpu = draw_object_detection_visualization(cp.asarray(img_np), objects)
+    np.testing.assert_array_equal(cp.asnumpy(out_gpu), out_cpu)
diff --git a/tests/test_image_backends.py b/tests/test_image_backends.py
new file mode 100644
index 0000000000..59217719dc
--- /dev/null
+++ b/tests/test_image_backends.py
@@ -0,0 +1,534 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import cv2
+import numpy as np
+import pytest
+
+from dimos.msgs.sensor_msgs.Image import Image, ImageFormat, HAS_CUDA
+from dimos.utils.data import get_data
+
+IMAGE_PATH = get_data("chair-image.png")
+
+
+def _load_chair_image() -> np.ndarray:
+    img = cv2.imread(IMAGE_PATH, cv2.IMREAD_UNCHANGED)
+    if img is None:
+        raise FileNotFoundError(f"unable to load test image at {IMAGE_PATH}")
+    return img
+
+
+_CHAIR_BGRA = _load_chair_image()
+
+
+def _prepare_image(fmt: ImageFormat, shape=None) -> np.ndarray:
+    base = _CHAIR_BGRA
+    if fmt == ImageFormat.BGR:
+        arr = cv2.cvtColor(base, cv2.COLOR_BGRA2BGR)
+    elif fmt == ImageFormat.RGB:
+        arr = cv2.cvtColor(base, cv2.COLOR_BGRA2RGB)
+    elif fmt == ImageFormat.BGRA:
+        arr = base.copy()
+    elif fmt == ImageFormat.GRAY:
+        arr = cv2.cvtColor(base, cv2.COLOR_BGRA2GRAY)
+    else:
+        raise ValueError(f"unsupported image format {fmt}")
+
+    if shape is None:
+        return arr.copy()
+
+    if len(shape) == 2:
+        height, width = shape
+        orig_h, orig_w = arr.shape[:2]
+        interp = cv2.INTER_AREA if height <= orig_h and width <= orig_w else cv2.INTER_LINEAR
+        resized = cv2.resize(arr, (width, height), interpolation=interp)
+        return resized.copy()
+
+    if len(shape) == 3:
+        height, width, channels = shape
+        orig_h, orig_w = arr.shape[:2]
+        interp = cv2.INTER_AREA if height <= orig_h and width <= orig_w else cv2.INTER_LINEAR
+        resized = cv2.resize(arr, (width, height), interpolation=interp)
+        if resized.ndim == 2:
+            resized = np.repeat(resized[:, :, None], channels, axis=2)
+        elif resized.shape[2] != channels:
+            if channels == 4 and resized.shape[2] == 3:
+                alpha = np.full((height, width, 1), 255, dtype=resized.dtype)
+                resized = np.concatenate([resized, alpha], axis=2)
+            elif channels == 3 and resized.shape[2] == 4:
+                resized = resized[:, :, :3]
+            else:
+                raise ValueError(f"cannot adjust image to {channels} channels")
+        return resized.copy()
+
+    raise ValueError("shape must be a tuple of length 2 or 3")
+
+
+@pytest.fixture
+def alloc_timer(request):
+    def _alloc(
+        arr: np.ndarray, fmt: ImageFormat, *, to_cuda: bool = True, label: str | None = None
+    ):
+        tag = label or request.node.name
+        start = time.perf_counter()
+        cpu = Image.from_numpy(arr, format=fmt)
+        cpu_time = time.perf_counter() - start
+
+        gpu = None
+        gpu_time = None
+        if to_cuda:
+            arr_gpu = np.array(arr, copy=True)
+            start = time.perf_counter()
+            gpu = Image.from_numpy(arr_gpu, format=fmt, to_cuda=True)
+            gpu_time = time.perf_counter() - start
+
+        if gpu_time is not None:
+            print(f"[alloc {tag}] cpu={cpu_time:.6f}s gpu={gpu_time:.6f}s")
+        else:
+            print(f"[alloc {tag}] cpu={cpu_time:.6f}s")
+        return cpu, gpu, cpu_time, gpu_time
+
+    return _alloc
+
+
+@pytest.mark.parametrize(
+    "shape,fmt",
+    [
+        ((64, 64, 3), ImageFormat.BGR),
+        ((64, 64, 4), ImageFormat.BGRA),
+        ((64, 64, 3), ImageFormat.RGB),
+        ((64, 64), ImageFormat.GRAY),
+    ],
+)
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_color_conversions_parity(shape, fmt, alloc_timer):
+    arr = _prepare_image(fmt, shape)
+    # Build CPU and CUDA images with same logical content (timed allocations)
+    cpu, gpu, _, _ = alloc_timer(arr, fmt)
+
+    # Test to_rgb -> to_bgr parity
+    cpu_round = cpu.to_rgb().to_bgr().to_opencv()
+    gpu_round = gpu.to_rgb().to_bgr().to_opencv()
+
+    assert cpu_round.shape == gpu_round.shape
+    assert cpu_round.dtype == gpu_round.dtype
+    # Exact match for uint8 color ops
+    assert np.array_equal(cpu_round, gpu_round)
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_grayscale_parity(alloc_timer):
+    arr = _prepare_image(ImageFormat.BGR, (48, 32, 3))
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR)
+
+    cpu_gray = cpu.to_grayscale().to_opencv()
+    gpu_gray = gpu.to_grayscale().to_opencv()
+
+    assert cpu_gray.shape == gpu_gray.shape
+    assert cpu_gray.dtype == gpu_gray.dtype
+    # Allow tiny rounding differences (<=1 LSB) — visually indistinguishable
+    diff = np.abs(cpu_gray.astype(np.int16) - gpu_gray.astype(np.int16))
+    assert diff.max() <= 1
+
+
+@pytest.mark.parametrize("fmt", [ImageFormat.BGR, ImageFormat.RGB, ImageFormat.BGRA])
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_resize_parity(fmt, alloc_timer):
+    shape = (60, 80, 3) if fmt in (ImageFormat.BGR, ImageFormat.RGB) else (60, 80, 4)
+    arr = _prepare_image(fmt, shape)
+    cpu, gpu, _, _ = alloc_timer(arr, fmt)
+
+    new_w, new_h = 37, 53
+    cpu_res = cpu.resize(new_w, new_h).to_opencv()
+    gpu_res = gpu.resize(new_w, new_h).to_opencv()
+
+    assert cpu_res.shape == gpu_res.shape
+    assert cpu_res.dtype == gpu_res.dtype
+    # Allow small tolerance due to float interpolation differences
+    assert np.max(np.abs(cpu_res.astype(np.int16) - gpu_res.astype(np.int16))) <= 1
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_perf_compare_alloc(alloc_timer):
+    arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
+    alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_alloc-setup")
+
+    runs = 5
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = Image.from_numpy(arr, format=ImageFormat.BGR)
+    cpu_t = (time.perf_counter() - t0) / runs
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = Image.from_numpy(arr, format=ImageFormat.BGR, to_cuda=True)
+    gpu_t = (time.perf_counter() - t0) / runs
+    print(f"alloc (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+    assert cpu_t > 0 and gpu_t > 0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_sharpness_parity(alloc_timer):
+    arr = _prepare_image(ImageFormat.BGR, (64, 64, 3))
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR)
+
+    s_cpu = cpu.sharpness()
+    s_gpu = gpu.sharpness()
+
+    # Values should be very close; minor border/rounding differences allowed
+    assert abs(s_cpu - s_gpu) < 5e-2
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_to_opencv_parity(alloc_timer):
+    # BGRA should drop alpha and produce BGR
+    arr = _prepare_image(ImageFormat.BGRA, (32, 32, 4))
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGRA)
+
+    cpu_bgr = cpu.to_opencv()
+    gpu_bgr = gpu.to_opencv()
+
+    assert cpu_bgr.shape == (32, 32, 3)
+    assert gpu_bgr.shape == (32, 32, 3)
+    assert np.array_equal(cpu_bgr, gpu_bgr)
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_solve_pnp_parity(alloc_timer):
+    # Synthetic camera and 3D points
+    K = np.array([[400.0, 0.0, 32.0], [0.0, 400.0, 24.0], [0.0, 0.0, 1.0]], dtype=np.float64)
+    dist = None
+    obj = np.array(
+        [
+            [-0.5, -0.5, 0.0],
+            [0.5, -0.5, 0.0],
+            [0.5, 0.5, 0.0],
+            [-0.5, 0.5, 0.0],
+            [0.0, 0.0, 0.5],
+            [0.0, 0.0, 1.0],
+        ],
+        dtype=np.float32,
+    )
+
+    rvec_true = np.zeros((3, 1), dtype=np.float64)
+    tvec_true = np.array([[0.0], [0.0], [2.0]], dtype=np.float64)
+    img_pts, _ = cv2.projectPoints(obj, rvec_true, tvec_true, K, dist)
+    img_pts = img_pts.reshape(-1, 2).astype(np.float32)
+
+    # Build images using deterministic fixture content
+    base_bgr = _prepare_image(ImageFormat.BGR, (48, 64, 3))
+    cpu, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR)
+
+    ok_cpu, r_cpu, t_cpu = cpu.solve_pnp(obj, img_pts, K, dist)
+    ok_gpu, r_gpu, t_gpu = gpu.solve_pnp(obj, img_pts, K, dist)
+
+    assert ok_cpu and ok_gpu
+    # Validate reprojection error for CUDA solver
+    proj_cpu, _ = cv2.projectPoints(obj, r_cpu, t_cpu, K, dist)
+    proj_gpu, _ = cv2.projectPoints(obj, r_gpu, t_gpu, K, dist)
+    proj_cpu = proj_cpu.reshape(-1, 2)
+    proj_gpu = proj_gpu.reshape(-1, 2)
+    err_gpu = np.linalg.norm(proj_gpu - img_pts, axis=1)
+    assert err_gpu.mean() < 1e-3
+    assert err_gpu.max() < 1e-2
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_perf_compare_grayscale(alloc_timer):
+    arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_grayscale-setup")
+
+    runs = 10
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = cpu.to_grayscale()
+    cpu_t = (time.perf_counter() - t0) / runs
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = gpu.to_grayscale()
+    gpu_t = (time.perf_counter() - t0) / runs
+    print(f"grayscale (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+    assert cpu_t > 0 and gpu_t > 0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_perf_compare_resize(alloc_timer):
+    arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_resize-setup")
+
+    runs = 5
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = cpu.resize(320, 240)
+    cpu_t = (time.perf_counter() - t0) / runs
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = gpu.resize(320, 240)
+    gpu_t = (time.perf_counter() - t0) / runs
+    print(f"resize (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+    assert cpu_t > 0 and gpu_t > 0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_perf_compare_sharpness(alloc_timer):
+    arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_sharpness-setup")
+
+    runs = 3
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = cpu.sharpness()
+    cpu_t = (time.perf_counter() - t0) / runs
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = gpu.sharpness()
+    gpu_t = (time.perf_counter() - t0) / runs
+    print(f"sharpness (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+    assert cpu_t > 0 and gpu_t > 0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_perf_compare_solvepnp(alloc_timer):
+    K = np.array([[600.0, 0.0, 320.0], [0.0, 600.0, 240.0], [0.0, 0.0, 1.0]], dtype=np.float64)
+    dist = None
+    rng = np.random.default_rng(123)
+    obj = rng.standard_normal((200, 3)).astype(np.float32)
+    rvec_true = np.array([[0.1], [-0.2], [0.05]])
+    tvec_true = np.array([[0.0], [0.0], [3.0]])
+    img_pts, _ = cv2.projectPoints(obj, rvec_true, tvec_true, K, dist)
+    img_pts = img_pts.reshape(-1, 2).astype(np.float32)
+    base_bgr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
+    cpu, gpu, _, _ = alloc_timer(
+        base_bgr, ImageFormat.BGR, label="test_perf_compare_solvepnp-setup"
+    )
+
+    runs = 5
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = cpu.solve_pnp(obj, img_pts, K, dist)
+    cpu_t = (time.perf_counter() - t0) / runs
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = gpu.solve_pnp(obj, img_pts, K, dist)
+    gpu_t = (time.perf_counter() - t0) / runs
+    print(f"solvePnP (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+    assert cpu_t > 0 and gpu_t > 0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_perf_compare_tracker(alloc_timer):
+    H, W = 240, 320
+    img_base = _prepare_image(ImageFormat.BGR, (H, W, 3))
+    img1 = img_base.copy()
+    img2 = img_base.copy()
+    bbox0 = (80, 60, 40, 30)
+    x0, y0, w0, h0 = bbox0
+    cv2.rectangle(img1, (x0, y0), (x0 + w0, y0 + h0), (255, 255, 255), thickness=-1)
+    dx, dy = 8, 5
+    cv2.rectangle(
+        img2,
+        (x0 + dx, y0 + dy),
+        (x0 + dx + w0, y0 + dy + h0),
+        (255, 255, 255),
+        thickness=-1,
+    )
+    cpu1, gpu1, _, _ = alloc_timer(img1, ImageFormat.BGR, label="test_perf_compare_tracker-frame1")
+    cpu2, gpu2, _, _ = alloc_timer(img2, ImageFormat.BGR, label="test_perf_compare_tracker-frame2")
+    trk_cpu = cpu1.create_csrt_tracker(bbox0)
+    trk_gpu = gpu1.create_csrt_tracker(bbox0)
+
+    runs = 10
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = cpu2.csrt_update(trk_cpu)
+    cpu_t = (time.perf_counter() - t0) / runs
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        _ = gpu2.csrt_update(trk_gpu)
+    gpu_t = (time.perf_counter() - t0) / runs
+    print(f"tracker (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+    assert cpu_t > 0 and gpu_t > 0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_csrt_tracker_parity(alloc_timer):
+    # Check tracker availability
+    has_csrt = False
+    if hasattr(cv2, "legacy") and hasattr(cv2.legacy, "TrackerCSRT_create"):
+        has_csrt = True
+    elif hasattr(cv2, "TrackerCSRT_create"):
+        has_csrt = True
+    if not has_csrt:
+        pytest.skip("OpenCV CSRT tracker not available")
+
+    H, W = 100, 100
+    # Create two frames with a moving rectangle
+    img_base = _prepare_image(ImageFormat.BGR, (H, W, 3))
+    img1 = img_base.copy()
+    img2 = img_base.copy()
+    bbox0 = (30, 30, 20, 15)
+    x0, y0, w0, h0 = bbox0
+    # draw rect in img1
+    cv2.rectangle(img1, (x0, y0), (x0 + w0, y0 + h0), (255, 255, 255), thickness=-1)
+    # shift by (dx,dy)
+    dx, dy = 5, 3
+    cv2.rectangle(
+        img2,
+        (x0 + dx, y0 + dy),
+        (x0 + dx + w0, y0 + dy + h0),
+        (255, 255, 255),
+        thickness=-1,
+    )
+
+    cpu1, gpu1, _, _ = alloc_timer(img1, ImageFormat.BGR, label="test_csrt_tracker_parity-frame1")
+    cpu2, gpu2, _, _ = alloc_timer(img2, ImageFormat.BGR, label="test_csrt_tracker_parity-frame2")
+
+    trk_cpu = cpu1.create_csrt_tracker(bbox0)
+    ok_cpu, bbox_cpu = cpu2.csrt_update(trk_cpu)
+    trk_gpu = gpu1.create_csrt_tracker(bbox0)
+    ok_gpu, bbox_gpu = gpu2.csrt_update(trk_gpu)
+
+    assert ok_cpu and ok_gpu
+    # Compare both to ground-truth expected bbox
+    expected = (x0 + dx, y0 + dy, w0, h0)
+    err_cpu = sum(abs(a - b) for a, b in zip(bbox_cpu, expected))
+    err_gpu = sum(abs(a - b) for a, b in zip(bbox_gpu, expected))
+    assert err_cpu <= 8
+    assert err_gpu <= 10  # allow some slack for scale/window effects
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_solve_pnp_ransac_with_outliers_and_distortion(alloc_timer):
+    # Camera with distortion
+    K = np.array([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]], dtype=np.float64)
+    dist = np.array([0.1, -0.05, 0.001, 0.001, 0.0], dtype=np.float64)
+    rng = np.random.default_rng(202)
+    obj = rng.uniform(-1.0, 1.0, size=(200, 3)).astype(np.float32)
+    obj[:, 2] = np.abs(obj[:, 2]) + 2.0  # keep in front of camera
+    rvec_true = np.array([[0.1], [-0.15], [0.05]], dtype=np.float64)
+    tvec_true = np.array([[0.2], [-0.1], [3.0]], dtype=np.float64)
+    img_pts, _ = cv2.projectPoints(obj, rvec_true, tvec_true, K, dist)
+    img_pts = img_pts.reshape(-1, 2)
+    # Add outliers
+    n_out = 20
+    idx = rng.choice(len(img_pts), size=n_out, replace=False)
+    img_pts[idx] += rng.uniform(-50, 50, size=(n_out, 2))
+    img_pts = img_pts.astype(np.float32)
+
+    base_bgr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
+    _, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR, label="test_solve_pnp_ransac-setup")
+
+    ok_gpu, r_gpu, t_gpu, mask_gpu = gpu.solve_pnp_ransac(
+        obj, img_pts, K, dist, iterations_count=150, reprojection_error=3.0
+    )
+    assert ok_gpu
+    inlier_ratio = mask_gpu.mean()
+    assert inlier_ratio > 0.7
+    # Reprojection error on inliers
+    in_idx = np.nonzero(mask_gpu)[0]
+    proj_gpu, _ = cv2.projectPoints(obj[in_idx], r_gpu, t_gpu, K, dist)
+    proj_gpu = proj_gpu.reshape(-1, 2)
+    err = np.linalg.norm(proj_gpu - img_pts[in_idx], axis=1)
+    assert err.mean() < 1.5
+    assert err.max() < 4.0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_solve_pnp_batch_correctness_and_perf(alloc_timer):
+    # Generate batched problems
+    B, N = 8, 50
+    rng = np.random.default_rng(99)
+    obj = rng.uniform(-1.0, 1.0, size=(B, N, 3)).astype(np.float32)
+    obj[:, :, 2] = np.abs(obj[:, :, 2]) + 2.0
+    K = np.array([[600.0, 0.0, 320.0], [0.0, 600.0, 240.0], [0.0, 0.0, 1.0]], dtype=np.float64)
+    r_true = np.zeros((B, 3, 1), dtype=np.float64)
+    t_true = np.tile(np.array([[0.0], [0.0], [3.0]], dtype=np.float64), (B, 1, 1))
+    img = []
+    for b in range(B):
+        ip, _ = cv2.projectPoints(obj[b], r_true[b], t_true[b], K, None)
+        img.append(ip.reshape(-1, 2))
+    img = np.stack(img, axis=0).astype(np.float32)
+
+    base_bgr = _prepare_image(ImageFormat.BGR, (10, 10, 3))
+    cpu, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR, label="test_solve_pnp_batch-setup")
+
+    # CPU loop
+
+    t0 = time.perf_counter()
+    r_list = []
+    t_list = []
+    for b in range(B):
+        ok, r, t = cpu.solve_pnp(obj[b], img[b], K, None)
+        assert ok
+        r_list.append(r)
+        t_list.append(t)
+    cpu_total = time.perf_counter() - t0
+    cpu_t = cpu_total / B
+
+    # CUDA batched
+    t0 = time.perf_counter()
+    r_b, t_b = gpu.solve_pnp_batch(obj, img, K)
+    gpu_total = time.perf_counter() - t0
+    gpu_t = gpu_total / B
+    print(f"solvePnP-batch (avg per pose) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s (B={B}, N={N})")
+
+    # Check reprojection for a couple of batches
+    for b in range(min(B, 4)):
+        proj, _ = cv2.projectPoints(obj[b], r_b[b], t_b[b], K, None)
+        err = np.linalg.norm(proj.reshape(-1, 2) - img[b], axis=1)
+        assert err.mean() < 1e-2
+        assert err.max() < 1e-1
+
+
+def test_nvimgcodec_flag_and_fallback(monkeypatch):
+    # Force nvimgcodec flag on, then reload Image and ensure fallback works
+    monkeypatch.setenv("USE_NVIMGCODEC", "1")
+    import importlib as _importlib
+
+    ImageMod = _importlib.import_module("dimos.msgs.sensor_msgs.Image")
+    _importlib.reload(ImageMod)
+    # Even if nvimgcodec missing, to_base64 should work (fallback)
+    arr = _prepare_image(ImageFormat.BGR, (32, 32, 3))
+    img = ImageMod.Image.from_numpy(
+        arr, format=ImageMod.ImageFormat.BGR, to_cuda=bool(ImageMod.HAS_CUDA)
+    )
+    b64 = img.to_base64()
+    assert isinstance(b64, str) and len(b64) > 0
+    # Turn flag off and reload
+    monkeypatch.setenv("USE_NVIMGCODEC", "0")
+    _importlib.reload(ImageMod)
+    img2 = ImageMod.Image.from_numpy(arr, format=ImageMod.ImageFormat.BGR)
+    b64_2 = img2.to_base64()
+    assert isinstance(b64_2, str) and len(b64_2) > 0
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_nvimgcodec_gpu_path(monkeypatch):
+    # Enable flag and reload; skip if nvimgcodec not present
+    monkeypatch.setenv("USE_NVIMGCODEC", "1")
+    import importlib as _importlib
+
+    ImageMod = _importlib.import_module("dimos.msgs.sensor_msgs.Image")
+    _importlib.reload(ImageMod)
+    if not ImageMod.HAS_NVIMGCODEC:
+        pytest.skip("nvimgcodec library not available")
+    # Create a CUDA image and encode
+    arr = _prepare_image(ImageFormat.BGR, (32, 32, 3))
+    img = ImageMod.Image.from_numpy(arr, format=ImageMod.ImageFormat.BGR, to_cuda=True)
+    b64 = img.to_base64()
+    assert isinstance(b64, str) and len(b64) > 0
+    # Some builds may import nvimgcodec but not support CuPy device buffers; allow skip
+    if not getattr(ImageMod, "NVIMGCODEC_LAST_USED", False):
+        pytest.skip("nvimgcodec present but encode fell back to CPU in this environment")

From 29248521c4dd8826ae1e975f75f8c0128c8d55e7 Mon Sep 17 00:00:00 2001
From: Paul Nechifor <paul@nechifor.net>
Date: Thu, 9 Oct 2025 02:21:11 +0300
Subject: [PATCH 02/16] fix

---
 dimos/msgs/sensor_msgs/Image.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index 7f77d71e70..a40ad5f75f 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -513,6 +513,22 @@ def __eq__(self, other) -> bool:
     def __len__(self) -> int:
         return int(self.height * self.width)
 
+    def __getstate__(self):
+        return {
+            'data': self.data,
+            'format': self.format,
+            'frame_id': self.frame_id,
+            'ts': self.ts
+        }
+
+    def __setstate__(self, state):
+        self.__init__(
+            data=state.get('data'),
+            format=state.get('format'),
+            frame_id=state.get('frame_id'),
+            ts=state.get('ts')
+        )
+
 
 # Re-exports for tests
 HAS_CUDA = HAS_CUDA

From 2a017dc00a2b5fd6ec067bbc45f126e8a38ef73f Mon Sep 17 00:00:00 2001
From: paul-nechifor <1262969+paul-nechifor@users.noreply.github.com>
Date: Wed, 8 Oct 2025 23:30:19 +0000
Subject: [PATCH 03/16] CI code cleanup

---
 dimos/msgs/sensor_msgs/Image.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index a40ad5f75f..38c30c2772 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -514,19 +514,14 @@ def __len__(self) -> int:
         return int(self.height * self.width)
 
     def __getstate__(self):
-        return {
-            'data': self.data,
-            'format': self.format,
-            'frame_id': self.frame_id,
-            'ts': self.ts
-        }
+        return {"data": self.data, "format": self.format, "frame_id": self.frame_id, "ts": self.ts}
 
     def __setstate__(self, state):
         self.__init__(
-            data=state.get('data'),
-            format=state.get('format'),
-            frame_id=state.get('frame_id'),
-            ts=state.get('ts')
+            data=state.get("data"),
+            format=state.get("format"),
+            frame_id=state.get("frame_id"),
+            ts=state.get("ts"),
         )
 
 

From 15d19915660d4db8d397d5f904345efe859d2e93 Mon Sep 17 00:00:00 2001
From: Paul Nechifor <paul@nechifor.net>
Date: Thu, 9 Oct 2025 02:48:53 +0300
Subject: [PATCH 04/16] add crop back

---
 dimos/msgs/sensor_msgs/Image.py               |  3 ++
 .../msgs/sensor_msgs/image_impls/CudaImage.py | 32 +++++++++++++++++++
 .../sensor_msgs/image_impls/NumpyImage.py     | 32 +++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index 38c30c2772..e0687954d7 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -296,6 +296,9 @@ def to_grayscale(self) -> "Image":
     def resize(self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR) -> "Image":
         return Image(self._impl.resize(width, height, interpolation))
 
+    def crop(self, x: int, y: int, width: int, height: int) -> "Image":
+        return Image(self._impl.crop(x, y, width, height))
+
     def sharpness(self):
         """Return sharpness score as a callable float for backward compatibility."""
         return self._impl.sharpness()
diff --git a/dimos/msgs/sensor_msgs/image_impls/CudaImage.py b/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
index fc8577f0ac..0d779cf70d 100644
--- a/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
+++ b/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
@@ -639,6 +639,38 @@ def resize(self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR)
             _resize_bilinear_hwc_cuda(self.data, height, width), self.format, self.frame_id, self.ts
         )
 
+    def crop(self, x: int, y: int, width: int, height: int) -> "CudaImage":
+        """Crop the image to the specified region.
+
+        Args:
+            x: Starting x coordinate (left edge)
+            y: Starting y coordinate (top edge)
+            width: Width of the cropped region
+            height: Height of the cropped region
+
+        Returns:
+            A new CudaImage containing the cropped region
+        """
+        # Get current image dimensions
+        img_height, img_width = self.data.shape[:2]
+
+        # Clamp the crop region to image bounds
+        x = max(0, min(x, img_width))
+        y = max(0, min(y, img_height))
+        x_end = min(x + width, img_width)
+        y_end = min(y + height, img_height)
+
+        # Perform the crop using array slicing
+        if self.data.ndim == 2:
+            # Grayscale image
+            cropped_data = self.data[y:y_end, x:x_end]
+        else:
+            # Color image (HxWxC)
+            cropped_data = self.data[y:y_end, x:x_end, :]
+
+        # Return a new CudaImage with the cropped data
+        return CudaImage(cropped_data, self.format, self.frame_id, self.ts)
+
     def sharpness(self) -> float:
         if cp is None:
             return 0.0
diff --git a/dimos/msgs/sensor_msgs/image_impls/NumpyImage.py b/dimos/msgs/sensor_msgs/image_impls/NumpyImage.py
index da04005d4f..3431b11295 100644
--- a/dimos/msgs/sensor_msgs/image_impls/NumpyImage.py
+++ b/dimos/msgs/sensor_msgs/image_impls/NumpyImage.py
@@ -137,6 +137,38 @@ def resize(
             self.ts,
         )
 
+    def crop(self, x: int, y: int, width: int, height: int) -> "NumpyImage":
+        """Crop the image to the specified region.
+
+        Args:
+            x: Starting x coordinate (left edge)
+            y: Starting y coordinate (top edge)
+            width: Width of the cropped region
+            height: Height of the cropped region
+
+        Returns:
+            A new NumpyImage containing the cropped region
+        """
+        # Get current image dimensions
+        img_height, img_width = self.data.shape[:2]
+
+        # Clamp the crop region to image bounds
+        x = max(0, min(x, img_width))
+        y = max(0, min(y, img_height))
+        x_end = min(x + width, img_width)
+        y_end = min(y + height, img_height)
+
+        # Perform the crop using array slicing
+        if self.data.ndim == 2:
+            # Grayscale image
+            cropped_data = self.data[y:y_end, x:x_end]
+        else:
+            # Color image (HxWxC)
+            cropped_data = self.data[y:y_end, x:x_end, :]
+
+        # Return a new NumpyImage with the cropped data
+        return NumpyImage(cropped_data, self.format, self.frame_id, self.ts)
+
     def sharpness(self) -> float:
         gray = self.to_grayscale()
         sx = cv2.Sobel(gray.data, cv2.CV_32F, 1, 0, ksize=5)

From e88621704d7fd30848b083674dd04668d5aac836 Mon Sep 17 00:00:00 2001
From: Paul Nechifor <paul@nechifor.net>
Date: Thu, 9 Oct 2025 05:05:22 +0300
Subject: [PATCH 05/16] fix encoding

---
 dimos/msgs/sensor_msgs/image_impls/CudaImage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dimos/msgs/sensor_msgs/image_impls/CudaImage.py b/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
index 0d779cf70d..58ebaf621d 100644
--- a/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
+++ b/dimos/msgs/sensor_msgs/image_impls/CudaImage.py
@@ -71,7 +71,7 @@
   S[6]=-v[1];   S[7]= v[0]; S[8]=0.f;
 }
 
-// Solve a 6x6 system (JTJ * x = JTr) with Gauss–Jordan; JTJ is SPD after damping.
+// Solve a 6x6 system (JTJ * x = JTr) with Gauss-Jordan; JTJ is SPD after damping.
 __device__ void solve6_gauss_jordan(float A[36], float b[6], float x[6]){
   float M[6][7];
   #pragma unroll

From 7bc6f9e2798c8c53ad98792044250010be770ad3 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 9 Oct 2025 11:16:43 -0700
Subject: [PATCH 06/16] Fix detection 3d test for floating point non
 determinism

---
 dimos/perception/detection2d/type/test_detection3d.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dimos/perception/detection2d/type/test_detection3d.py b/dimos/perception/detection2d/type/test_detection3d.py
index c8215b9601..5efa482793 100644
--- a/dimos/perception/detection2d/type/test_detection3d.py
+++ b/dimos/perception/detection2d/type/test_detection3d.py
@@ -124,7 +124,9 @@ def test_foxglove_text_label(detection3d):
     entity = detection3d.to_foxglove_scene_entity("test_entity_123")
     text = entity.texts[0]
 
-    assert text.text == "1/suitcase (81%)", f"Expected text '1/suitcase (81%)', got '{text.text}'"
+    assert text.text in ["1/suitcase (81%)", "1/suitcase (82%)"], (
+        f"Expected text '1/suitcase (81%)' or '1/suitcase (82%)', got '{text.text}'"
+    )
     assert text.pose.position.x == pytest.approx(-3.325, abs=0.1)
     assert text.pose.position.y == pytest.approx(-0.250, abs=0.1)
     assert text.pose.position.z == pytest.approx(0.575, abs=0.1)

From 3ba5695084128b0cabd1f7c0b58c7587f3383ec0 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 9 Oct 2025 11:48:37 -0700
Subject: [PATCH 07/16] Fix CI lack of timer precision for test reactive

---
 dimos/utils/test_reactive.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dimos/utils/test_reactive.py b/dimos/utils/test_reactive.py
index 21f2bd7894..98c8dfb910 100644
--- a/dimos/utils/test_reactive.py
+++ b/dimos/utils/test_reactive.py
@@ -45,7 +45,9 @@ def assert_time(
 
 
 def min_time(func: Callable[[], Any], min_t: int, assert_fail_msg="Function returned too fast"):
-    return assert_time(func, (lambda t: t > min_t), assert_fail_msg + f", min: {min_t} seconds")
+    return assert_time(
+        func, (lambda t: t >= min_t * 0.98), assert_fail_msg + f", min: {min_t} seconds"
+    )
 
 
 def max_time(func: Callable[[], Any], max_t: int, assert_fail_msg="Function took too long"):

From e73e82e982b7f285e4fc53883617dc39bf2ef41f Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 11:51:45 -0700
Subject: [PATCH 08/16] Added cleaner Dask exit handling for cuda and shared
 memory object cleanup

---
 dimos/core/__init__.py | 102 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 8 deletions(-)

diff --git a/dimos/core/__init__.py b/dimos/core/__init__.py
index 8b00b7da8f..63ba8c3c5e 100644
--- a/dimos/core/__init__.py
+++ b/dimos/core/__init__.py
@@ -25,6 +25,33 @@
 __all__ = ["TF", "LCMTF", "PubSubTF", "TFSpec", "TFConfig"]
 
 
+class CudaCleanupPlugin:
+    """Dask worker plugin to cleanup CUDA resources on shutdown."""
+
+    def setup(self, worker):
+        """Called when worker starts."""
+        pass
+
+    def teardown(self, worker):
+        """Clean up CUDA resources when worker shuts down."""
+        try:
+            import sys
+
+            if "cupy" in sys.modules:
+                import cupy as cp
+
+                # Clear memory pools
+                mempool = cp.get_default_memory_pool()
+                pinned_mempool = cp.get_default_pinned_memory_pool()
+                mempool.free_all_blocks()
+                pinned_mempool.free_all_blocks()
+                cp.cuda.Stream.null.synchronize()
+                mempool.free_all_blocks()
+                pinned_mempool.free_all_blocks()
+        except Exception:
+            pass
+
+
 def patch_actor(actor, cls): ...
 
 
@@ -183,17 +210,46 @@ def check_worker_memory():
             )
 
     def close_all():
+        # Prevents multiple calls to close_all
+        if hasattr(dask_client, "_closed") and dask_client._closed:
+            return
+        dask_client._closed = True
+
         import time
 
+        # Stop all SharedMemory transports before closing Dask
+        # This prevents the "leaked shared_memory objects" warning and hangs
+        try:
+            from dimos.protocol.pubsub import shmpubsub
+            import gc
+
+            for obj in gc.get_objects():
+                if isinstance(obj, (shmpubsub.SharedMemory, shmpubsub.PickleSharedMemory)):
+                    try:
+                        obj.stop()
+                    except Exception:
+                        pass
+        except Exception:
+            pass
+
         # Get the event loop before shutting down
         loop = dask_client.loop
 
-        # Close cluster and client
+        # Clear the actor registry
         ActorRegistry.clear()
-        local_cluster.close()
-        dask_client.close()
 
-        # Stop the Tornado IOLoop to clean up IO loop and Profile threads
+        # Close cluster and client with reasonable timeout
+        # The CudaCleanupPlugin will handle CUDA cleanup on each worker
+        try:
+            local_cluster.close(timeout=5)
+        except Exception:
+            pass
+
+        try:
+            dask_client.close(timeout=5)
+        except Exception:
+            pass
+
         if loop and hasattr(loop, "add_callback") and hasattr(loop, "stop"):
             try:
                 loop.add_callback(loop.stop)
@@ -209,9 +265,6 @@ def close_all():
         except Exception:
             pass
 
-        # Give threads a moment to clean up
-        time.sleep(0.1)
-
     dask_client.deploy = deploy
     dask_client.check_worker_memory = check_worker_memory
     dask_client.stop = lambda: dask_client.close()
@@ -226,6 +279,9 @@ def start(n: Optional[int] = None, memory_limit: str = "auto") -> Client:
         n: Number of workers (defaults to CPU count)
         memory_limit: Memory limit per worker (e.g., '4GB', '2GiB', or 'auto' for Dask's default)
     """
+    import signal
+    import atexit
+
     console = Console()
     if not n:
         n = mp.cpu_count()
@@ -236,10 +292,40 @@ def start(n: Optional[int] = None, memory_limit: str = "auto") -> Client:
             n_workers=n,
             threads_per_worker=4,
             memory_limit=memory_limit,
+            plugins=[CudaCleanupPlugin()],  # Register CUDA cleanup plugin
         )
         client = Client(cluster)
 
     console.print(
         f"[green]Initialized dimos local cluster with [bright_blue]{n} workers, memory limit: {memory_limit}"
     )
-    return patchdask(client, cluster)
+
+    patched_client = patchdask(client, cluster)
+    patched_client._closed = False  # Initialize the flag
+    patched_client._shutting_down = False  # Track if we're shutting down
+
+    # Signal handler with proper exit handling
+    def signal_handler(sig, frame):
+        # If already shutting down, force exit
+        if patched_client._shutting_down:
+            import os
+
+            console.print("[red]Force exit!")
+            os._exit(1)
+
+        patched_client._shutting_down = True
+        console.print(f"[yellow]Shutting down (signal {sig})...")
+
+        try:
+            patched_client.close_all()
+        except Exception:
+            pass
+
+        import sys
+
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    return patched_client

From f91d6e2f8c3e476ae6895325ce2ffb46d231e668 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 11:52:59 -0700
Subject: [PATCH 09/16] Fix lcm_encode in new Image and added BGR default

---
 dimos/msgs/sensor_msgs/Image.py | 36 +++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index e0687954d7..270f99d7d8 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -85,7 +85,7 @@ def __init__(
         # Raw constructor path
         if data is None:
             raise TypeError("'data' is required when constructing Image without 'impl'")
-        fmt = format if format is not None else ImageFormat.RGB
+        fmt = format if format is not None else ImageFormat.BGR
         fid = frame_id if frame_id is not None else ""
         tstamp = ts if ts is not None else time.time()
 
@@ -118,7 +118,7 @@ def from_impl(cls, impl: AbstractImage) -> "Image":
     def from_numpy(
         cls,
         np_image: np.ndarray,
-        format: ImageFormat = ImageFormat.RGB,
+        format: ImageFormat = ImageFormat.BGR,
         to_cuda: bool = False,
         **kwargs,
     ) -> "Image":
@@ -350,10 +350,15 @@ def agent_encode(self, quality: int = 80) -> str:
 
     # LCM encode/decode
     def lcm_encode(self, frame_id: Optional[str] = None) -> bytes:
+        """Convert to LCM Image message."""
         msg = LCMImage()
+
+        # Header
         msg.header = Header()
         msg.header.seq = 0
         msg.header.frame_id = frame_id or self.frame_id
+
+        # Set timestamp
         if self.ts is not None:
             msg.header.stamp.sec = int(self.ts)
             msg.header.stamp.nsec = int((self.ts - int(self.ts)) * 1e9)
@@ -362,20 +367,21 @@ def lcm_encode(self, frame_id: Optional[str] = None) -> bytes:
             msg.header.stamp.sec = int(now)
             msg.header.stamp.nsec = int((now - int(now)) * 1e9)
 
-        arr = (
-            self.to_opencv()
-            if self.format in (ImageFormat.BGR, ImageFormat.RGB, ImageFormat.RGBA, ImageFormat.BGRA)
-            else self.to_opencv()
-        )
-        msg.height = int(arr.shape[0])
-        msg.width = int(arr.shape[1])
-        msg.encoding = _get_lcm_encoding(self.format, arr.dtype)
+        # Image properties
+        msg.height = self.height
+        msg.width = self.width
+        msg.encoding = _get_lcm_encoding(self.format, self.dtype)
         msg.is_bigendian = False
-        channels = 1 if arr.ndim == 2 else int(arr.shape[2])
-        msg.step = int(arr.shape[1] * arr.dtype.itemsize * channels)
-        img_bytes = arr.tobytes()
-        msg.data_length = len(img_bytes)
-        msg.data = img_bytes
+
+        # Calculate step (bytes per row)
+        channels = 1 if self.data.ndim == 2 else self.data.shape[2]
+        msg.step = self.width * self.dtype.itemsize * channels
+
+        # Image data - use raw data to preserve format
+        image_bytes = self.data.tobytes()
+        msg.data_length = len(image_bytes)
+        msg.data = image_bytes
+
         return msg.lcm_encode()
 
     @classmethod

From 820fa5975a7f7da516a08722a16618dd14048cfb Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 12:11:06 -0700
Subject: [PATCH 10/16] Moved cudaimage unit tests to correct location

---
 .../msgs/sensor_msgs/image_impls}/test_image_backend_utils.py     | 0
 .../msgs/sensor_msgs/image_impls}/test_image_backends.py          | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename {tests => dimos/msgs/sensor_msgs/image_impls}/test_image_backend_utils.py (100%)
 rename {tests => dimos/msgs/sensor_msgs/image_impls}/test_image_backends.py (100%)

diff --git a/tests/test_image_backend_utils.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
similarity index 100%
rename from tests/test_image_backend_utils.py
rename to dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
diff --git a/tests/test_image_backends.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
similarity index 100%
rename from tests/test_image_backends.py
rename to dimos/msgs/sensor_msgs/image_impls/test_image_backends.py

From 348d49ce312d695b7f9e375265c3a41a8d619731 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 14:10:02 -0700
Subject: [PATCH 11/16] Fix very broken to_cpu method in Image

---
 dimos/msgs/sensor_msgs/Image.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index 270f99d7d8..a0cdf5949e 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -263,9 +263,27 @@ def copy(self) -> "Image":
     def to_cpu(self) -> "Image":
         if isinstance(self._impl, NumpyImage):
             return self.copy()
+
+        # Special handling for formats with alpha channel
+        if self._impl.format in (ImageFormat.RGBA, ImageFormat.BGRA):
+            data = self._impl.data.get()  # CuPy array to NumPy
+        else:
+            # Use existing to_opencv() logic for other formats
+            bgr_data = np.asarray(self._impl.to_opencv())
+
+            if self._impl.format == ImageFormat.RGB:
+                data = cv2.cvtColor(bgr_data, cv2.COLOR_BGR2RGB)
+            elif self._impl.format == ImageFormat.GRAY:
+                if bgr_data.ndim == 3:
+                    data = cv2.cvtColor(bgr_data, cv2.COLOR_BGR2GRAY)
+                else:
+                    data = bgr_data
+            else:  # BGR, DEPTH, DEPTH16, GRAY16
+                data = bgr_data
+
         return Image(
             NumpyImage(
-                np.asarray(self._impl.to_opencv()),
+                data,
                 self._impl.format,
                 self._impl.frame_id,
                 self._impl.ts,

From 80d6f4fd2ce8b290848afc91f02d7a346753faec Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 14:10:56 -0700
Subject: [PATCH 12/16] Major rewrite of image backend tests, seperated
 cuda/cpu for testing in CI

---
 .../image_impls/test_image_backends.py        | 530 ++++++++++++------
 1 file changed, 374 insertions(+), 156 deletions(-)

diff --git a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
index 59217719dc..21c0286e49 100644
--- a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
+++ b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
@@ -23,6 +23,12 @@
 
 IMAGE_PATH = get_data("chair-image.png")
 
+# Print whether we're running in CPU or GPU mode
+if HAS_CUDA:
+    print("Running image backend tests with CUDA/CuPy support (GPU mode)")
+else:
+    print("Running image backend tests in CPU-only mode")
+
 
 def _load_chair_image() -> np.ndarray:
     img = cv2.imread(IMAGE_PATH, cv2.IMREAD_UNCHANGED)
@@ -79,17 +85,25 @@ def _prepare_image(fmt: ImageFormat, shape=None) -> np.ndarray:
 
 @pytest.fixture
 def alloc_timer(request):
+    """Helper fixture for adaptive testing with optional GPU support."""
+
     def _alloc(
-        arr: np.ndarray, fmt: ImageFormat, *, to_cuda: bool = True, label: str | None = None
+        arr: np.ndarray, fmt: ImageFormat, *, to_cuda: bool = None, label: str | None = None
     ):
         tag = label or request.node.name
+
+        # Always create CPU image
         start = time.perf_counter()
-        cpu = Image.from_numpy(arr, format=fmt)
+        cpu = Image.from_numpy(arr, format=fmt, to_cuda=False)
         cpu_time = time.perf_counter() - start
 
+        # Optionally create GPU image if CUDA is available
         gpu = None
         gpu_time = None
-        if to_cuda:
+        if to_cuda is None:
+            to_cuda = HAS_CUDA
+
+        if to_cuda and HAS_CUDA:
             arr_gpu = np.array(arr, copy=True)
             start = time.perf_counter()
             gpu = Image.from_numpy(arr_gpu, format=fmt, to_cuda=True)
@@ -113,100 +127,136 @@ def _alloc(
         ((64, 64), ImageFormat.GRAY),
     ],
 )
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_color_conversions_parity(shape, fmt, alloc_timer):
+def test_color_conversions(shape, fmt, alloc_timer):
+    """Test color conversions with NumpyImage always, add CudaImage parity when available."""
     arr = _prepare_image(fmt, shape)
-    # Build CPU and CUDA images with same logical content (timed allocations)
     cpu, gpu, _, _ = alloc_timer(arr, fmt)
 
-    # Test to_rgb -> to_bgr parity
+    # Always test CPU backend
     cpu_round = cpu.to_rgb().to_bgr().to_opencv()
-    gpu_round = gpu.to_rgb().to_bgr().to_opencv()
-
-    assert cpu_round.shape == gpu_round.shape
-    assert cpu_round.dtype == gpu_round.dtype
-    # Exact match for uint8 color ops
-    assert np.array_equal(cpu_round, gpu_round)
-
-
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_grayscale_parity(alloc_timer):
+    assert cpu_round.shape[0] == shape[0]
+    assert cpu_round.shape[1] == shape[1]
+    assert cpu_round.shape[2] == 3  # to_opencv always returns BGR (3 channels)
+    assert cpu_round.dtype == np.uint8
+
+    # Optionally test GPU parity when CUDA is available
+    if gpu is not None:
+        gpu_round = gpu.to_rgb().to_bgr().to_opencv()
+        assert gpu_round.shape == cpu_round.shape
+        assert gpu_round.dtype == cpu_round.dtype
+        # Exact match for uint8 color ops
+        assert np.array_equal(cpu_round, gpu_round)
+
+
+def test_grayscale(alloc_timer):
+    """Test grayscale conversion with NumpyImage always, add CudaImage parity when available."""
     arr = _prepare_image(ImageFormat.BGR, (48, 32, 3))
     cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR)
 
+    # Always test CPU backend
     cpu_gray = cpu.to_grayscale().to_opencv()
-    gpu_gray = gpu.to_grayscale().to_opencv()
+    assert cpu_gray.shape == (48, 32)  # Grayscale has no channel dimension in OpenCV
+    assert cpu_gray.dtype == np.uint8
 
-    assert cpu_gray.shape == gpu_gray.shape
-    assert cpu_gray.dtype == gpu_gray.dtype
-    # Allow tiny rounding differences (<=1 LSB) — visually indistinguishable
-    diff = np.abs(cpu_gray.astype(np.int16) - gpu_gray.astype(np.int16))
-    assert diff.max() <= 1
+    # Optionally test GPU parity when CUDA is available
+    if gpu is not None:
+        gpu_gray = gpu.to_grayscale().to_opencv()
+        assert gpu_gray.shape == cpu_gray.shape
+        assert gpu_gray.dtype == cpu_gray.dtype
+        # Allow tiny rounding differences (<=1 LSB) — visually indistinguishable
+        diff = np.abs(cpu_gray.astype(np.int16) - gpu_gray.astype(np.int16))
+        assert diff.max() <= 1
 
 
 @pytest.mark.parametrize("fmt", [ImageFormat.BGR, ImageFormat.RGB, ImageFormat.BGRA])
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_resize_parity(fmt, alloc_timer):
+def test_resize(fmt, alloc_timer):
+    """Test resize with NumpyImage always, add CudaImage parity when available."""
     shape = (60, 80, 3) if fmt in (ImageFormat.BGR, ImageFormat.RGB) else (60, 80, 4)
     arr = _prepare_image(fmt, shape)
     cpu, gpu, _, _ = alloc_timer(arr, fmt)
 
     new_w, new_h = 37, 53
-    cpu_res = cpu.resize(new_w, new_h).to_opencv()
-    gpu_res = gpu.resize(new_w, new_h).to_opencv()
-
-    assert cpu_res.shape == gpu_res.shape
-    assert cpu_res.dtype == gpu_res.dtype
-    # Allow small tolerance due to float interpolation differences
-    assert np.max(np.abs(cpu_res.astype(np.int16) - gpu_res.astype(np.int16))) <= 1
 
-
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_perf_compare_alloc(alloc_timer):
+    # Always test CPU backend
+    cpu_res = cpu.resize(new_w, new_h).to_opencv()
+    assert (
+        cpu_res.shape == (53, 37, 3) if fmt != ImageFormat.BGRA else (53, 37, 3)
+    )  # to_opencv drops alpha
+    assert cpu_res.dtype == np.uint8
+
+    # Optionally test GPU parity when CUDA is available
+    if gpu is not None:
+        gpu_res = gpu.resize(new_w, new_h).to_opencv()
+        assert gpu_res.shape == cpu_res.shape
+        assert gpu_res.dtype == cpu_res.dtype
+        # Allow small tolerance due to float interpolation differences
+        assert np.max(np.abs(cpu_res.astype(np.int16) - gpu_res.astype(np.int16))) <= 1
+
+
+def test_perf_alloc(alloc_timer):
+    """Test allocation performance with NumpyImage always, add CudaImage when available."""
     arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
-    alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_alloc-setup")
+    alloc_timer(arr, ImageFormat.BGR, label="test_perf_alloc-setup")
 
     runs = 5
+
+    # Always test CPU allocation
     t0 = time.perf_counter()
     for _ in range(runs):
-        _ = Image.from_numpy(arr, format=ImageFormat.BGR)
+        _ = Image.from_numpy(arr, format=ImageFormat.BGR, to_cuda=False)
     cpu_t = (time.perf_counter() - t0) / runs
-    t0 = time.perf_counter()
-    for _ in range(runs):
-        _ = Image.from_numpy(arr, format=ImageFormat.BGR, to_cuda=True)
-    gpu_t = (time.perf_counter() - t0) / runs
-    print(f"alloc (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
-    assert cpu_t > 0 and gpu_t > 0
+    assert cpu_t > 0
+
+    # Optionally test GPU allocation when CUDA is available
+    if HAS_CUDA:
+        t0 = time.perf_counter()
+        for _ in range(runs):
+            _ = Image.from_numpy(arr, format=ImageFormat.BGR, to_cuda=True)
+        gpu_t = (time.perf_counter() - t0) / runs
+        print(f"alloc (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+        assert gpu_t > 0
+    else:
+        print(f"alloc (avg per call) cpu={cpu_t:.6f}s")
 
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_sharpness_parity(alloc_timer):
+def test_sharpness(alloc_timer):
+    """Test sharpness computation with NumpyImage always, add CudaImage parity when available."""
     arr = _prepare_image(ImageFormat.BGR, (64, 64, 3))
     cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR)
 
+    # Always test CPU backend
     s_cpu = cpu.sharpness()
-    s_gpu = gpu.sharpness()
+    assert s_cpu >= 0  # Sharpness should be non-negative
+    assert s_cpu < 1000  # Reasonable upper bound
 
-    # Values should be very close; minor border/rounding differences allowed
-    assert abs(s_cpu - s_gpu) < 5e-2
+    # Optionally test GPU parity when CUDA is available
+    if gpu is not None:
+        s_gpu = gpu.sharpness()
+        # Values should be very close; minor border/rounding differences allowed
+        assert abs(s_cpu - s_gpu) < 5e-2
 
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_to_opencv_parity(alloc_timer):
+def test_to_opencv(alloc_timer):
+    """Test to_opencv conversion with NumpyImage always, add CudaImage parity when available."""
     # BGRA should drop alpha and produce BGR
     arr = _prepare_image(ImageFormat.BGRA, (32, 32, 4))
     cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGRA)
 
+    # Always test CPU backend
     cpu_bgr = cpu.to_opencv()
-    gpu_bgr = gpu.to_opencv()
-
     assert cpu_bgr.shape == (32, 32, 3)
-    assert gpu_bgr.shape == (32, 32, 3)
-    assert np.array_equal(cpu_bgr, gpu_bgr)
+    assert cpu_bgr.dtype == np.uint8
 
+    # Optionally test GPU parity when CUDA is available
+    if gpu is not None:
+        gpu_bgr = gpu.to_opencv()
+        assert gpu_bgr.shape == cpu_bgr.shape
+        assert gpu_bgr.dtype == cpu_bgr.dtype
+        assert np.array_equal(cpu_bgr, gpu_bgr)
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_solve_pnp_parity(alloc_timer):
+
+def test_solve_pnp(alloc_timer):
+    """Test solve_pnp with NumpyImage always, add CudaImage parity when available."""
     # Synthetic camera and 3D points
     K = np.array([[400.0, 0.0, 32.0], [0.0, 400.0, 24.0], [0.0, 0.0, 1.0]], dtype=np.float64)
     dist = None
@@ -231,76 +281,110 @@ def test_solve_pnp_parity(alloc_timer):
     base_bgr = _prepare_image(ImageFormat.BGR, (48, 64, 3))
     cpu, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR)
 
+    # Always test CPU backend
     ok_cpu, r_cpu, t_cpu = cpu.solve_pnp(obj, img_pts, K, dist)
-    ok_gpu, r_gpu, t_gpu = gpu.solve_pnp(obj, img_pts, K, dist)
+    assert ok_cpu
 
-    assert ok_cpu and ok_gpu
-    # Validate reprojection error for CUDA solver
+    # Validate reprojection error for CPU solver
     proj_cpu, _ = cv2.projectPoints(obj, r_cpu, t_cpu, K, dist)
-    proj_gpu, _ = cv2.projectPoints(obj, r_gpu, t_gpu, K, dist)
     proj_cpu = proj_cpu.reshape(-1, 2)
-    proj_gpu = proj_gpu.reshape(-1, 2)
-    err_gpu = np.linalg.norm(proj_gpu - img_pts, axis=1)
-    assert err_gpu.mean() < 1e-3
-    assert err_gpu.max() < 1e-2
+    err_cpu = np.linalg.norm(proj_cpu - img_pts, axis=1)
+    assert err_cpu.mean() < 1e-3
+    assert err_cpu.max() < 1e-2
 
+    # Optionally test GPU parity when CUDA is available
+    if gpu is not None:
+        ok_gpu, r_gpu, t_gpu = gpu.solve_pnp(obj, img_pts, K, dist)
+        assert ok_gpu
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_perf_compare_grayscale(alloc_timer):
+        # Validate reprojection error for GPU solver
+        proj_gpu, _ = cv2.projectPoints(obj, r_gpu, t_gpu, K, dist)
+        proj_gpu = proj_gpu.reshape(-1, 2)
+        err_gpu = np.linalg.norm(proj_gpu - img_pts, axis=1)
+        assert err_gpu.mean() < 1e-3
+        assert err_gpu.max() < 1e-2
+
+
+def test_perf_grayscale(alloc_timer):
+    """Test grayscale performance with NumpyImage always, add CudaImage when available."""
     arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
-    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_grayscale-setup")
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_grayscale-setup")
 
     runs = 10
+
+    # Always test CPU performance
     t0 = time.perf_counter()
     for _ in range(runs):
         _ = cpu.to_grayscale()
     cpu_t = (time.perf_counter() - t0) / runs
-    t0 = time.perf_counter()
-    for _ in range(runs):
-        _ = gpu.to_grayscale()
-    gpu_t = (time.perf_counter() - t0) / runs
-    print(f"grayscale (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
-    assert cpu_t > 0 and gpu_t > 0
+    assert cpu_t > 0
+
+    # Optionally test GPU performance when CUDA is available
+    if gpu is not None:
+        t0 = time.perf_counter()
+        for _ in range(runs):
+            _ = gpu.to_grayscale()
+        gpu_t = (time.perf_counter() - t0) / runs
+        print(f"grayscale (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+        assert gpu_t > 0
+    else:
+        print(f"grayscale (avg per call) cpu={cpu_t:.6f}s")
 
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_perf_compare_resize(alloc_timer):
+def test_perf_resize(alloc_timer):
+    """Test resize performance with NumpyImage always, add CudaImage when available."""
     arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
-    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_resize-setup")
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_resize-setup")
 
     runs = 5
+
+    # Always test CPU performance
     t0 = time.perf_counter()
     for _ in range(runs):
         _ = cpu.resize(320, 240)
     cpu_t = (time.perf_counter() - t0) / runs
-    t0 = time.perf_counter()
-    for _ in range(runs):
-        _ = gpu.resize(320, 240)
-    gpu_t = (time.perf_counter() - t0) / runs
-    print(f"resize (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
-    assert cpu_t > 0 and gpu_t > 0
+    assert cpu_t > 0
+
+    # Optionally test GPU performance when CUDA is available
+    if gpu is not None:
+        t0 = time.perf_counter()
+        for _ in range(runs):
+            _ = gpu.resize(320, 240)
+        gpu_t = (time.perf_counter() - t0) / runs
+        print(f"resize (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+        assert gpu_t > 0
+    else:
+        print(f"resize (avg per call) cpu={cpu_t:.6f}s")
 
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_perf_compare_sharpness(alloc_timer):
+def test_perf_sharpness(alloc_timer):
+    """Test sharpness performance with NumpyImage always, add CudaImage when available."""
     arr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
-    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_compare_sharpness-setup")
+    cpu, gpu, _, _ = alloc_timer(arr, ImageFormat.BGR, label="test_perf_sharpness-setup")
 
     runs = 3
+
+    # Always test CPU performance
     t0 = time.perf_counter()
     for _ in range(runs):
         _ = cpu.sharpness()
     cpu_t = (time.perf_counter() - t0) / runs
-    t0 = time.perf_counter()
-    for _ in range(runs):
-        _ = gpu.sharpness()
-    gpu_t = (time.perf_counter() - t0) / runs
-    print(f"sharpness (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
-    assert cpu_t > 0 and gpu_t > 0
+    assert cpu_t > 0
+
+    # Optionally test GPU performance when CUDA is available
+    if gpu is not None:
+        t0 = time.perf_counter()
+        for _ in range(runs):
+            _ = gpu.sharpness()
+        gpu_t = (time.perf_counter() - t0) / runs
+        print(f"sharpness (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+        assert gpu_t > 0
+    else:
+        print(f"sharpness (avg per call) cpu={cpu_t:.6f}s")
 
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_perf_compare_solvepnp(alloc_timer):
+def test_perf_solvepnp(alloc_timer):
+    """Test solve_pnp performance with NumpyImage always, add CudaImage when available."""
     K = np.array([[600.0, 0.0, 320.0], [0.0, 600.0, 240.0], [0.0, 0.0, 1.0]], dtype=np.float64)
     dist = None
     rng = np.random.default_rng(123)
@@ -310,25 +394,33 @@ def test_perf_compare_solvepnp(alloc_timer):
     img_pts, _ = cv2.projectPoints(obj, rvec_true, tvec_true, K, dist)
     img_pts = img_pts.reshape(-1, 2).astype(np.float32)
     base_bgr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
-    cpu, gpu, _, _ = alloc_timer(
-        base_bgr, ImageFormat.BGR, label="test_perf_compare_solvepnp-setup"
-    )
+    cpu, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR, label="test_perf_solvepnp-setup")
 
     runs = 5
+
+    # Always test CPU performance
     t0 = time.perf_counter()
     for _ in range(runs):
         _ = cpu.solve_pnp(obj, img_pts, K, dist)
     cpu_t = (time.perf_counter() - t0) / runs
-    t0 = time.perf_counter()
-    for _ in range(runs):
-        _ = gpu.solve_pnp(obj, img_pts, K, dist)
-    gpu_t = (time.perf_counter() - t0) / runs
-    print(f"solvePnP (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
-    assert cpu_t > 0 and gpu_t > 0
+    assert cpu_t > 0
+
+    # Optionally test GPU performance when CUDA is available
+    if gpu is not None:
+        t0 = time.perf_counter()
+        for _ in range(runs):
+            _ = gpu.solve_pnp(obj, img_pts, K, dist)
+        gpu_t = (time.perf_counter() - t0) / runs
+        print(f"solvePnP (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+        assert gpu_t > 0
+    else:
+        print(f"solvePnP (avg per call) cpu={cpu_t:.6f}s")
 
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_perf_compare_tracker(alloc_timer):
+def test_perf_tracker(alloc_timer):
+    """Test tracker performance with NumpyImage always, add CudaImage when available."""
+    # Don't check - just let it fail if CSRT isn't available
+
     H, W = 240, 320
     img_base = _prepare_image(ImageFormat.BGR, (H, W, 3))
     img1 = img_base.copy()
@@ -344,34 +436,35 @@ def test_perf_compare_tracker(alloc_timer):
         (255, 255, 255),
         thickness=-1,
     )
-    cpu1, gpu1, _, _ = alloc_timer(img1, ImageFormat.BGR, label="test_perf_compare_tracker-frame1")
-    cpu2, gpu2, _, _ = alloc_timer(img2, ImageFormat.BGR, label="test_perf_compare_tracker-frame2")
+    cpu1, gpu1, _, _ = alloc_timer(img1, ImageFormat.BGR, label="test_perf_tracker-frame1")
+    cpu2, gpu2, _, _ = alloc_timer(img2, ImageFormat.BGR, label="test_perf_tracker-frame2")
+
+    # Always test CPU tracker
     trk_cpu = cpu1.create_csrt_tracker(bbox0)
-    trk_gpu = gpu1.create_csrt_tracker(bbox0)
 
     runs = 10
     t0 = time.perf_counter()
     for _ in range(runs):
         _ = cpu2.csrt_update(trk_cpu)
     cpu_t = (time.perf_counter() - t0) / runs
-    t0 = time.perf_counter()
-    for _ in range(runs):
-        _ = gpu2.csrt_update(trk_gpu)
-    gpu_t = (time.perf_counter() - t0) / runs
-    print(f"tracker (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
-    assert cpu_t > 0 and gpu_t > 0
+    assert cpu_t > 0
+
+    # Optionally test GPU performance when CUDA is available
+    if gpu1 is not None and gpu2 is not None:
+        trk_gpu = gpu1.create_csrt_tracker(bbox0)
+        t0 = time.perf_counter()
+        for _ in range(runs):
+            _ = gpu2.csrt_update(trk_gpu)
+        gpu_t = (time.perf_counter() - t0) / runs
+        print(f"tracker (avg per call) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s")
+        assert gpu_t > 0
+    else:
+        print(f"tracker (avg per call) cpu={cpu_t:.6f}s")
 
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_csrt_tracker_parity(alloc_timer):
-    # Check tracker availability
-    has_csrt = False
-    if hasattr(cv2, "legacy") and hasattr(cv2.legacy, "TrackerCSRT_create"):
-        has_csrt = True
-    elif hasattr(cv2, "TrackerCSRT_create"):
-        has_csrt = True
-    if not has_csrt:
-        pytest.skip("OpenCV CSRT tracker not available")
+def test_csrt_tracker(alloc_timer):
+    """Test CSRT tracker with NumpyImage always, add CudaImage parity when available."""
+    # Don't check - just let it fail if CSRT isn't available
 
     H, W = 100, 100
     # Create two frames with a moving rectangle
@@ -392,25 +485,31 @@ def test_csrt_tracker_parity(alloc_timer):
         thickness=-1,
     )
 
-    cpu1, gpu1, _, _ = alloc_timer(img1, ImageFormat.BGR, label="test_csrt_tracker_parity-frame1")
-    cpu2, gpu2, _, _ = alloc_timer(img2, ImageFormat.BGR, label="test_csrt_tracker_parity-frame2")
+    cpu1, gpu1, _, _ = alloc_timer(img1, ImageFormat.BGR, label="test_csrt_tracker-frame1")
+    cpu2, gpu2, _, _ = alloc_timer(img2, ImageFormat.BGR, label="test_csrt_tracker-frame2")
 
+    # Always test CPU tracker
     trk_cpu = cpu1.create_csrt_tracker(bbox0)
     ok_cpu, bbox_cpu = cpu2.csrt_update(trk_cpu)
-    trk_gpu = gpu1.create_csrt_tracker(bbox0)
-    ok_gpu, bbox_gpu = gpu2.csrt_update(trk_gpu)
+    assert ok_cpu
 
-    assert ok_cpu and ok_gpu
-    # Compare both to ground-truth expected bbox
+    # Compare to ground-truth expected bbox
     expected = (x0 + dx, y0 + dy, w0, h0)
     err_cpu = sum(abs(a - b) for a, b in zip(bbox_cpu, expected))
-    err_gpu = sum(abs(a - b) for a, b in zip(bbox_gpu, expected))
     assert err_cpu <= 8
-    assert err_gpu <= 10  # allow some slack for scale/window effects
 
+    # Optionally test GPU parity when CUDA is available
+    if gpu1 is not None and gpu2 is not None:
+        trk_gpu = gpu1.create_csrt_tracker(bbox0)
+        ok_gpu, bbox_gpu = gpu2.csrt_update(trk_gpu)
+        assert ok_gpu
 
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_solve_pnp_ransac_with_outliers_and_distortion(alloc_timer):
+        err_gpu = sum(abs(a - b) for a, b in zip(bbox_gpu, expected))
+        assert err_gpu <= 10  # allow some slack for scale/window effects
+
+
+def test_solve_pnp_ransac(alloc_timer):
+    """Test solve_pnp_ransac with NumpyImage always, add CudaImage when available."""
     # Camera with distortion
     K = np.array([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]], dtype=np.float64)
     dist = np.array([0.1, -0.05, 0.001, 0.001, 0.0], dtype=np.float64)
@@ -428,25 +527,45 @@ def test_solve_pnp_ransac_with_outliers_and_distortion(alloc_timer):
     img_pts = img_pts.astype(np.float32)
 
     base_bgr = _prepare_image(ImageFormat.BGR, (480, 640, 3))
-    _, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR, label="test_solve_pnp_ransac-setup")
+    cpu, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR, label="test_solve_pnp_ransac-setup")
 
-    ok_gpu, r_gpu, t_gpu, mask_gpu = gpu.solve_pnp_ransac(
+    # Always test CPU backend
+    ok_cpu, r_cpu, t_cpu, mask_cpu = cpu.solve_pnp_ransac(
         obj, img_pts, K, dist, iterations_count=150, reprojection_error=3.0
     )
-    assert ok_gpu
-    inlier_ratio = mask_gpu.mean()
+    assert ok_cpu
+    inlier_ratio = mask_cpu.mean()
     assert inlier_ratio > 0.7
+
     # Reprojection error on inliers
-    in_idx = np.nonzero(mask_gpu)[0]
-    proj_gpu, _ = cv2.projectPoints(obj[in_idx], r_gpu, t_gpu, K, dist)
-    proj_gpu = proj_gpu.reshape(-1, 2)
-    err = np.linalg.norm(proj_gpu - img_pts[in_idx], axis=1)
+    in_idx = np.nonzero(mask_cpu)[0]
+    proj_cpu, _ = cv2.projectPoints(obj[in_idx], r_cpu, t_cpu, K, dist)
+    proj_cpu = proj_cpu.reshape(-1, 2)
+    err = np.linalg.norm(proj_cpu - img_pts[in_idx], axis=1)
     assert err.mean() < 1.5
     assert err.max() < 4.0
 
-
-@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
-def test_solve_pnp_batch_correctness_and_perf(alloc_timer):
+    # Optionally test GPU parity when CUDA is available
+    if gpu is not None:
+        ok_gpu, r_gpu, t_gpu, mask_gpu = gpu.solve_pnp_ransac(
+            obj, img_pts, K, dist, iterations_count=150, reprojection_error=3.0
+        )
+        assert ok_gpu
+        inlier_ratio_gpu = mask_gpu.mean()
+        assert inlier_ratio_gpu > 0.7
+
+        # Reprojection error on inliers for GPU
+        in_idx_gpu = np.nonzero(mask_gpu)[0]
+        proj_gpu, _ = cv2.projectPoints(obj[in_idx_gpu], r_gpu, t_gpu, K, dist)
+        proj_gpu = proj_gpu.reshape(-1, 2)
+        err_gpu = np.linalg.norm(proj_gpu - img_pts[in_idx_gpu], axis=1)
+        assert err_gpu.mean() < 1.5
+        assert err_gpu.max() < 4.0
+
+
+def test_solve_pnp_batch(alloc_timer):
+    """Test solve_pnp batch processing with NumpyImage always, add CudaImage when available."""
+    # Note: Batch processing is primarily a GPU feature, but we can still test CPU loop
     # Generate batched problems
     B, N = 8, 50
     rng = np.random.default_rng(99)
@@ -464,8 +583,7 @@ def test_solve_pnp_batch_correctness_and_perf(alloc_timer):
     base_bgr = _prepare_image(ImageFormat.BGR, (10, 10, 3))
     cpu, gpu, _, _ = alloc_timer(base_bgr, ImageFormat.BGR, label="test_solve_pnp_batch-setup")
 
-    # CPU loop
-
+    # Always test CPU loop
     t0 = time.perf_counter()
     r_list = []
     t_list = []
@@ -477,20 +595,30 @@ def test_solve_pnp_batch_correctness_and_perf(alloc_timer):
     cpu_total = time.perf_counter() - t0
     cpu_t = cpu_total / B
 
-    # CUDA batched
-    t0 = time.perf_counter()
-    r_b, t_b = gpu.solve_pnp_batch(obj, img, K)
-    gpu_total = time.perf_counter() - t0
-    gpu_t = gpu_total / B
-    print(f"solvePnP-batch (avg per pose) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s (B={B}, N={N})")
-
-    # Check reprojection for a couple of batches
-    for b in range(min(B, 4)):
-        proj, _ = cv2.projectPoints(obj[b], r_b[b], t_b[b], K, None)
+    # Check reprojection for CPU results
+    for b in range(min(B, 2)):
+        proj, _ = cv2.projectPoints(obj[b], r_list[b], t_list[b], K, None)
         err = np.linalg.norm(proj.reshape(-1, 2) - img[b], axis=1)
         assert err.mean() < 1e-2
         assert err.max() < 1e-1
 
+    # Optionally test GPU batch when CUDA is available
+    if gpu is not None and hasattr(gpu._impl, "solve_pnp_batch"):
+        t0 = time.perf_counter()
+        r_b, t_b = gpu.solve_pnp_batch(obj, img, K)
+        gpu_total = time.perf_counter() - t0
+        gpu_t = gpu_total / B
+        print(f"solvePnP-batch (avg per pose) cpu={cpu_t:.6f}s gpu={gpu_t:.6f}s (B={B}, N={N})")
+
+        # Check reprojection for GPU batches
+        for b in range(min(B, 4)):
+            proj, _ = cv2.projectPoints(obj[b], r_b[b], t_b[b], K, None)
+            err = np.linalg.norm(proj.reshape(-1, 2) - img[b], axis=1)
+            assert err.mean() < 1e-2
+            assert err.max() < 1e-1
+    else:
+        print(f"solvePnP-batch (avg per pose) cpu={cpu_t:.6f}s (GPU batch not available)")
+
 
 def test_nvimgcodec_flag_and_fallback(monkeypatch):
     # Force nvimgcodec flag on, then reload Image and ensure fallback works
@@ -532,3 +660,93 @@ def test_nvimgcodec_gpu_path(monkeypatch):
     # Some builds may import nvimgcodec but not support CuPy device buffers; allow skip
     if not getattr(ImageMod, "NVIMGCODEC_LAST_USED", False):
         pytest.skip("nvimgcodec present but encode fell back to CPU in this environment")
+
+
+@pytest.mark.skipif(not HAS_CUDA, reason="CuPy/CUDA not available")
+def test_to_cpu_format_preservation():
+    """Test that to_cpu() preserves image format correctly.
+
+    This tests the fix for the bug where to_cpu() was using to_opencv()
+    which always returns BGR, but keeping the original format label.
+    """
+    # Test RGB format preservation
+    rgb_array = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+    gpu_img_rgb = Image.from_numpy(rgb_array, format=ImageFormat.RGB, to_cuda=True)
+    cpu_img_rgb = gpu_img_rgb.to_cpu()
+
+    # Verify format is preserved
+    assert cpu_img_rgb.format == ImageFormat.RGB, (
+        f"Format mismatch: expected RGB, got {cpu_img_rgb.format}"
+    )
+    # Verify data is actually in RGB format (not BGR)
+    np.testing.assert_array_equal(cpu_img_rgb.data, rgb_array)
+
+    # Test RGBA format preservation
+    rgba_array = np.random.randint(0, 255, (100, 100, 4), dtype=np.uint8)
+    gpu_img_rgba = Image.from_numpy(rgba_array, format=ImageFormat.RGBA, to_cuda=True)
+    cpu_img_rgba = gpu_img_rgba.to_cpu()
+
+    assert cpu_img_rgba.format == ImageFormat.RGBA, (
+        f"Format mismatch: expected RGBA, got {cpu_img_rgba.format}"
+    )
+    np.testing.assert_array_equal(cpu_img_rgba.data, rgba_array)
+
+    # Test BGR format (should be unchanged since to_opencv returns BGR)
+    bgr_array = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
+    gpu_img_bgr = Image.from_numpy(bgr_array, format=ImageFormat.BGR, to_cuda=True)
+    cpu_img_bgr = gpu_img_bgr.to_cpu()
+
+    assert cpu_img_bgr.format == ImageFormat.BGR, (
+        f"Format mismatch: expected BGR, got {cpu_img_bgr.format}"
+    )
+    np.testing.assert_array_equal(cpu_img_bgr.data, bgr_array)
+
+    # Test BGRA format
+    bgra_array = np.random.randint(0, 255, (100, 100, 4), dtype=np.uint8)
+    gpu_img_bgra = Image.from_numpy(bgra_array, format=ImageFormat.BGRA, to_cuda=True)
+    cpu_img_bgra = gpu_img_bgra.to_cpu()
+
+    assert cpu_img_bgra.format == ImageFormat.BGRA, (
+        f"Format mismatch: expected BGRA, got {cpu_img_bgra.format}"
+    )
+    np.testing.assert_array_equal(cpu_img_bgra.data, bgra_array)
+
+    # Test GRAY format
+    gray_array = np.random.randint(0, 255, (100, 100), dtype=np.uint8)
+    gpu_img_gray = Image.from_numpy(gray_array, format=ImageFormat.GRAY, to_cuda=True)
+    cpu_img_gray = gpu_img_gray.to_cpu()
+
+    assert cpu_img_gray.format == ImageFormat.GRAY, (
+        f"Format mismatch: expected GRAY, got {cpu_img_gray.format}"
+    )
+    np.testing.assert_array_equal(cpu_img_gray.data, gray_array)
+
+    # Test DEPTH format (float32)
+    depth_array = np.random.uniform(0.5, 10.0, (100, 100)).astype(np.float32)
+    gpu_img_depth = Image.from_numpy(depth_array, format=ImageFormat.DEPTH, to_cuda=True)
+    cpu_img_depth = gpu_img_depth.to_cpu()
+
+    assert cpu_img_depth.format == ImageFormat.DEPTH, (
+        f"Format mismatch: expected DEPTH, got {cpu_img_depth.format}"
+    )
+    np.testing.assert_array_equal(cpu_img_depth.data, depth_array)
+
+    # Test DEPTH16 format (uint16)
+    depth16_array = np.random.randint(100, 65000, (100, 100), dtype=np.uint16)
+    gpu_img_depth16 = Image.from_numpy(depth16_array, format=ImageFormat.DEPTH16, to_cuda=True)
+    cpu_img_depth16 = gpu_img_depth16.to_cpu()
+
+    assert cpu_img_depth16.format == ImageFormat.DEPTH16, (
+        f"Format mismatch: expected DEPTH16, got {cpu_img_depth16.format}"
+    )
+    np.testing.assert_array_equal(cpu_img_depth16.data, depth16_array)
+
+    # Test GRAY16 format (uint16)
+    gray16_array = np.random.randint(0, 65535, (100, 100), dtype=np.uint16)
+    gpu_img_gray16 = Image.from_numpy(gray16_array, format=ImageFormat.GRAY16, to_cuda=True)
+    cpu_img_gray16 = gpu_img_gray16.to_cpu()
+
+    assert cpu_img_gray16.format == ImageFormat.GRAY16, (
+        f"Format mismatch: expected GRAY16, got {cpu_img_gray16.format}"
+    )
+    np.testing.assert_array_equal(cpu_img_gray16.data, gray16_array)

From 68a49857ee73284d16524abbff4b6cf2b62c9040 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 14:11:23 -0700
Subject: [PATCH 13/16] Minor changes

---
 .../image_impls/test_image_backend_utils.py           | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
index 3adde5b631..fe187ce208 100644
--- a/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
+++ b/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
@@ -16,6 +16,17 @@
 import pytest
 
 from dimos.msgs.sensor_msgs import Image, ImageFormat
+
+# Print whether we're running in CPU or GPU mode
+try:
+    import cupy as cp
+
+    HAS_CUDA = True
+    print("Running image backend utils tests with CUDA/CuPy support (GPU mode)")
+except:
+    HAS_CUDA = False
+    print("Running image backend utils tests in CPU-only mode")
+
 from dimos.perception.common.utils import (
     rectify_image,
     project_3d_points_to_2d,

From 084edff11e26b6bd460e083c49093ef55cc8a19d Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 14:39:21 -0700
Subject: [PATCH 14/16] Fixes

---
 dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py | 1 -
 dimos/msgs/sensor_msgs/image_impls/test_image_backends.py      | 1 -
 2 files changed, 2 deletions(-)

diff --git a/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
index fe187ce208..810cedf5f1 100644
--- a/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
+++ b/dimos/msgs/sensor_msgs/image_impls/test_image_backend_utils.py
@@ -17,7 +17,6 @@
 
 from dimos.msgs.sensor_msgs import Image, ImageFormat
 
-# Print whether we're running in CPU or GPU mode
 try:
     import cupy as cp
 
diff --git a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
index 21c0286e49..be18640fcc 100644
--- a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
+++ b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
@@ -23,7 +23,6 @@
 
 IMAGE_PATH = get_data("chair-image.png")
 
-# Print whether we're running in CPU or GPU mode
 if HAS_CUDA:
     print("Running image backend tests with CUDA/CuPy support (GPU mode)")
 else:

From 90fcc587a04cc6fd89e747da5d8d9624e3c7ed08 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 10 Oct 2025 15:40:58 -0700
Subject: [PATCH 15/16] Simplify to_cpu Image operation

---
 dimos/msgs/sensor_msgs/Image.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index a0cdf5949e..3d50346924 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -264,22 +264,7 @@ def to_cpu(self) -> "Image":
         if isinstance(self._impl, NumpyImage):
             return self.copy()
 
-        # Special handling for formats with alpha channel
-        if self._impl.format in (ImageFormat.RGBA, ImageFormat.BGRA):
-            data = self._impl.data.get()  # CuPy array to NumPy
-        else:
-            # Use existing to_opencv() logic for other formats
-            bgr_data = np.asarray(self._impl.to_opencv())
-
-            if self._impl.format == ImageFormat.RGB:
-                data = cv2.cvtColor(bgr_data, cv2.COLOR_BGR2RGB)
-            elif self._impl.format == ImageFormat.GRAY:
-                if bgr_data.ndim == 3:
-                    data = cv2.cvtColor(bgr_data, cv2.COLOR_BGR2GRAY)
-                else:
-                    data = bgr_data
-            else:  # BGR, DEPTH, DEPTH16, GRAY16
-                data = bgr_data
+        data = self._impl.data.get()  # CuPy array to NumPy
 
         return Image(
             NumpyImage(

From ab83dad83899957bfee987df2f7dbf785f44d661 Mon Sep 17 00:00:00 2001
From: spomichter <12108168+spomichter@users.noreply.github.com>
Date: Fri, 10 Oct 2025 22:52:04 +0000
Subject: [PATCH 16/16] CI code cleanup

---
 dimos/perception/detection2d/type/test_detection3d.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dimos/perception/detection2d/type/test_detection3d.py b/dimos/perception/detection2d/type/test_detection3d.py
index dbf06ebcc5..1f5c9cfee0 100644
--- a/dimos/perception/detection2d/type/test_detection3d.py
+++ b/dimos/perception/detection2d/type/test_detection3d.py
@@ -31,12 +31,13 @@ def test_guess_projection(get_moment_2d, publish_moment):
     # for stash
     # detection3d = Detection3D.from_2d(detection2d, 1.5, camera_info, transform)
     # print(detection3d)
-    
+
     # foxglove bridge needs 2 messages per topic to pass to foxglove
     publish_moment(moment)
     time.sleep(0.1)
     publish_moment(moment)
 
+
 def test_bounding_box_dimensions(detection3d):
     """Test bounding box dimension calculation."""
     dims = detection3d.get_bounding_box_dimensions()
@@ -146,4 +147,3 @@ def test_detection_pose(detection3d):
     assert detection3d.pose.frame_id == "world", (
         f"Expected frame_id 'world', got '{detection3d.pose.frame_id}'"
     )
-