From ea41664f1ae3c54ddb6dc0b2588813a4166183ac Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Fri, 10 Oct 2025 17:10:54 -0700
Subject: [PATCH 01/47] retry decorator, better vl model query system, json
 query, bounding box query

---
 .envrc.nix                                |   5 +
 .envrc.venv                               |   2 +
 dimos/models/vl/base.py                   | 137 +++++++++++++++
 dimos/models/vl/qwen.py                   |  10 +-
 dimos/models/vl/test_base.py              | 204 ++++++++++++++++++++++
 dimos/utils/decorators/__init__.py        |   3 +-
 dimos/utils/decorators/decorators.py      |  58 +++++-
 dimos/utils/decorators/test_decorators.py | 185 +++++++++++++++++++-
 8 files changed, 598 insertions(+), 6 deletions(-)
 create mode 100644 .envrc.nix
 create mode 100644 .envrc.venv
 create mode 100644 dimos/models/vl/test_base.py

diff --git a/.envrc.nix b/.envrc.nix
new file mode 100644
index 0000000000..4a6ade8151
--- /dev/null
+++ b/.envrc.nix
@@ -0,0 +1,5 @@
+if ! has nix_direnv_version || ! nix_direnv_version 3.0.6; then
+  source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.6/direnvrc" "sha256-RYcUJaRMf8oF5LznDrlCXbkOQrywm0HDv1VjYGaJGdM="
+fi
+use flake .
+dotenv_if_exists
diff --git a/.envrc.venv b/.envrc.venv
new file mode 100644
index 0000000000..a4b314c6f7
--- /dev/null
+++ b/.envrc.venv
@@ -0,0 +1,2 @@
+source env/bin/activate
+dotenv_if_exists
diff --git a/dimos/models/vl/base.py b/dimos/models/vl/base.py
index faab96363d..522d38ec46 100644
--- a/dimos/models/vl/base.py
+++ b/dimos/models/vl/base.py
@@ -1,10 +1,147 @@
+import json
+import re
 from abc import ABC, abstractmethod
+from typing import Union
 
 import numpy as np
 
 from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection2d.type import Detection2DBBox, ImageDetections2D
+from dimos.utils.decorators import retry
+
+
+def extract_json(response: str) -> Union[dict, list]:
+    """Extract JSON from potentially messy LLM response.
+
+    Tries multiple strategies:
+    1. Parse the entire response as JSON
+    2. Find and parse JSON arrays in the response
+    3. Find and parse JSON objects in the response
+
+    Args:
+        response: Raw text response that may contain JSON
+
+    Returns:
+        Parsed JSON object (dict or list)
+
+    Raises:
+        json.JSONDecodeError: If no valid JSON can be extracted
+    """
+    # First try to parse the whole response as JSON
+    try:
+        return json.loads(response)
+    except json.JSONDecodeError:
+        pass
+
+    # If that fails, try to extract JSON from the messy response
+    # Look for JSON arrays or objects in the text
+
+    # Pattern to match JSON arrays (including nested arrays/objects)
+    # This finds the outermost [...] structure
+    array_pattern = r'\[(?:[^\[\]]*|\[(?:[^\[\]]*|\[[^\[\]]*\])*\])*\]'
+
+    # Pattern to match JSON objects
+    object_pattern = r'\{(?:[^{}]*|\{(?:[^{}]*|\{[^{}]*\})*\})*\}'
+
+    # Try to find JSON arrays first (most common for detections)
+    matches = re.findall(array_pattern, response, re.DOTALL)
+    for match in matches:
+        try:
+            parsed = json.loads(match)
+            # For detection arrays, we expect a list
+            if isinstance(parsed, list):
+                return parsed
+        except json.JSONDecodeError:
+            continue
+
+    # Try JSON objects if no arrays found
+    matches = re.findall(object_pattern, response, re.DOTALL)
+    for match in matches:
+        try:
+            return json.loads(match)
+        except json.JSONDecodeError:
+            continue
+
+    # If nothing worked, raise an error with the original response
+    raise json.JSONDecodeError(
+        f"Could not extract valid JSON from response: {response[:200]}...",
+        response, 0
+    )
 
 
 class VlModel(ABC):
     @abstractmethod
     def query(self, image: Image | np.ndarray, query: str) -> str: ...
+
+    @retry(max_retries=2, on_exception=json.JSONDecodeError, delay=0.0)
+    def query_json(self, image: Image, query: str) -> dict:
+        response = self.query(image, query)
+        return extract_json(response)
+
+    def query_detections(self, image: Image, query: str) -> ImageDetections2D:
+        full_query = f"""show me bounding boxes in pixels for this query: `{query}`
+
+        format should be:
+        `[
+        [label, x1, y1, x2, y2]
+        ...
+        ]`
+
+        (etc, multiple matches are possible)
+
+        If there's no match return `[]`. Label is whatever you think is appropriate
+        Only respond with the coordinates, no other text."""
+
+        image_detections = ImageDetections2D(image)
+        try:
+            coords = self.query_json(image, full_query)
+        except Exception:
+            return image_detections
+
+        img_height, img_width = image.shape[:2] if image.shape else (float("inf"), float("inf"))
+
+        for track_id, detection_list in enumerate(coords):
+            if len(detection_list) != 5:
+                continue
+
+            name = detection_list[0]
+
+            # Convert to floats with error handling
+            try:
+                bbox = list(map(float, detection_list[1:]))
+            except (ValueError, TypeError):
+                print(
+                    f"Warning: Invalid bbox coordinates for detection '{name}': {detection_list[1:]}"
+                )
+                continue
+
+            # Validate bounding box
+            x1, y1, x2, y2 = bbox
+
+            # Check if coordinates are valid
+            if x2 <= x1 or y2 <= y1:
+                print(
+                    f"Warning: Invalid bbox dimensions for '{name}': x1={x1}, y1={y1}, x2={x2}, y2={y2}"
+                )
+                continue
+
+            # Clamp to image bounds if we have image dimensions
+            if image.shape:
+                x1 = max(0, min(x1, img_width))
+                y1 = max(0, min(y1, img_height))
+                x2 = max(0, min(x2, img_width))
+                y2 = max(0, min(y2, img_height))
+                bbox = [x1, y1, x2, y2]
+
+            image_detections.detections.append(
+                Detection2DBBox(
+                    bbox=bbox,
+                    track_id=track_id,
+                    class_id=-100,  # Using -100 to indicate VLModel-generated detection
+                    confidence=1.0,
+                    name=name,
+                    ts=image.ts,
+                    image=image,
+                )
+            )
+        return image_detections
diff --git a/dimos/models/vl/qwen.py b/dimos/models/vl/qwen.py
index 05ad4715c5..c34f6f7964 100644
--- a/dimos/models/vl/qwen.py
+++ b/dimos/models/vl/qwen.py
@@ -1,4 +1,5 @@
 import os
+from functools import cached_property
 from typing import Optional
 
 import numpy as np
@@ -9,19 +10,22 @@
 
 
 class QwenVlModel(VlModel):
-    _client: OpenAI
     _model_name: str
+    _api_key: Optional[str]
 
     def __init__(self, api_key: Optional[str] = None, model_name: str = "qwen2.5-vl-72b-instruct"):
         self._model_name = model_name
+        self._api_key = api_key
 
-        api_key = api_key or os.getenv("ALIBABA_API_KEY")
+    @cached_property
+    def _client(self) -> OpenAI:
+        api_key = self._api_key or os.getenv("ALIBABA_API_KEY")
         if not api_key:
             raise ValueError(
                 "Alibaba API key must be provided or set in ALIBABA_API_KEY environment variable"
             )
 
-        self._client = OpenAI(
+        return OpenAI(
             base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
             api_key=api_key,
         )
diff --git a/dimos/models/vl/test_base.py b/dimos/models/vl/test_base.py
new file mode 100644
index 0000000000..bed210a283
--- /dev/null
+++ b/dimos/models/vl/test_base.py
@@ -0,0 +1,204 @@
+import json
+import os
+from unittest.mock import MagicMock
+
+import pytest
+
+from dimos.models.vl.base import extract_json
+from dimos.models.vl.qwen import QwenVlModel
+from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection2d.type import ImageDetections2D
+from dimos.utils.data import get_data
+
+# Captured actual response from Qwen API for cafe.jpg with query "humans"
+MOCK_QWEN_RESPONSE = """
+   Here you go bro:
+
+   [
+    ["humans", 76, 368, 219, 580],
+    ["humans", 354, 372, 512, 525],
+    ["humans", 409, 370, 615, 748],
+    ["humans", 628, 350, 762, 528],
+    ["humans", 785, 323, 960, 650]
+   ]
+
+   Hope this helps!😀😊 :)"""
+
+
+def test_extract_json_clean_response():
+    """Test extract_json with clean JSON response."""
+    clean_json = '[["object", 1, 2, 3, 4]]'
+    result = extract_json(clean_json)
+    assert result == [["object", 1, 2, 3, 4]]
+
+
+def test_extract_json_with_text_before_after():
+    """Test extract_json with text before and after JSON."""
+    messy = """Here's what I found:
+    [
+        ["person", 10, 20, 30, 40],
+        ["car", 50, 60, 70, 80]
+    ]
+    Hope this helps!"""
+    result = extract_json(messy)
+    assert result == [["person", 10, 20, 30, 40], ["car", 50, 60, 70, 80]]
+
+
+def test_extract_json_with_emojis():
+    """Test extract_json with emojis and markdown code blocks."""
+    messy = """Sure! 😊 Here are the detections:
+
+    ```json
+    [["human", 100, 200, 300, 400]]
+    ```
+
+    Let me know if you need anything else! 👍"""
+    result = extract_json(messy)
+    assert result == [["human", 100, 200, 300, 400]]
+
+
+def test_extract_json_multiple_json_blocks():
+    """Test extract_json when there are multiple JSON blocks."""
+    messy = """First attempt (wrong format):
+    {"error": "not what we want"}
+
+    Correct format:
+    [
+        ["cat", 10, 10, 50, 50],
+        ["dog", 60, 60, 100, 100]
+    ]
+
+    Another block: {"also": "not needed"}"""
+    result = extract_json(messy)
+    # Should return the first valid array
+    assert result == [["cat", 10, 10, 50, 50], ["dog", 60, 60, 100, 100]]
+
+
+def test_extract_json_object():
+    """Test extract_json with JSON object instead of array."""
+    response = 'The result is: {"status": "success", "count": 5}'
+    result = extract_json(response)
+    assert result == {"status": "success", "count": 5}
+
+
+def test_extract_json_nested_structures():
+    """Test extract_json with nested arrays and objects."""
+    response = """Processing complete:
+    [
+        ["label1", 1, 2, 3, 4],
+        {"nested": {"value": 10}},
+        ["label2", 5, 6, 7, 8]
+    ]"""
+    result = extract_json(response)
+    assert result[0] == ["label1", 1, 2, 3, 4]
+    assert result[1] == {"nested": {"value": 10}}
+    assert result[2] == ["label2", 5, 6, 7, 8]
+
+
+def test_extract_json_invalid():
+    """Test extract_json raises error when no valid JSON found."""
+    response = "This response has no valid JSON at all!"
+    with pytest.raises(json.JSONDecodeError) as exc_info:
+        extract_json(response)
+    assert "Could not extract valid JSON" in str(exc_info.value)
+
+
+def test_extract_json_with_real_llm_response():
+    """Test extract_json with the actual messy response."""
+    result = extract_json(MOCK_QWEN_RESPONSE)
+    assert isinstance(result, list)
+    assert len(result) == 5
+    assert result[0] == ["humans", 76, 368, 219, 580]
+    assert result[-1] == ["humans", 785, 323, 960, 650]
+
+
+def test_query_detections_mocked():
+    """Test query_detections with mocked API response (no API key required)."""
+    # Load test image
+    image = Image.from_file(get_data("cafe.jpg"))
+
+    # Create model and mock the query method
+    model = QwenVlModel()
+    model.query = MagicMock(return_value=MOCK_QWEN_RESPONSE)
+
+    # Query for humans in the image
+    query = "humans"
+    detections = model.query_detections(image, query)
+
+    # Verify the return type
+    assert isinstance(detections, ImageDetections2D)
+
+    # Should have 5 detections based on our mock data
+    assert len(detections.detections) == 5, (
+        f"Expected 5 detections, got {len(detections.detections)}"
+    )
+
+    # Verify each detection
+    img_height, img_width = image.shape[:2]
+
+    for i, detection in enumerate(detections.detections):
+        # Verify attributes
+        assert detection.name == "humans"
+        assert detection.confidence == 1.0
+        assert detection.class_id == -100
+        assert detection.track_id == i
+        assert len(detection.bbox) == 4
+
+        # Verify bbox coordinates are valid and clamped
+        x1, y1, x2, y2 = detection.bbox
+        assert x2 > x1, f"Detection {i}: Invalid x coordinates: x1={x1}, x2={x2}"
+        assert y2 > y1, f"Detection {i}: Invalid y coordinates: y1={y1}, y2={y2}"
+
+        # Check bounds
+        assert 0 <= x1 <= img_width, f"Detection {i}: x1={x1} out of bounds"
+        assert 0 <= x2 <= img_width, f"Detection {i}: x2={x2} out of bounds"
+        assert 0 <= y1 <= img_height, f"Detection {i}: y1={y1} out of bounds"
+        assert 0 <= y2 <= img_height, f"Detection {i}: y2={y2} out of bounds"
+
+        # Verify clamping worked (the 3rd detection has y2=748 which exceeds image height of 771)
+        if i == 2:  # Third detection
+            assert y2 <= img_height, f"Detection {i}: y2={y2} should be clamped to {img_height}"
+
+    print(f"✓ Successfully processed {len(detections.detections)} mocked detections")
+
+
+@pytest.mark.tool
+@pytest.mark.skipif(not os.getenv("ALIBABA_API_KEY"), reason="ALIBABA_API_KEY not set")
+def test_query_detections_real():
+    """Test query_detections with real API calls (requires API key)."""
+    # Load test image
+    image = Image.from_file(get_data("cafe.jpg"))
+
+    # Initialize the model (will use real API)
+    model = QwenVlModel()
+
+    # Query for humans in the image
+    query = "humans"
+    detections = model.query_detections(image, query)
+
+    assert isinstance(detections, ImageDetections2D)
+    print(detections)
+
+    # Check that detections were found
+    if detections.detections:
+        for detection in detections.detections:
+            # Verify each detection has expected attributes
+            assert detection.bbox is not None
+            assert len(detection.bbox) == 4
+            assert detection.name
+            assert detection.confidence == 1.0
+            assert detection.class_id == -100
+
+            # Verify bbox coordinates are valid
+            x1, y1, x2, y2 = detection.bbox
+            assert x2 > x1, f"Invalid x coordinates: x1={x1}, x2={x2}"
+            assert y2 > y1, f"Invalid y coordinates: y1={y1}, y2={y2}"
+
+            # Verify coordinates are within image bounds
+            img_height, img_width = image.shape[:2]
+            assert 0 <= x1 <= img_width
+            assert 0 <= x2 <= img_width
+            assert 0 <= y1 <= img_height
+            assert 0 <= y2 <= img_height
+
+    print(f"Found {len(detections.detections)} detections for query '{query}'")
diff --git a/dimos/utils/decorators/__init__.py b/dimos/utils/decorators/__init__.py
index 22ad478a00..ee17260c20 100644
--- a/dimos/utils/decorators/__init__.py
+++ b/dimos/utils/decorators/__init__.py
@@ -1,11 +1,12 @@
 """Decorators and accumulators for rate limiting and other utilities."""
 
 from .accumulators import Accumulator, LatestAccumulator, RollingAverageAccumulator
-from .decorators import limit
+from .decorators import limit, retry
 
 __all__ = [
     "Accumulator",
     "LatestAccumulator",
     "RollingAverageAccumulator",
     "limit",
+    "retry",
 ]
diff --git a/dimos/utils/decorators/decorators.py b/dimos/utils/decorators/decorators.py
index c54e3530e1..067251e5c6 100644
--- a/dimos/utils/decorators/decorators.py
+++ b/dimos/utils/decorators/decorators.py
@@ -15,7 +15,7 @@
 import threading
 import time
 from functools import wraps
-from typing import Callable, Optional
+from typing import Callable, Optional, Type
 
 from .accumulators import Accumulator, LatestAccumulator
 
@@ -143,3 +143,59 @@ def getter(self):
             return getattr(self, attr_name)
 
     return getter
+
+
+def retry(max_retries: int = 3, on_exception: Type[Exception] = Exception, delay: float = 0.0):
+    """
+    Decorator that retries a function call if it raises an exception.
+
+    Args:
+        max_retries: Maximum number of retry attempts (default: 3)
+        on_exception: Exception type to catch and retry on (default: Exception)
+        delay: Fixed delay in seconds between retries (default: 0.0)
+
+    Returns:
+        Decorated function that will retry on failure
+
+    Example:
+        @retry(max_retries=5, on_exception=ConnectionError, delay=0.5)
+        def connect_to_server():
+            # connection logic that might fail
+            pass
+
+        @retry()  # Use defaults: 3 retries on any Exception, no delay
+        def risky_operation():
+            # might fail occasionally
+            pass
+    """
+    if max_retries < 0:
+        raise ValueError("max_retries must be non-negative")
+    if delay < 0:
+        raise ValueError("delay must be non-negative")
+
+    def decorator(func: Callable) -> Callable:
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+
+            for attempt in range(max_retries + 1):
+                try:
+                    return func(*args, **kwargs)
+                except on_exception as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        # Still have retries left
+                        if delay > 0:
+                            time.sleep(delay)
+                        continue
+                    else:
+                        # Out of retries, re-raise the last exception
+                        raise
+
+            # This should never be reached, but just in case
+            if last_exception:
+                raise last_exception
+
+        return wrapper
+
+    return decorator
diff --git a/dimos/utils/decorators/test_decorators.py b/dimos/utils/decorators/test_decorators.py
index 2a9162c762..133fab97c2 100644
--- a/dimos/utils/decorators/test_decorators.py
+++ b/dimos/utils/decorators/test_decorators.py
@@ -16,7 +16,7 @@
 
 import pytest
 
-from dimos.utils.decorators import LatestAccumulator, RollingAverageAccumulator, limit
+from dimos.utils.decorators import LatestAccumulator, RollingAverageAccumulator, limit, retry
 
 
 def test_limit():
@@ -77,3 +77,186 @@ def process(value: float, label: str = ""):
 
     # Should see the average of accumulated values
     assert calls == [(10.0, "first"), (25.0, "third")]  # (20+30)/2 = 25
+
+
+def test_retry_success_after_failures():
+    """Test that retry decorator retries on failure and eventually succeeds."""
+    attempts = []
+
+    @retry(max_retries=3)
+    def flaky_function(fail_times=2):
+        attempts.append(len(attempts))
+        if len(attempts) <= fail_times:
+            raise ValueError(f"Attempt {len(attempts)} failed")
+        return "success"
+
+    result = flaky_function()
+    assert result == "success"
+    assert len(attempts) == 3  # Failed twice, succeeded on third attempt
+
+
+def test_retry_exhausted():
+    """Test that retry decorator raises exception when retries are exhausted."""
+    attempts = []
+
+    @retry(max_retries=2)
+    def always_fails():
+        attempts.append(len(attempts))
+        raise RuntimeError(f"Attempt {len(attempts)} failed")
+
+    with pytest.raises(RuntimeError) as exc_info:
+        always_fails()
+
+    assert "Attempt 3 failed" in str(exc_info.value)
+    assert len(attempts) == 3  # Initial attempt + 2 retries
+
+
+def test_retry_specific_exception():
+    """Test that retry only catches specified exception types."""
+    attempts = []
+
+    @retry(max_retries=3, on_exception=ValueError)
+    def raises_different_exceptions():
+        attempts.append(len(attempts))
+        if len(attempts) == 1:
+            raise ValueError("First attempt")
+        elif len(attempts) == 2:
+            raise TypeError("Second attempt - should not be retried")
+        return "success"
+
+    # Should fail on TypeError (not retried)
+    with pytest.raises(TypeError) as exc_info:
+        raises_different_exceptions()
+
+    assert "Second attempt" in str(exc_info.value)
+    assert len(attempts) == 2  # First attempt with ValueError, second with TypeError
+
+
+def test_retry_no_failures():
+    """Test that retry decorator works when function succeeds immediately."""
+    attempts = []
+
+    @retry(max_retries=5)
+    def always_succeeds():
+        attempts.append(len(attempts))
+        return "immediate success"
+
+    result = always_succeeds()
+    assert result == "immediate success"
+    assert len(attempts) == 1  # Only one attempt needed
+
+
+def test_retry_with_delay():
+    """Test that retry decorator applies delay between attempts."""
+    attempts = []
+    times = []
+
+    @retry(max_retries=2, delay=0.1)
+    def delayed_failures():
+        times.append(time.time())
+        attempts.append(len(attempts))
+        if len(attempts) < 2:
+            raise ValueError(f"Attempt {len(attempts)}")
+        return "success"
+
+    start = time.time()
+    result = delayed_failures()
+    duration = time.time() - start
+
+    assert result == "success"
+    assert len(attempts) == 2
+    assert duration >= 0.1  # At least one delay occurred
+
+    # Check that delays were applied
+    if len(times) >= 2:
+        assert times[1] - times[0] >= 0.1
+
+
+def test_retry_zero_retries():
+    """Test retry with max_retries=0 (no retries, just one attempt)."""
+    attempts = []
+
+    @retry(max_retries=0)
+    def single_attempt():
+        attempts.append(len(attempts))
+        raise ValueError("Failed")
+
+    with pytest.raises(ValueError):
+        single_attempt()
+
+    assert len(attempts) == 1  # Only the initial attempt
+
+
+def test_retry_invalid_parameters():
+    """Test that retry decorator validates parameters."""
+    with pytest.raises(ValueError):
+
+        @retry(max_retries=-1)
+        def invalid_retries():
+            pass
+
+    with pytest.raises(ValueError):
+
+        @retry(delay=-0.5)
+        def invalid_delay():
+            pass
+
+
+def test_retry_with_methods():
+    """Test that retry decorator works with class methods, instance methods, and static methods."""
+
+    class TestClass:
+        def __init__(self):
+            self.instance_attempts = []
+            self.instance_value = 42
+
+        @retry(max_retries=3)
+        def instance_method(self, fail_times=2):
+            """Test retry on instance method."""
+            self.instance_attempts.append(len(self.instance_attempts))
+            if len(self.instance_attempts) <= fail_times:
+                raise ValueError(f"Instance attempt {len(self.instance_attempts)} failed")
+            return f"instance success with value {self.instance_value}"
+
+        @classmethod
+        @retry(max_retries=2)
+        def class_method(cls, attempts_list, fail_times=1):
+            """Test retry on class method."""
+            attempts_list.append(len(attempts_list))
+            if len(attempts_list) <= fail_times:
+                raise ValueError(f"Class attempt {len(attempts_list)} failed")
+            return f"class success from {cls.__name__}"
+
+        @staticmethod
+        @retry(max_retries=2)
+        def static_method(attempts_list, fail_times=1):
+            """Test retry on static method."""
+            attempts_list.append(len(attempts_list))
+            if len(attempts_list) <= fail_times:
+                raise ValueError(f"Static attempt {len(attempts_list)} failed")
+            return "static success"
+
+    # Test instance method
+    obj = TestClass()
+    result = obj.instance_method()
+    assert result == "instance success with value 42"
+    assert len(obj.instance_attempts) == 3  # Failed twice, succeeded on third
+
+    # Test class method
+    class_attempts = []
+    result = TestClass.class_method(class_attempts)
+    assert result == "class success from TestClass"
+    assert len(class_attempts) == 2  # Failed once, succeeded on second
+
+    # Test static method
+    static_attempts = []
+    result = TestClass.static_method(static_attempts)
+    assert result == "static success"
+    assert len(static_attempts) == 2  # Failed once, succeeded on second
+
+    # Test that self is properly maintained across retries
+    obj2 = TestClass()
+    obj2.instance_value = 100
+    result = obj2.instance_method()
+    assert result == "instance success with value 100"
+    assert len(obj2.instance_attempts) == 3

From fd7e2684a053d89156f4b99745fea6afa29438e0 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Fri, 10 Oct 2025 17:24:15 -0700
Subject: [PATCH 02/47] circular import bugfix

---
 dimos/perception/detection2d/module2D.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection2d/module2D.py
index d11875315f..90c8cbbd37 100644
--- a/dimos/perception/detection2d/module2D.py
+++ b/dimos/perception/detection2d/module2D.py
@@ -23,7 +23,6 @@
 from reactivex.subject import Subject
 
 from dimos.core import In, Module, Out, rpc
-from dimos.models.vl import QwenVlModel, VlModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.sensor_msgs.Image import sharpness_barrier
 from dimos.msgs.vision_msgs import Detection2DArray
@@ -40,7 +39,6 @@
 class Config:
     max_freq: float = 5  # hz
     detector: Optional[Callable[[Any], Detector]] = lambda: Yolo2DDetector()
-    vlmodel: VlModel = QwenVlModel
 
 
 class Detection2DModule(Module):
@@ -60,7 +58,6 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.config: Config = Config(**kwargs)
         self.detector = self.config.detector()
-        self.vlmodel = self.config.vlmodel()
         self.vlm_detections_subject = Subject()
 
     def process_image_frame(self, image: Image) -> ImageDetections2D:

From ace0725391876bdf2c7311357db7e76acf0eab98 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 10:43:51 -0700
Subject: [PATCH 03/47] better universal json and detection parsing for vlms

---
 dimos/models/vl/base.py                       | 89 ++++++++-----------
 dimos/models/vl/test_base.py                  | 12 +--
 .../detection2d/type/detection2d.py           | 27 ++++++
 3 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/dimos/models/vl/base.py b/dimos/models/vl/base.py
index 522d38ec46..dcca216479 100644
--- a/dimos/models/vl/base.py
+++ b/dimos/models/vl/base.py
@@ -3,10 +3,9 @@
 from abc import ABC, abstractmethod
 from typing import Union
 
-import numpy as np
-
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection2d.type import Detection2DBBox, ImageDetections2D
+from dimos.perception.detection2d.type.detection2d import Detection
 from dimos.utils.decorators import retry
 
 
@@ -38,10 +37,10 @@ def extract_json(response: str) -> Union[dict, list]:
 
     # Pattern to match JSON arrays (including nested arrays/objects)
     # This finds the outermost [...] structure
-    array_pattern = r'\[(?:[^\[\]]*|\[(?:[^\[\]]*|\[[^\[\]]*\])*\])*\]'
+    array_pattern = r"\[(?:[^\[\]]*|\[(?:[^\[\]]*|\[[^\[\]]*\])*\])*\]"
 
     # Pattern to match JSON objects
-    object_pattern = r'\{(?:[^{}]*|\{(?:[^{}]*|\{[^{}]*\})*\})*\}'
+    object_pattern = r"\{(?:[^{}]*|\{(?:[^{}]*|\{[^{}]*\})*\})*\}"
 
     # Try to find JSON arrays first (most common for detections)
     matches = re.findall(array_pattern, response, re.DOTALL)
@@ -64,15 +63,38 @@ def extract_json(response: str) -> Union[dict, list]:
 
     # If nothing worked, raise an error with the original response
     raise json.JSONDecodeError(
-        f"Could not extract valid JSON from response: {response[:200]}...",
-        response, 0
+        f"Could not extract valid JSON from response: {response[:200]}...", response, 0
     )
 
 
+def vlm_detection_to_yolo(vlm_detection: list, track_id: int) -> Detection | None:
+    """Convert a single VLM detection [label, x1, y1, x2, y2] to Detection tuple.
+
+    Args:
+        vlm_detection: Single detection list containing [label, x1, y1, x2, y2]
+        track_id: Track ID to assign to this detection
+
+    Returns:
+        Detection tuple (bbox, track_id, class_id, confidence, name) or None if invalid
+    """
+    if len(vlm_detection) != 5:
+        return None
+
+    name = str(vlm_detection[0])
+    try:
+        bbox = tuple(map(float, vlm_detection[1:]))
+        # Use -1 for class_id since VLM doesn't provide it
+        # confidence defaults to 1.0 for VLM
+        return (bbox, track_id, -1, 1.0, name)
+    except (ValueError, TypeError):
+        return None
+
+
 class VlModel(ABC):
     @abstractmethod
-    def query(self, image: Image | np.ndarray, query: str) -> str: ...
+    def query(self, image: Image, query: str) -> str: ...
 
+    # requery once if JSON parsing fails
     @retry(max_retries=2, on_exception=json.JSONDecodeError, delay=0.0)
     def query_json(self, image: Image, query: str) -> dict:
         response = self.query(image, query)
@@ -93,55 +115,18 @@ def query_detections(self, image: Image, query: str) -> ImageDetections2D:
         Only respond with the coordinates, no other text."""
 
         image_detections = ImageDetections2D(image)
+
         try:
-            coords = self.query_json(image, full_query)
+            detection_tuples = self.query_json(image, full_query)
         except Exception:
             return image_detections
 
-        img_height, img_width = image.shape[:2] if image.shape else (float("inf"), float("inf"))
-
-        for track_id, detection_list in enumerate(coords):
-            if len(detection_list) != 5:
-                continue
-
-            name = detection_list[0]
-
-            # Convert to floats with error handling
-            try:
-                bbox = list(map(float, detection_list[1:]))
-            except (ValueError, TypeError):
-                print(
-                    f"Warning: Invalid bbox coordinates for detection '{name}': {detection_list[1:]}"
-                )
-                continue
-
-            # Validate bounding box
-            x1, y1, x2, y2 = bbox
-
-            # Check if coordinates are valid
-            if x2 <= x1 or y2 <= y1:
-                print(
-                    f"Warning: Invalid bbox dimensions for '{name}': x1={x1}, y1={y1}, x2={x2}, y2={y2}"
-                )
+        for track_id, detection_tuple in enumerate(detection_tuples):
+            detection = vlm_detection_to_yolo(detection_tuple, track_id)
+            if detection is None:
                 continue
+            detection2d = Detection2DBBox.from_detection(detection, ts=image.ts, image=image)
+            if detection2d.is_valid():
+                image_detections.detections.append(detection2d)
 
-            # Clamp to image bounds if we have image dimensions
-            if image.shape:
-                x1 = max(0, min(x1, img_width))
-                y1 = max(0, min(y1, img_height))
-                x2 = max(0, min(x2, img_width))
-                y2 = max(0, min(y2, img_height))
-                bbox = [x1, y1, x2, y2]
-
-            image_detections.detections.append(
-                Detection2DBBox(
-                    bbox=bbox,
-                    track_id=track_id,
-                    class_id=-100,  # Using -100 to indicate VLModel-generated detection
-                    confidence=1.0,
-                    name=name,
-                    ts=image.ts,
-                    image=image,
-                )
-            )
         return image_detections
diff --git a/dimos/models/vl/test_base.py b/dimos/models/vl/test_base.py
index bed210a283..35110bd1cd 100644
--- a/dimos/models/vl/test_base.py
+++ b/dimos/models/vl/test_base.py
@@ -140,25 +140,21 @@ def test_query_detections_mocked():
         # Verify attributes
         assert detection.name == "humans"
         assert detection.confidence == 1.0
-        assert detection.class_id == -100
+        assert detection.class_id == -1  # VLM detections use -1 for class_id
         assert detection.track_id == i
         assert len(detection.bbox) == 4
 
-        # Verify bbox coordinates are valid and clamped
+        # Verify bbox coordinates are valid (out-of-bounds detections are discarded)
         x1, y1, x2, y2 = detection.bbox
         assert x2 > x1, f"Detection {i}: Invalid x coordinates: x1={x1}, x2={x2}"
         assert y2 > y1, f"Detection {i}: Invalid y coordinates: y1={y1}, y2={y2}"
 
-        # Check bounds
+        # Check bounds (out-of-bounds detections would have been discarded)
         assert 0 <= x1 <= img_width, f"Detection {i}: x1={x1} out of bounds"
         assert 0 <= x2 <= img_width, f"Detection {i}: x2={x2} out of bounds"
         assert 0 <= y1 <= img_height, f"Detection {i}: y1={y1} out of bounds"
         assert 0 <= y2 <= img_height, f"Detection {i}: y2={y2} out of bounds"
 
-        # Verify clamping worked (the 3rd detection has y2=748 which exceeds image height of 771)
-        if i == 2:  # Third detection
-            assert y2 <= img_height, f"Detection {i}: y2={y2} should be clamped to {img_height}"
-
     print(f"✓ Successfully processed {len(detections.detections)} mocked detections")
 
 
@@ -187,7 +183,7 @@ def test_query_detections_real():
             assert len(detection.bbox) == 4
             assert detection.name
             assert detection.confidence == 1.0
-            assert detection.class_id == -100
+            assert detection.class_id == -1  # VLM detections use -1 for class_id
 
             # Verify bbox coordinates are valid
             x1, y1, x2, y2 = detection.bbox
diff --git a/dimos/perception/detection2d/type/detection2d.py b/dimos/perception/detection2d/type/detection2d.py
index 48e1a5191d..53a449659d 100644
--- a/dimos/perception/detection2d/type/detection2d.py
+++ b/dimos/perception/detection2d/type/detection2d.py
@@ -19,6 +19,8 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple
 
+from dimos.utils.decorators.decorators import simple_mcache
+
 from dimos_lcm.foxglove_msgs.ImageAnnotations import (
     PointsAnnotation,
     TextAnnotation,
@@ -168,6 +170,31 @@ def bbox_2d_volume(self) -> float:
         height = max(0.0, y2 - y1)
         return width * height
 
+    @simple_mcache
+    def is_valid(self) -> bool:
+        """Check if detection bbox is valid.
+
+        Validates that:
+        - Bounding box has positive dimensions
+        - Bounding box is within image bounds (if image has shape)
+
+        Returns:
+            True if bbox is valid, False otherwise
+        """
+        x1, y1, x2, y2 = self.bbox
+
+        # Check positive dimensions
+        if x2 <= x1 or y2 <= y1:
+            return False
+
+        # Check if within image bounds (if image has shape)
+        if self.image.shape:
+            h, w = self.image.shape[:2]
+            if not (0 <= x1 <= w and 0 <= y1 <= h and 0 <= x2 <= w and 0 <= y2 <= h):
+                return False
+
+        return True
+
     @classmethod
     def from_detector(
         cls, raw_detections: InconvinientDetectionFormat, **kwargs

From 8546465187a687e5791b35a5ef64038c497c6619 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 14:54:30 -0700
Subject: [PATCH 04/47] renamed detections2d to detections

---
 dimos/models/vl/base.py                       |  65 +--------
 dimos/models/vl/test_base.py                  | 113 ++--------------
 .../detection/.claude/settings.local.json     |   9 ++
 dimos/perception/detection/__init__.py        |   7 +
 .../{detection2d => detection}/conftest.py    |   8 +-
 .../detection/detectors/__init__.py           |   3 +
 .../detectors/config/custom_tracker.yaml      |   0
 .../detectors/detic.py                        |   2 +-
 .../person/.claude/settings.local.json        |  10 ++
 .../detectors/person/test_annotations.py      |   2 +-
 .../person/test_detection2d_conformance.py    |   4 +-
 .../person/test_imagedetections2d.py          |   4 +-
 .../detectors/person/test_yolo.py             |   4 +-
 .../detectors/person/yolo.py                  |   6 +-
 .../detectors/types.py                        |   2 +-
 .../detectors/yolo.py                         |   4 +-
 .../{detection2d => detection}/module2D.py    |  15 +--
 .../{detection2d => detection}/module3D.py    |  19 ++-
 .../{detection2d => detection}/moduleDB.py    |   4 +-
 .../test_moduleDB.py                          |   2 +-
 .../type/.claude/settings.local.json          |  10 ++
 dimos/perception/detection/type/__init__.py   |  16 +++
 .../type/detection2d.py                       |   4 +-
 .../type/detection3d.py                       |   4 +-
 .../type/detection3dpc.py                     |   6 +-
 .../type/imageDetections.py                   |   2 +-
 .../{detection2d => detection}/type/person.py |   2 +-
 .../type/test_detection2d.py                  |   0
 .../type/test_detection3d.py                  |   2 +-
 .../type/test_detection3dpc.py                |   0
 .../type/test_object3d.py                     |   8 +-
 dimos/perception/detection2d/__init__.py      |   8 --
 .../detection2d/detectors/__init__.py         |   3 -
 dimos/perception/detection2d/type/__init__.py |  16 ---
 .../unitree_b1/test_connection.py             |   7 +-
 dimos/utils/llm_utils.py                      |  75 +++++++++++
 dimos/utils/test_llm_utils.py                 | 123 ++++++++++++++++++
 37 files changed, 326 insertions(+), 243 deletions(-)
 create mode 100644 dimos/perception/detection/.claude/settings.local.json
 create mode 100644 dimos/perception/detection/__init__.py
 rename dimos/perception/{detection2d => detection}/conftest.py (96%)
 create mode 100644 dimos/perception/detection/detectors/__init__.py
 rename dimos/perception/{detection2d => detection}/detectors/config/custom_tracker.yaml (100%)
 rename dimos/perception/{detection2d => detection}/detectors/detic.py (99%)
 create mode 100644 dimos/perception/detection/detectors/person/.claude/settings.local.json
 rename dimos/perception/{detection2d => detection}/detectors/person/test_annotations.py (96%)
 rename dimos/perception/{detection2d => detection}/detectors/person/test_detection2d_conformance.py (95%)
 rename dimos/perception/{detection2d => detection}/detectors/person/test_imagedetections2d.py (93%)
 rename dimos/perception/{detection2d => detection}/detectors/person/test_yolo.py (96%)
 rename dimos/perception/{detection2d => detection}/detectors/person/yolo.py (96%)
 rename dimos/perception/{detection2d => detection}/detectors/types.py (94%)
 rename dimos/perception/{detection2d => detection}/detectors/yolo.py (97%)
 rename dimos/perception/{detection2d => detection}/module2D.py (85%)
 rename dimos/perception/{detection2d => detection}/module3D.py (82%)
 rename dimos/perception/{detection2d => detection}/moduleDB.py (98%)
 rename dimos/perception/{detection2d => detection}/test_moduleDB.py (97%)
 create mode 100644 dimos/perception/detection/type/.claude/settings.local.json
 create mode 100644 dimos/perception/detection/type/__init__.py
 rename dimos/perception/{detection2d => detection}/type/detection2d.py (98%)
 rename dimos/perception/{detection2d => detection}/type/detection3d.py (97%)
 rename dimos/perception/{detection2d => detection}/type/detection3dpc.py (97%)
 rename dimos/perception/{detection2d => detection}/type/imageDetections.py (98%)
 rename dimos/perception/{detection2d => detection}/type/person.py (99%)
 rename dimos/perception/{detection2d => detection}/type/test_detection2d.py (100%)
 rename dimos/perception/{detection2d => detection}/type/test_detection3d.py (94%)
 rename dimos/perception/{detection2d => detection}/type/test_detection3dpc.py (100%)
 rename dimos/perception/{detection2d => detection}/type/test_object3d.py (95%)
 delete mode 100644 dimos/perception/detection2d/__init__.py
 delete mode 100644 dimos/perception/detection2d/detectors/__init__.py
 delete mode 100644 dimos/perception/detection2d/type/__init__.py
 create mode 100644 dimos/utils/llm_utils.py
 create mode 100644 dimos/utils/test_llm_utils.py

diff --git a/dimos/models/vl/base.py b/dimos/models/vl/base.py
index dcca216479..a46611b206 100644
--- a/dimos/models/vl/base.py
+++ b/dimos/models/vl/base.py
@@ -1,70 +1,11 @@
 import json
-import re
 from abc import ABC, abstractmethod
-from typing import Union
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.type import Detection2DBBox, ImageDetections2D
-from dimos.perception.detection2d.type.detection2d import Detection
+from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
+from dimos.perception.detection.type.detection2d import Detection
 from dimos.utils.decorators import retry
-
-
-def extract_json(response: str) -> Union[dict, list]:
-    """Extract JSON from potentially messy LLM response.
-
-    Tries multiple strategies:
-    1. Parse the entire response as JSON
-    2. Find and parse JSON arrays in the response
-    3. Find and parse JSON objects in the response
-
-    Args:
-        response: Raw text response that may contain JSON
-
-    Returns:
-        Parsed JSON object (dict or list)
-
-    Raises:
-        json.JSONDecodeError: If no valid JSON can be extracted
-    """
-    # First try to parse the whole response as JSON
-    try:
-        return json.loads(response)
-    except json.JSONDecodeError:
-        pass
-
-    # If that fails, try to extract JSON from the messy response
-    # Look for JSON arrays or objects in the text
-
-    # Pattern to match JSON arrays (including nested arrays/objects)
-    # This finds the outermost [...] structure
-    array_pattern = r"\[(?:[^\[\]]*|\[(?:[^\[\]]*|\[[^\[\]]*\])*\])*\]"
-
-    # Pattern to match JSON objects
-    object_pattern = r"\{(?:[^{}]*|\{(?:[^{}]*|\{[^{}]*\})*\})*\}"
-
-    # Try to find JSON arrays first (most common for detections)
-    matches = re.findall(array_pattern, response, re.DOTALL)
-    for match in matches:
-        try:
-            parsed = json.loads(match)
-            # For detection arrays, we expect a list
-            if isinstance(parsed, list):
-                return parsed
-        except json.JSONDecodeError:
-            continue
-
-    # Try JSON objects if no arrays found
-    matches = re.findall(object_pattern, response, re.DOTALL)
-    for match in matches:
-        try:
-            return json.loads(match)
-        except json.JSONDecodeError:
-            continue
-
-    # If nothing worked, raise an error with the original response
-    raise json.JSONDecodeError(
-        f"Could not extract valid JSON from response: {response[:200]}...", response, 0
-    )
+from dimos.utils.llm_utils import extract_json
 
 
 def vlm_detection_to_yolo(vlm_detection: list, track_id: int) -> Detection | None:
diff --git a/dimos/models/vl/test_base.py b/dimos/models/vl/test_base.py
index 35110bd1cd..302a588721 100644
--- a/dimos/models/vl/test_base.py
+++ b/dimos/models/vl/test_base.py
@@ -1,18 +1,17 @@
-import json
 import os
 from unittest.mock import MagicMock
 
 import pytest
 
-from dimos.models.vl.base import extract_json
 from dimos.models.vl.qwen import QwenVlModel
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.type import ImageDetections2D
+from dimos.perception.detection.type import ImageDetections2D
 from dimos.utils.data import get_data
 
 # Captured actual response from Qwen API for cafe.jpg with query "humans"
+# Added garbage around JSON to ensure we are robustly extracting it
 MOCK_QWEN_RESPONSE = """
-   Here you go bro:
+   Locating humans for you 😊😊
 
    [
     ["humans", 76, 368, 219, 580],
@@ -22,94 +21,9 @@
     ["humans", 785, 323, 960, 650]
    ]
 
-   Hope this helps!😀😊 :)"""
-
-
-def test_extract_json_clean_response():
-    """Test extract_json with clean JSON response."""
-    clean_json = '[["object", 1, 2, 3, 4]]'
-    result = extract_json(clean_json)
-    assert result == [["object", 1, 2, 3, 4]]
-
-
-def test_extract_json_with_text_before_after():
-    """Test extract_json with text before and after JSON."""
-    messy = """Here's what I found:
-    [
-        ["person", 10, 20, 30, 40],
-        ["car", 50, 60, 70, 80]
-    ]
-    Hope this helps!"""
-    result = extract_json(messy)
-    assert result == [["person", 10, 20, 30, 40], ["car", 50, 60, 70, 80]]
-
-
-def test_extract_json_with_emojis():
-    """Test extract_json with emojis and markdown code blocks."""
-    messy = """Sure! 😊 Here are the detections:
-
-    ```json
-    [["human", 100, 200, 300, 400]]
-    ```
-
-    Let me know if you need anything else! 👍"""
-    result = extract_json(messy)
-    assert result == [["human", 100, 200, 300, 400]]
-
-
-def test_extract_json_multiple_json_blocks():
-    """Test extract_json when there are multiple JSON blocks."""
-    messy = """First attempt (wrong format):
-    {"error": "not what we want"}
-
-    Correct format:
-    [
-        ["cat", 10, 10, 50, 50],
-        ["dog", 60, 60, 100, 100]
-    ]
-
-    Another block: {"also": "not needed"}"""
-    result = extract_json(messy)
-    # Should return the first valid array
-    assert result == [["cat", 10, 10, 50, 50], ["dog", 60, 60, 100, 100]]
-
-
-def test_extract_json_object():
-    """Test extract_json with JSON object instead of array."""
-    response = 'The result is: {"status": "success", "count": 5}'
-    result = extract_json(response)
-    assert result == {"status": "success", "count": 5}
-
-
-def test_extract_json_nested_structures():
-    """Test extract_json with nested arrays and objects."""
-    response = """Processing complete:
-    [
-        ["label1", 1, 2, 3, 4],
-        {"nested": {"value": 10}},
-        ["label2", 5, 6, 7, 8]
-    ]"""
-    result = extract_json(response)
-    assert result[0] == ["label1", 1, 2, 3, 4]
-    assert result[1] == {"nested": {"value": 10}}
-    assert result[2] == ["label2", 5, 6, 7, 8]
-
-
-def test_extract_json_invalid():
-    """Test extract_json raises error when no valid JSON found."""
-    response = "This response has no valid JSON at all!"
-    with pytest.raises(json.JSONDecodeError) as exc_info:
-        extract_json(response)
-    assert "Could not extract valid JSON" in str(exc_info.value)
-
-
-def test_extract_json_with_real_llm_response():
-    """Test extract_json with the actual messy response."""
-    result = extract_json(MOCK_QWEN_RESPONSE)
-    assert isinstance(result, list)
-    assert len(result) == 5
-    assert result[0] == ["humans", 76, 368, 219, 580]
-    assert result[-1] == ["humans", 785, 323, 960, 650]
+   Here is some trash at the end of the response :)
+   Let me know if you need anything else 😀😊
+   """
 
 
 def test_query_detections_mocked():
@@ -144,6 +58,8 @@ def test_query_detections_mocked():
         assert detection.track_id == i
         assert len(detection.bbox) == 4
 
+        assert detection.is_valid()
+
         # Verify bbox coordinates are valid (out-of-bounds detections are discarded)
         x1, y1, x2, y2 = detection.bbox
         assert x2 > x1, f"Detection {i}: Invalid x coordinates: x1={x1}, x2={x2}"
@@ -184,17 +100,6 @@ def test_query_detections_real():
             assert detection.name
             assert detection.confidence == 1.0
             assert detection.class_id == -1  # VLM detections use -1 for class_id
-
-            # Verify bbox coordinates are valid
-            x1, y1, x2, y2 = detection.bbox
-            assert x2 > x1, f"Invalid x coordinates: x1={x1}, x2={x2}"
-            assert y2 > y1, f"Invalid y coordinates: y1={y1}, y2={y2}"
-
-            # Verify coordinates are within image bounds
-            img_height, img_width = image.shape[:2]
-            assert 0 <= x1 <= img_width
-            assert 0 <= x2 <= img_width
-            assert 0 <= y1 <= img_height
-            assert 0 <= y2 <= img_height
+            assert detection.is_valid()
 
     print(f"Found {len(detections.detections)} detections for query '{query}'")
diff --git a/dimos/perception/detection/.claude/settings.local.json b/dimos/perception/detection/.claude/settings.local.json
new file mode 100644
index 0000000000..060f1e47cd
--- /dev/null
+++ b/dimos/perception/detection/.claude/settings.local.json
@@ -0,0 +1,9 @@
+{
+  "permissions": {
+    "allow": [
+      "Read(//home/lesh/coding/dimensional/dimos/dimos/**)"
+    ],
+    "deny": [],
+    "ask": []
+  }
+}
diff --git a/dimos/perception/detection/__init__.py b/dimos/perception/detection/__init__.py
new file mode 100644
index 0000000000..72663a69b0
--- /dev/null
+++ b/dimos/perception/detection/__init__.py
@@ -0,0 +1,7 @@
+from dimos.perception.detection.detectors import *
+from dimos.perception.detection.module2D import (
+    Detection2DModule,
+)
+from dimos.perception.detection.module3D import (
+    Detection3DModule,
+)
diff --git a/dimos/perception/detection2d/conftest.py b/dimos/perception/detection/conftest.py
similarity index 96%
rename from dimos/perception/detection2d/conftest.py
rename to dimos/perception/detection/conftest.py
index 8ada4ec356..1f3bd55486 100644
--- a/dimos/perception/detection2d/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -23,10 +23,10 @@
 from dimos.msgs.geometry_msgs import Transform
 from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection2d.module2D import Detection2DModule
-from dimos.perception.detection2d.module3D import Detection3DModule
-from dimos.perception.detection2d.moduleDB import ObjectDBModule
-from dimos.perception.detection2d.type import (
+from dimos.perception.detection.module2D import Detection2DModule
+from dimos.perception.detection.module3D import Detection3DModule
+from dimos.perception.detection.moduleDB import ObjectDBModule
+from dimos.perception.detection.type import (
     Detection2D,
     Detection3D,
     Detection3DPC,
diff --git a/dimos/perception/detection/detectors/__init__.py b/dimos/perception/detection/detectors/__init__.py
new file mode 100644
index 0000000000..d6383d084e
--- /dev/null
+++ b/dimos/perception/detection/detectors/__init__.py
@@ -0,0 +1,3 @@
+# from dimos.perception.detection.detectors.detic import Detic2DDetector
+from dimos.perception.detection.detectors.types import Detector
+from dimos.perception.detection.detectors.yolo import Yolo2DDetector
diff --git a/dimos/perception/detection2d/detectors/config/custom_tracker.yaml b/dimos/perception/detection/detectors/config/custom_tracker.yaml
similarity index 100%
rename from dimos/perception/detection2d/detectors/config/custom_tracker.yaml
rename to dimos/perception/detection/detectors/config/custom_tracker.yaml
diff --git a/dimos/perception/detection2d/detectors/detic.py b/dimos/perception/detection/detectors/detic.py
similarity index 99%
rename from dimos/perception/detection2d/detectors/detic.py
rename to dimos/perception/detection/detectors/detic.py
index 0b7b63276f..57a459f750 100644
--- a/dimos/perception/detection2d/detectors/detic.py
+++ b/dimos/perception/detection/detectors/detic.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.detectors.types import Detector
+from dimos.perception.detection.detectors.types import Detector
 from dimos.perception.detection2d.utils import plot_results
 
 # Add Detic to Python path
diff --git a/dimos/perception/detection/detectors/person/.claude/settings.local.json b/dimos/perception/detection/detectors/person/.claude/settings.local.json
new file mode 100644
index 0000000000..69334f84de
--- /dev/null
+++ b/dimos/perception/detection/detectors/person/.claude/settings.local.json
@@ -0,0 +1,10 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(pytest:*)",
+      "Bash(python3:*)"
+    ],
+    "deny": [],
+    "ask": []
+  }
+}
diff --git a/dimos/perception/detection2d/detectors/person/test_annotations.py b/dimos/perception/detection/detectors/person/test_annotations.py
similarity index 96%
rename from dimos/perception/detection2d/detectors/person/test_annotations.py
rename to dimos/perception/detection/detectors/person/test_annotations.py
index c686c33bd9..a5c238029c 100644
--- a/dimos/perception/detection2d/detectors/person/test_annotations.py
+++ b/dimos/perception/detection/detectors/person/test_annotations.py
@@ -17,7 +17,7 @@
 import sys
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.detectors.person.yolo import YoloPersonDetector
+from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
 from dimos.utils.data import get_data
 
 
diff --git a/dimos/perception/detection2d/detectors/person/test_detection2d_conformance.py b/dimos/perception/detection/detectors/person/test_detection2d_conformance.py
similarity index 95%
rename from dimos/perception/detection2d/detectors/person/test_detection2d_conformance.py
rename to dimos/perception/detection/detectors/person/test_detection2d_conformance.py
index f7c7cc088c..b8fb92182e 100644
--- a/dimos/perception/detection2d/detectors/person/test_detection2d_conformance.py
+++ b/dimos/perception/detection/detectors/person/test_detection2d_conformance.py
@@ -14,8 +14,8 @@
 
 import pytest
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.detectors.person.yolo import YoloPersonDetector
-from dimos.perception.detection2d.type.person import Person
+from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
+from dimos.perception.detection.type.person import Person
 from dimos.utils.data import get_data
 
 
diff --git a/dimos/perception/detection2d/detectors/person/test_imagedetections2d.py b/dimos/perception/detection/detectors/person/test_imagedetections2d.py
similarity index 93%
rename from dimos/perception/detection2d/detectors/person/test_imagedetections2d.py
rename to dimos/perception/detection/detectors/person/test_imagedetections2d.py
index 89fd770aa6..5f8eac584f 100644
--- a/dimos/perception/detection2d/detectors/person/test_imagedetections2d.py
+++ b/dimos/perception/detection/detectors/person/test_imagedetections2d.py
@@ -15,8 +15,8 @@
 """Test ImageDetections2D with pose detections."""
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.detectors.person.yolo import YoloPersonDetector
-from dimos.perception.detection2d.type import ImageDetections2D
+from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
+from dimos.perception.detection.type import ImageDetections2D
 from dimos.utils.data import get_data
 
 
diff --git a/dimos/perception/detection2d/detectors/person/test_yolo.py b/dimos/perception/detection/detectors/person/test_yolo.py
similarity index 96%
rename from dimos/perception/detection2d/detectors/person/test_yolo.py
rename to dimos/perception/detection/detectors/person/test_yolo.py
index 454997ca27..b9a0d18566 100644
--- a/dimos/perception/detection2d/detectors/person/test_yolo.py
+++ b/dimos/perception/detection/detectors/person/test_yolo.py
@@ -15,8 +15,8 @@
 import pytest
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.detectors.person.yolo import YoloPersonDetector
-from dimos.perception.detection2d.type.person import Person
+from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
+from dimos.perception.detection.type.person import Person
 from dimos.utils.data import get_data
 
 
diff --git a/dimos/perception/detection2d/detectors/person/yolo.py b/dimos/perception/detection/detectors/person/yolo.py
similarity index 96%
rename from dimos/perception/detection2d/detectors/person/yolo.py
rename to dimos/perception/detection/detectors/person/yolo.py
index fb4fe4769e..506c63adc9 100644
--- a/dimos/perception/detection2d/detectors/person/yolo.py
+++ b/dimos/perception/detection/detectors/person/yolo.py
@@ -21,11 +21,11 @@
 from ultralytics.engine.results import Boxes, Keypoints, Results
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.detectors.types import Detector
+from dimos.perception.detection.detectors.types import Detector
 from dimos.utils.data import get_data
 from dimos.utils.logging_config import setup_logger
 
-logger = setup_logger("dimos.perception.detection2d.yolo.person")
+logger = setup_logger("dimos.perception.detection.yolo.person")
 
 
 # Type alias for YOLO person detection results
@@ -64,7 +64,7 @@
 
 Note: All tensor data is on GPU by default. Use .cpu() to move to CPU.
 """
-from dimos.perception.detection2d.type.person import Person
+from dimos.perception.detection.type.person import Person
 
 
 class YoloPersonDetector(Detector):
diff --git a/dimos/perception/detection2d/detectors/types.py b/dimos/perception/detection/detectors/types.py
similarity index 94%
rename from dimos/perception/detection2d/detectors/types.py
rename to dimos/perception/detection/detectors/types.py
index 639fc09247..6acbba601e 100644
--- a/dimos/perception/detection2d/detectors/types.py
+++ b/dimos/perception/detection/detectors/types.py
@@ -15,7 +15,7 @@
 from abc import ABC, abstractmethod
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.type import (
+from dimos.perception.detection.type import (
     InconvinientDetectionFormat,
 )
 
diff --git a/dimos/perception/detection2d/detectors/yolo.py b/dimos/perception/detection/detectors/yolo.py
similarity index 97%
rename from dimos/perception/detection2d/detectors/yolo.py
rename to dimos/perception/detection/detectors/yolo.py
index 2d8681f0ef..0f47ea246e 100644
--- a/dimos/perception/detection2d/detectors/yolo.py
+++ b/dimos/perception/detection/detectors/yolo.py
@@ -19,7 +19,7 @@
 from ultralytics import YOLO
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.detectors.types import Detector
+from dimos.perception.detection.detectors.types import Detector
 from dimos.perception.detection2d.utils import (
     extract_detection_results,
     filter_detections,
@@ -29,7 +29,7 @@
 from dimos.utils.gpu_utils import is_cuda_available
 from dimos.utils.logging_config import setup_logger
 
-logger = setup_logger("dimos.perception.detection2d.yolo_2d_det")
+logger = setup_logger("dimos.perception.detection.yolo_2d_det")
 
 
 class Yolo2DDetector(Detector):
diff --git a/dimos/perception/detection2d/module2D.py b/dimos/perception/detection/module2D.py
similarity index 85%
rename from dimos/perception/detection2d/module2D.py
rename to dimos/perception/detection/module2D.py
index 90c8cbbd37..eca73afa8e 100644
--- a/dimos/perception/detection2d/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -14,7 +14,6 @@
 from dataclasses import dataclass
 from typing import Any, Callable, Optional
 
-import numpy as np
 from dimos_lcm.foxglove_msgs.ImageAnnotations import (
     ImageAnnotations,
 )
@@ -26,9 +25,9 @@
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.sensor_msgs.Image import sharpness_barrier
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection2d.detectors import Detector, Yolo2DDetector
-from dimos.perception.detection2d.detectors.person.yolo import YoloPersonDetector
-from dimos.perception.detection2d.type import (
+from dimos.perception.detection.detectors import Detector, Yolo2DDetector
+from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
+from dimos.perception.detection.type import (
     ImageDetections2D,
 )
 from dimos.utils.decorators.decorators import simple_mcache
@@ -37,7 +36,7 @@
 
 @dataclass
 class Config:
-    max_freq: float = 5  # hz
+    max_freq: float = 10  # hz
     detector: Optional[Callable[[Any], Detector]] = lambda: Yolo2DDetector()
 
 
@@ -79,11 +78,7 @@ def sharp_image_stream(self) -> Observable[Image]:
 
     @simple_mcache
     def detection_stream_2d(self) -> Observable[ImageDetections2D]:
-        # return self.vlm_detections_subject
-        # Regular detection stream from the detector
-        regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame))
-        # Merge with VL model detections
-        return backpressure(regular_detections.pipe(ops.merge(self.vlm_detections_subject)))
+        return backpressure(self.sharp_image_stream().pipe(ops.map(self.process_image_frame)))
 
     @rpc
     def start(self):
diff --git a/dimos/perception/detection2d/module3D.py b/dimos/perception/detection/module3D.py
similarity index 82%
rename from dimos/perception/detection2d/module3D.py
rename to dimos/perception/detection/module3D.py
index 66475d85a5..a94c73046c 100644
--- a/dimos/perception/detection2d/module3D.py
+++ b/dimos/perception/detection/module3D.py
@@ -20,13 +20,14 @@
 from dimos.core import In, Out, rpc
 from dimos.msgs.geometry_msgs import Transform
 from dimos.msgs.sensor_msgs import Image, PointCloud2
-from dimos.perception.detection2d.module2D import Detection2DModule
-from dimos.perception.detection2d.type import (
+from dimos.perception.detection.module2D import Detection2DModule
+from dimos.perception.detection.type import (
+    Detection2D,
     ImageDetections2D,
     ImageDetections3D,
     ImageDetections3DPC,
 )
-from dimos.perception.detection2d.type.detection3dpc import Detection3DPC
+from dimos.perception.detection.type.detection3dpc import Detection3DPC
 from dimos.types.timestamped import align_timestamped
 from dimos.utils.reactive import backpressure
 
@@ -37,10 +38,17 @@ class Detection3DModule(Detection2DModule):
     image: In[Image] = None  # type: ignore
     pointcloud: In[PointCloud2] = None  # type: ignore
 
+    # just for visualization,
+    # emits latest pointclouds of detected objects in a frame
     detected_pointcloud_0: Out[PointCloud2] = None  # type: ignore
     detected_pointcloud_1: Out[PointCloud2] = None  # type: ignore
     detected_pointcloud_2: Out[PointCloud2] = None  # type: ignore
 
+    # just for visualization, emits latest top 3 detections in a frame
+    detected_image_0: Out[Image] = None  # type: ignore
+    detected_image_1: Out[Image] = None  # type: ignore
+    detected_image_2: Out[Image] = None  # type: ignore
+
     detection_3d_stream: Observable[ImageDetections3DPC] = None
 
     def __init__(self, camera_info: CameraInfo, *args, **kwargs):
@@ -69,6 +77,8 @@ def process_frame(
 
         return ImageDetections3D(detections.image, detection3d_list)
 
+    def process_detection(self, detections: ImageDetections2D) -> ImageDetections3DPC: ...
+
     @rpc
     def start(self):
         super().start()
@@ -78,6 +88,7 @@ def detection2d_to_3d(args):
             transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0)
             return self.process_frame(detections, pc, transform)
 
+        # does align message timestamps
         self.detection_stream_3d = align_timestamped(
             backpressure(self.detection_stream_2d()),
             self.pointcloud.observable(),
@@ -85,6 +96,8 @@ def detection2d_to_3d(args):
             buffer_size=20.0,
         ).pipe(ops.map(detection2d_to_3d))
 
+        # doesn't align message timestamps
+        #
         # self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
         #    ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
         # )
diff --git a/dimos/perception/detection2d/moduleDB.py b/dimos/perception/detection/moduleDB.py
similarity index 98%
rename from dimos/perception/detection2d/moduleDB.py
rename to dimos/perception/detection/moduleDB.py
index 456b1d8c87..56203b2f5c 100644
--- a/dimos/perception/detection2d/moduleDB.py
+++ b/dimos/perception/detection/moduleDB.py
@@ -25,8 +25,8 @@
 from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Vector3
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection2d.module3D import Detection3DModule
-from dimos.perception.detection2d.type import Detection3D, ImageDetections3D, TableStr
+from dimos.perception.detection.module3D import Detection3DModule
+from dimos.perception.detection.type import Detection3D, ImageDetections3D, TableStr
 from dimos.protocol.skill.skill import skill
 from dimos.protocol.skill.type import Output, Reducer, Stream
 from dimos.types.timestamped import to_datetime
diff --git a/dimos/perception/detection2d/test_moduleDB.py b/dimos/perception/detection/test_moduleDB.py
similarity index 97%
rename from dimos/perception/detection2d/test_moduleDB.py
rename to dimos/perception/detection/test_moduleDB.py
index a3a1b003fd..1ede53f172 100644
--- a/dimos/perception/detection2d/test_moduleDB.py
+++ b/dimos/perception/detection/test_moduleDB.py
@@ -21,7 +21,7 @@
 from dimos.msgs.geometry_msgs import PoseStamped
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection2d.moduleDB import ObjectDBModule
+from dimos.perception.detection.moduleDB import ObjectDBModule
 from dimos.protocol.service import lcmservice as lcm
 from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation
 from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
diff --git a/dimos/perception/detection/type/.claude/settings.local.json b/dimos/perception/detection/type/.claude/settings.local.json
new file mode 100644
index 0000000000..f3e68a36e6
--- /dev/null
+++ b/dimos/perception/detection/type/.claude/settings.local.json
@@ -0,0 +1,10 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(pytest:*)",
+      "Bash(grep:*)",
+      "Read(//home/lesh/coding/dimensional/dimos/dimos/perception/detection2d/**)"
+    ],
+    "deny": []
+  }
+}
diff --git a/dimos/perception/detection/type/__init__.py b/dimos/perception/detection/type/__init__.py
new file mode 100644
index 0000000000..54147da975
--- /dev/null
+++ b/dimos/perception/detection/type/__init__.py
@@ -0,0 +1,16 @@
+from dimos.perception.detection.type.detection2d import (
+    Detection2D,
+    Detection2DBBox,
+    ImageDetections2D,
+    InconvinientDetectionFormat,
+)
+from dimos.perception.detection.type.detection3d import (
+    Detection3D,
+    ImageDetections3D,
+)
+from dimos.perception.detection.type.detection3dpc import (
+    Detection3DPC,
+    ImageDetections3DPC,
+)
+from dimos.perception.detection.type.imageDetections import ImageDetections, TableStr
+from dimos.perception.detection.type.person import Person
diff --git a/dimos/perception/detection2d/type/detection2d.py b/dimos/perception/detection/type/detection2d.py
similarity index 98%
rename from dimos/perception/detection2d/type/detection2d.py
rename to dimos/perception/detection/type/detection2d.py
index 53a449659d..44dcf47153 100644
--- a/dimos/perception/detection2d/type/detection2d.py
+++ b/dimos/perception/detection/type/detection2d.py
@@ -43,11 +43,11 @@
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.std_msgs import Header
-from dimos.perception.detection2d.type.imageDetections import ImageDetections
+from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import Timestamped, to_ros_stamp, to_timestamp
 
 if TYPE_CHECKING:
-    from dimos.perception.detection2d.type.person import Person
+    from dimos.perception.detection.type.person import Person
 
 Bbox = Tuple[float, float, float, float]
 CenteredBbox = Tuple[float, float, float, float]
diff --git a/dimos/perception/detection2d/type/detection3d.py b/dimos/perception/detection/type/detection3d.py
similarity index 97%
rename from dimos/perception/detection2d/type/detection3d.py
rename to dimos/perception/detection/type/detection3d.py
index a203bb1a4b..5a0f09f570 100644
--- a/dimos/perception/detection2d/type/detection3d.py
+++ b/dimos/perception/detection/type/detection3d.py
@@ -28,8 +28,8 @@
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import PointCloud2
-from dimos.perception.detection2d.type.detection2d import Detection2D, Detection2DBBox
-from dimos.perception.detection2d.type.imageDetections import ImageDetections
+from dimos.perception.detection.type.detection2d import Detection2D, Detection2DBBox
+from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import to_ros_stamp
 
 
diff --git a/dimos/perception/detection2d/type/detection3dpc.py b/dimos/perception/detection/type/detection3dpc.py
similarity index 97%
rename from dimos/perception/detection2d/type/detection3dpc.py
rename to dimos/perception/detection/type/detection3dpc.py
index 44d242de9e..e7ca16c290 100644
--- a/dimos/perception/detection2d/type/detection3dpc.py
+++ b/dimos/perception/detection/type/detection3dpc.py
@@ -28,9 +28,9 @@
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import PointCloud2
-from dimos.perception.detection2d.type.detection2d import Detection2D
-from dimos.perception.detection2d.type.detection3d import Detection3D
-from dimos.perception.detection2d.type.imageDetections import ImageDetections
+from dimos.perception.detection.type.detection2d import Detection2D
+from dimos.perception.detection.type.detection3d import Detection3D
+from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import to_ros_stamp
 
 Detection3DPCFilter = Callable[
diff --git a/dimos/perception/detection2d/type/imageDetections.py b/dimos/perception/detection/type/imageDetections.py
similarity index 98%
rename from dimos/perception/detection2d/type/imageDetections.py
rename to dimos/perception/detection/type/imageDetections.py
index edd8449f06..c09d7cb052 100644
--- a/dimos/perception/detection2d/type/imageDetections.py
+++ b/dimos/perception/detection/type/imageDetections.py
@@ -28,7 +28,7 @@
 from dimos.types.timestamped import to_timestamp
 
 if TYPE_CHECKING:
-    from dimos.perception.detection2d.type.detection2d import Detection2D
+    from dimos.perception.detection.type.detection2d import Detection2D
 
 T = TypeVar("T", bound="Detection2D")
 
diff --git a/dimos/perception/detection2d/type/person.py b/dimos/perception/detection/type/person.py
similarity index 99%
rename from dimos/perception/detection2d/type/person.py
rename to dimos/perception/detection/type/person.py
index b61045f48c..22608b76e3 100644
--- a/dimos/perception/detection2d/type/person.py
+++ b/dimos/perception/detection/type/person.py
@@ -23,7 +23,7 @@
 
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection2d.type.detection2d import Bbox, Detection2DBBox
+from dimos.perception.detection.type.detection2d import Bbox, Detection2DBBox
 from dimos.types.timestamped import to_ros_stamp
 
 if TYPE_CHECKING:
diff --git a/dimos/perception/detection2d/type/test_detection2d.py b/dimos/perception/detection/type/test_detection2d.py
similarity index 100%
rename from dimos/perception/detection2d/type/test_detection2d.py
rename to dimos/perception/detection/type/test_detection2d.py
diff --git a/dimos/perception/detection2d/type/test_detection3d.py b/dimos/perception/detection/type/test_detection3d.py
similarity index 94%
rename from dimos/perception/detection2d/type/test_detection3d.py
rename to dimos/perception/detection/type/test_detection3d.py
index 642e6c7542..2188583464 100644
--- a/dimos/perception/detection2d/type/test_detection3d.py
+++ b/dimos/perception/detection/type/test_detection3d.py
@@ -14,7 +14,7 @@
 
 import time
 
-from dimos.perception.detection2d.type.detection3d import Detection3D
+from dimos.perception.detection.type.detection3d import Detection3D
 
 
 def test_guess_projection(get_moment_2d, publish_moment):
diff --git a/dimos/perception/detection2d/type/test_detection3dpc.py b/dimos/perception/detection/type/test_detection3dpc.py
similarity index 100%
rename from dimos/perception/detection2d/type/test_detection3dpc.py
rename to dimos/perception/detection/type/test_detection3dpc.py
diff --git a/dimos/perception/detection2d/type/test_object3d.py b/dimos/perception/detection/type/test_object3d.py
similarity index 95%
rename from dimos/perception/detection2d/type/test_object3d.py
rename to dimos/perception/detection/type/test_object3d.py
index b7933e86d5..eb7b963a4e 100644
--- a/dimos/perception/detection2d/type/test_object3d.py
+++ b/dimos/perception/detection/type/test_object3d.py
@@ -14,10 +14,10 @@
 
 import pytest
 
-from dimos.perception.detection2d.module2D import Detection2DModule
-from dimos.perception.detection2d.module3D import Detection3DModule
-from dimos.perception.detection2d.moduleDB import Object3D, ObjectDBModule
-from dimos.perception.detection2d.type.detection3d import ImageDetections3D
+from dimos.perception.detection.module2D import Detection2DModule
+from dimos.perception.detection.module3D import Detection3DModule
+from dimos.perception.detection.moduleDB import Object3D, ObjectDBModule
+from dimos.perception.detection.type.detection3d import ImageDetections3D
 from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
 
 
diff --git a/dimos/perception/detection2d/__init__.py b/dimos/perception/detection2d/__init__.py
deleted file mode 100644
index 6dc59e7366..0000000000
--- a/dimos/perception/detection2d/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dimos.perception.detection2d.detectors import *
-from dimos.perception.detection2d.module2D import (
-    Detection2DModule,
-)
-from dimos.perception.detection2d.module3D import (
-    Detection3DModule,
-)
-from dimos.perception.detection2d.utils import *
diff --git a/dimos/perception/detection2d/detectors/__init__.py b/dimos/perception/detection2d/detectors/__init__.py
deleted file mode 100644
index 287fff1a15..0000000000
--- a/dimos/perception/detection2d/detectors/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# from dimos.perception.detection2d.detectors.detic import Detic2DDetector
-from dimos.perception.detection2d.detectors.types import Detector
-from dimos.perception.detection2d.detectors.yolo import Yolo2DDetector
diff --git a/dimos/perception/detection2d/type/__init__.py b/dimos/perception/detection2d/type/__init__.py
deleted file mode 100644
index aee8597d5c..0000000000
--- a/dimos/perception/detection2d/type/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from dimos.perception.detection2d.type.detection2d import (
-    Detection2D,
-    Detection2DBBox,
-    ImageDetections2D,
-    InconvinientDetectionFormat,
-)
-from dimos.perception.detection2d.type.detection3d import (
-    Detection3D,
-    ImageDetections3D,
-)
-from dimos.perception.detection2d.type.detection3dpc import (
-    Detection3DPC,
-    ImageDetections3DPC,
-)
-from dimos.perception.detection2d.type.imageDetections import ImageDetections, TableStr
-from dimos.perception.detection2d.type.person import Person
diff --git a/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py b/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py
index a9451acdf0..57227e6e23 100644
--- a/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py
+++ b/dimos/robot/unitree_webrtc/unitree_b1/test_connection.py
@@ -290,6 +290,9 @@ def test_mode_changes_with_watchdog(self):
         conn.watchdog_thread = threading.Thread(target=conn._watchdog_loop, daemon=True)
         conn.watchdog_thread.start()
 
+        # Give threads time to initialize
+        time.sleep(0.05)
+
         # Send walk command
         twist = TwistStamped(
             ts=time.time(),
@@ -301,8 +304,8 @@ def test_mode_changes_with_watchdog(self):
         assert conn.current_mode == 2
         assert conn._current_cmd.ly == 1.0
 
-        # Wait for timeout first
-        time.sleep(0.25)
+        # Wait for timeout first (0.2s timeout + 0.15s margin for reliability)
+        time.sleep(0.35)
         assert conn.timeout_active
         assert conn._current_cmd.ly == 0.0  # Watchdog zeroed it
 
diff --git a/dimos/utils/llm_utils.py b/dimos/utils/llm_utils.py
new file mode 100644
index 0000000000..05cc44ad24
--- /dev/null
+++ b/dimos/utils/llm_utils.py
@@ -0,0 +1,75 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import re
+from typing import Union
+
+
+def extract_json(response: str) -> Union[dict, list]:
+    """Extract JSON from potentially messy LLM response.
+
+    Tries multiple strategies:
+    1. Parse the entire response as JSON
+    2. Find and parse JSON arrays in the response
+    3. Find and parse JSON objects in the response
+
+    Args:
+        response: Raw text response that may contain JSON
+
+    Returns:
+        Parsed JSON object (dict or list)
+
+    Raises:
+        json.JSONDecodeError: If no valid JSON can be extracted
+    """
+    # First try to parse the whole response as JSON
+    try:
+        return json.loads(response)
+    except json.JSONDecodeError:
+        pass
+
+    # If that fails, try to extract JSON from the messy response
+    # Look for JSON arrays or objects in the text
+
+    # Pattern to match JSON arrays (including nested arrays/objects)
+    # This finds the outermost [...] structure
+    array_pattern = r"\[(?:[^\[\]]*|\[(?:[^\[\]]*|\[[^\[\]]*\])*\])*\]"
+
+    # Pattern to match JSON objects
+    object_pattern = r"\{(?:[^{}]*|\{(?:[^{}]*|\{[^{}]*\})*\})*\}"
+
+    # Try to find JSON arrays first (most common for detections)
+    matches = re.findall(array_pattern, response, re.DOTALL)
+    for match in matches:
+        try:
+            parsed = json.loads(match)
+            # For detection arrays, we expect a list
+            if isinstance(parsed, list):
+                return parsed
+        except json.JSONDecodeError:
+            continue
+
+    # Try JSON objects if no arrays found
+    matches = re.findall(object_pattern, response, re.DOTALL)
+    for match in matches:
+        try:
+            return json.loads(match)
+        except json.JSONDecodeError:
+            continue
+
+    # If nothing worked, raise an error with the original response
+    raise json.JSONDecodeError(
+        f"Could not extract valid JSON from response: {response[:200]}...", response, 0
+    )
diff --git a/dimos/utils/test_llm_utils.py b/dimos/utils/test_llm_utils.py
new file mode 100644
index 0000000000..4073fd8af2
--- /dev/null
+++ b/dimos/utils/test_llm_utils.py
@@ -0,0 +1,123 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for LLM utility functions."""
+
+import json
+
+import pytest
+
+from dimos.utils.llm_utils import extract_json
+
+
+def test_extract_json_clean_response():
+    """Test extract_json with clean JSON response."""
+    clean_json = '[["object", 1, 2, 3, 4]]'
+    result = extract_json(clean_json)
+    assert result == [["object", 1, 2, 3, 4]]
+
+
+def test_extract_json_with_text_before_after():
+    """Test extract_json with text before and after JSON."""
+    messy = """Here's what I found:
+    [
+        ["person", 10, 20, 30, 40],
+        ["car", 50, 60, 70, 80]
+    ]
+    Hope this helps!"""
+    result = extract_json(messy)
+    assert result == [["person", 10, 20, 30, 40], ["car", 50, 60, 70, 80]]
+
+
+def test_extract_json_with_emojis():
+    """Test extract_json with emojis and markdown code blocks."""
+    messy = """Sure! 😊 Here are the detections:
+
+    ```json
+    [["human", 100, 200, 300, 400]]
+    ```
+
+    Let me know if you need anything else! 👍"""
+    result = extract_json(messy)
+    assert result == [["human", 100, 200, 300, 400]]
+
+
+def test_extract_json_multiple_json_blocks():
+    """Test extract_json when there are multiple JSON blocks."""
+    messy = """First attempt (wrong format):
+    {"error": "not what we want"}
+
+    Correct format:
+    [
+        ["cat", 10, 10, 50, 50],
+        ["dog", 60, 60, 100, 100]
+    ]
+
+    Another block: {"also": "not needed"}"""
+    result = extract_json(messy)
+    # Should return the first valid array
+    assert result == [["cat", 10, 10, 50, 50], ["dog", 60, 60, 100, 100]]
+
+
+def test_extract_json_object():
+    """Test extract_json with JSON object instead of array."""
+    response = 'The result is: {"status": "success", "count": 5}'
+    result = extract_json(response)
+    assert result == {"status": "success", "count": 5}
+
+
+def test_extract_json_nested_structures():
+    """Test extract_json with nested arrays and objects."""
+    response = """Processing complete:
+    [
+        ["label1", 1, 2, 3, 4],
+        {"nested": {"value": 10}},
+        ["label2", 5, 6, 7, 8]
+    ]"""
+    result = extract_json(response)
+    assert result[0] == ["label1", 1, 2, 3, 4]
+    assert result[1] == {"nested": {"value": 10}}
+    assert result[2] == ["label2", 5, 6, 7, 8]
+
+
+def test_extract_json_invalid():
+    """Test extract_json raises error when no valid JSON found."""
+    response = "This response has no valid JSON at all!"
+    with pytest.raises(json.JSONDecodeError) as exc_info:
+        extract_json(response)
+    assert "Could not extract valid JSON" in str(exc_info.value)
+
+
+# Test with actual LLM response format
+MOCK_LLM_RESPONSE = """
+   Yes :)
+
+   [
+    ["humans", 76, 368, 219, 580],
+    ["humans", 354, 372, 512, 525],
+    ["humans", 409, 370, 615, 748],
+    ["humans", 628, 350, 762, 528],
+    ["humans", 785, 323, 960, 650]
+   ]
+
+   Hope this helps!😀😊 :)"""
+
+
+def test_extract_json_with_real_llm_response():
+    """Test extract_json with actual messy LLM response."""
+    result = extract_json(MOCK_LLM_RESPONSE)
+    assert isinstance(result, list)
+    assert len(result) == 5
+    assert result[0] == ["humans", 76, 368, 219, 580]
+    assert result[-1] == ["humans", 785, 323, 960, 650]

From 98e1c24d890cacce7b5b79b771fcb81bd72f61f3 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 16:52:50 -0700
Subject: [PATCH 05/47] obsoleted inconvinient detection format entirely

---
 .../detection/detectors/conftest.py           |  38 +++++
 .../detectors/person/test_annotations.py      |  10 +-
 .../person/test_detection2d_conformance.py    |  20 +--
 .../person/test_imagedetections2d.py          |  21 +--
 .../detection/detectors/person/test_yolo.py   |  57 ++++---
 .../detection/detectors/person/yolo.py        | 114 +------------
 .../detection/detectors/test_yolo.py          | 159 ++++++++++++++++++
 dimos/perception/detection/detectors/types.py |   6 +-
 dimos/perception/detection/detectors/yolo.py  |  93 +---------
 dimos/perception/detection/module2D.py        |   8 +-
 dimos/perception/detection/type/__init__.py   |   2 +-
 .../perception/detection/type/detection2d.py  |  90 +++++++++-
 dimos/perception/detection/type/person.py     |  77 +++++++--
 13 files changed, 407 insertions(+), 288 deletions(-)
 create mode 100644 dimos/perception/detection/detectors/conftest.py
 create mode 100644 dimos/perception/detection/detectors/test_yolo.py

diff --git a/dimos/perception/detection/detectors/conftest.py b/dimos/perception/detection/detectors/conftest.py
new file mode 100644
index 0000000000..cf4b1712e3
--- /dev/null
+++ b/dimos/perception/detection/detectors/conftest.py
@@ -0,0 +1,38 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
+from dimos.perception.detection.detectors.yolo import Yolo2DDetector
+from dimos.utils.data import get_data
+
+
+@pytest.fixture()
+def test_image():
+    """Load the test image used for detector tests."""
+    return Image.from_file(get_data("cafe.jpg"))
+
+
+@pytest.fixture()
+def person_detector():
+    """Create a YoloPersonDetector instance."""
+    return YoloPersonDetector()
+
+
+@pytest.fixture()
+def bbox_detector():
+    """Create a Yolo2DDetector instance for general object detection."""
+    return Yolo2DDetector()
diff --git a/dimos/perception/detection/detectors/person/test_annotations.py b/dimos/perception/detection/detectors/person/test_annotations.py
index a5c238029c..d3c06f9a29 100644
--- a/dimos/perception/detection/detectors/person/test_annotations.py
+++ b/dimos/perception/detection/detectors/person/test_annotations.py
@@ -25,10 +25,10 @@ def test_person_annotations():
     """Test that Person annotations include keypoints and skeleton."""
     image = Image.from_file(get_data("cafe.jpg"))
     detector = YoloPersonDetector()
-    people = detector.detect_people(image)
+    detections = detector.process_image(image)
 
-    assert len(people) > 0
-    person = people[0]
+    assert len(detections.detections) > 0
+    person = detections.detections[0]
 
     # Test text annotations
     text_anns = person.to_text_annotation()
@@ -64,7 +64,3 @@ def test_person_annotations():
 
     print(f"\n✓ Person annotations working correctly!")
     print(f"  - {len(person.get_visible_keypoints(0.5))}/17 visible keypoints")
-
-
-if __name__ == "__main__":
-    test_person_annotations()
diff --git a/dimos/perception/detection/detectors/person/test_detection2d_conformance.py b/dimos/perception/detection/detectors/person/test_detection2d_conformance.py
index b8fb92182e..300d5da5fd 100644
--- a/dimos/perception/detection/detectors/person/test_detection2d_conformance.py
+++ b/dimos/perception/detection/detectors/person/test_detection2d_conformance.py
@@ -12,21 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
-from dimos.perception.detection.type.person import Person
 from dimos.utils.data import get_data
 
 
 def test_person_detection2d_bbox_conformance():
-    """Test that Person conforms to Detection2DBBox interface."""
+    """Test that Detection2DPerson conforms to Detection2DBBox interface."""
     image = Image.from_file(get_data("cafe.jpg"))
     detector = YoloPersonDetector()
-    people = detector.detect_people(image)
+    detections = detector.process_image(image)
 
-    assert len(people) > 0
-    person = people[0]
+    assert len(detections.detections) > 0
+    person = detections.detections[0]
 
     # Test Detection2DBBox methods
     # Test bbox operations
@@ -68,15 +66,11 @@ def test_person_detection2d_bbox_conformance():
 
     # Test string representation
     str_repr = str(person)
-    assert "Person" in str_repr
+    assert "Detection2DPerson" in str_repr
     assert "person" in str_repr  # name field
 
-    print("\n✓ Person class fully conforms to Detection2DBBox interface")
-    print(f"  - Detected {len(people)} people")
+    print("\n✓ Detection2DPerson class fully conforms to Detection2DBBox interface")
+    print(f"  - Detected {len(detections.detections)} people")
     print(f"  - First person confidence: {person.confidence:.3f}")
     print(f"  - Bbox volume: {volume:.1f}")
     print(f"  - Has {len(person.get_visible_keypoints(0.5))} visible keypoints")
-
-
-if __name__ == "__main__":
-    test_person_detection2d_bbox_conformance()
diff --git a/dimos/perception/detection/detectors/person/test_imagedetections2d.py b/dimos/perception/detection/detectors/person/test_imagedetections2d.py
index 5f8eac584f..ce595a244b 100644
--- a/dimos/perception/detection/detectors/person/test_imagedetections2d.py
+++ b/dimos/perception/detection/detectors/person/test_imagedetections2d.py
@@ -25,31 +25,24 @@ def test_image_detections_2d_with_person():
     # Load image and detect people
     image = Image.from_file(get_data("cafe.jpg"))
     detector = YoloPersonDetector()
-    people = detector.detect_people(image)
-
-    # Create ImageDetections2D using from_pose_detector
-    image_detections = ImageDetections2D.from_pose_detector(image, people)
+    image_detections = detector.process_image(image)
 
     # Verify structure
     assert image_detections.image is image
-    assert len(image_detections.detections) == len(people)
-    assert all(det in people for det in image_detections.detections)
+    assert len(image_detections.detections) > 0
 
     # Test image annotations (includes pose keypoints)
     annotations = image_detections.to_foxglove_annotations()
-    print(f"\nImageDetections2D created with {len(people)} people")
+    num_people = len(image_detections.detections)
+    print(f"\nImageDetections2D created with {num_people} people")
     print(f"Total text annotations: {annotations.texts_length}")
     print(f"Total points annotations: {annotations.points_length}")
 
     # Points should include: bounding boxes + keypoints + skeleton lines
     # At least 3 annotations per person (bbox, keypoints, skeleton)
-    assert annotations.points_length >= len(people) * 3
+    assert annotations.points_length >= num_people * 3
 
     # Text annotations should include confidence, name/id, and keypoint count
-    assert annotations.texts_length >= len(people) * 3
-
-    print("\n✓ ImageDetections2D.from_pose_detector working correctly!")
-
+    assert annotations.texts_length >= num_people * 3
 
-if __name__ == "__main__":
-    test_image_detections_2d_with_person()
+    print("\n✓ ImageDetections2D from person detector working correctly!")
diff --git a/dimos/perception/detection/detectors/person/test_yolo.py b/dimos/perception/detection/detectors/person/test_yolo.py
index b9a0d18566..2c70dc1232 100644
--- a/dimos/perception/detection/detectors/person/test_yolo.py
+++ b/dimos/perception/detection/detectors/person/test_yolo.py
@@ -14,43 +14,37 @@
 
 import pytest
 
-from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
-from dimos.perception.detection.type.person import Person
-from dimos.utils.data import get_data
+from dimos.perception.detection.type import Detection2DBBox, Detection2DPerson, ImageDetections2D
 
 
 @pytest.fixture()
-def detector():
-    return YoloPersonDetector()
+def people(person_detector, test_image):
+    """Get ImageDetections2D from person detector."""
+    return person_detector.process_image(test_image)
 
 
 @pytest.fixture()
-def test_image():
-    return Image.from_file(get_data("cafe.jpg"))
+def people_list(people, test_image):
+    """Get list of Detection2DPerson objects."""
+    return people.detections
 
 
-@pytest.fixture()
-def people(detector, test_image):
-    return detector.detect_people(test_image)
-
-
-def test_person_detection(people):
+def test_person_detection(people_list):
     """Test that we can detect people with pose keypoints."""
-    assert len(people) > 0
+    assert len(people_list) > 0
 
     # Check first person
-    person = people[0]
-    assert isinstance(person, Person)
+    person = people_list[0]
+    assert isinstance(person, Detection2DPerson)
     assert person.confidence > 0
     assert len(person.bbox) == 4  # bbox is now a tuple
     assert person.keypoints.shape == (17, 2)
     assert person.keypoint_scores.shape == (17,)
 
 
-def test_person_properties(people):
-    """Test Person object properties and methods."""
-    person = people[0]
+def test_person_properties(people_list):
+    """Test Detection2DPerson object properties and methods."""
+    person = people_list[0]
 
     # Test bounding box properties
     assert person.width > 0
@@ -70,9 +64,9 @@ def test_person_properties(people):
     assert all(0 <= conf <= 1 for _, _, conf in visible)
 
 
-def test_person_normalized_coords(people):
+def test_person_normalized_coords(people_list):
     """Test normalized coordinates if available."""
-    person = people[0]
+    person = people_list[0]
 
     if person.keypoints_normalized is not None:
         assert person.keypoints_normalized.shape == (17, 2)
@@ -86,11 +80,11 @@ def test_person_normalized_coords(people):
         assert (person.bbox_normalized <= 1).all()
 
 
-def test_multiple_people(people):
+def test_multiple_people(people_list):
     """Test that multiple people can be detected."""
-    print(f"\nDetected {len(people)} people in test image")
+    print(f"\nDetected {len(people_list)} people in test image")
 
-    for i, person in enumerate(people[:3]):  # Show first 3
+    for i, person in enumerate(people_list[:3]):  # Show first 3
         print(f"\nPerson {i}:")
         print(f"  Confidence: {person.confidence:.3f}")
         print(f"  Size: {person.width:.1f} x {person.height:.1f}")
@@ -101,12 +95,19 @@ def test_multiple_people(people):
             print(f"    {name}: ({xy[0]:.1f}, {xy[1]:.1f}) conf={conf:.3f}")
 
 
+def test_image_detections2d_structure(people):
+    """Test that process_image returns ImageDetections2D."""
+    assert isinstance(people, ImageDetections2D)
+    assert len(people.detections) > 0
+    assert all(isinstance(d, Detection2DPerson) for d in people.detections)
+
+
 def test_invalid_keypoint(test_image):
     """Test error handling for invalid keypoint names."""
-    # Create a dummy person
+    # Create a dummy Detection2DPerson
     import numpy as np
 
-    person = Person(
+    person = Detection2DPerson(
         # Detection2DBBox fields
         bbox=(0.0, 0.0, 100.0, 100.0),
         track_id=0,
@@ -115,7 +116,7 @@ def test_invalid_keypoint(test_image):
         name="person",
         ts=test_image.ts,
         image=test_image,
-        # Person fields
+        # Detection2DPerson fields
         keypoints=np.zeros((17, 2)),
         keypoint_scores=np.zeros(17),
     )
diff --git a/dimos/perception/detection/detectors/person/yolo.py b/dimos/perception/detection/detectors/person/yolo.py
index 506c63adc9..a5bd211210 100644
--- a/dimos/perception/detection/detectors/person/yolo.py
+++ b/dimos/perception/detection/detectors/person/yolo.py
@@ -12,127 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-import torch
 from ultralytics import YOLO
-from ultralytics.engine.results import Boxes, Keypoints, Results
 
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.detectors.types import Detector
+from dimos.perception.detection.type import ImageDetections2D
 from dimos.utils.data import get_data
 from dimos.utils.logging_config import setup_logger
 
 logger = setup_logger("dimos.perception.detection.yolo.person")
 
 
-# Type alias for YOLO person detection results
-YoloPersonResults = List[Results]
-
-"""
-YOLO Person Detection Results Structure:
-
-Each Results object in the list contains:
-
-1. boxes (Boxes object):
-   - boxes.xyxy: torch.Tensor [N, 4] - bounding boxes in [x1, y1, x2, y2] format
-   - boxes.xywh: torch.Tensor [N, 4] - boxes in [x_center, y_center, width, height] format
-   - boxes.conf: torch.Tensor [N] - confidence scores (0-1)
-   - boxes.cls: torch.Tensor [N] - class IDs (0 for person)
-   - boxes.xyxyn: torch.Tensor [N, 4] - normalized xyxy coordinates (0-1)
-   - boxes.xywhn: torch.Tensor [N, 4] - normalized xywh coordinates (0-1)
-
-2. keypoints (Keypoints object):
-   - keypoints.xy: torch.Tensor [N, 17, 2] - absolute x,y coordinates for 17 keypoints
-   - keypoints.conf: torch.Tensor [N, 17] - confidence/visibility scores for each keypoint
-   - keypoints.xyn: torch.Tensor [N, 17, 2] - normalized coordinates (0-1)
-   
-   Keypoint order (COCO format):
-   0: nose, 1: left_eye, 2: right_eye, 3: left_ear, 4: right_ear,
-   5: left_shoulder, 6: right_shoulder, 7: left_elbow, 8: right_elbow,
-   9: left_wrist, 10: right_wrist, 11: left_hip, 12: right_hip,
-   13: left_knee, 14: right_knee, 15: left_ankle, 16: right_ankle
-
-3. Other attributes:
-   - names: Dict[int, str] - class names mapping {0: 'person'}
-   - orig_shape: Tuple[int, int] - original image (height, width)
-   - speed: Dict[str, float] - timing info {'preprocess': ms, 'inference': ms, 'postprocess': ms}
-   - path: str - image path
-   - orig_img: np.ndarray - original image array
-
-Note: All tensor data is on GPU by default. Use .cpu() to move to CPU.
-"""
-from dimos.perception.detection.type.person import Person
-
-
 class YoloPersonDetector(Detector):
     def __init__(self, model_path="models_yolo", model_name="yolo11n-pose.pt"):
         self.model = YOLO(get_data(model_path) / model_name, task="pose")
 
-    def process_image(self, image: Image) -> YoloPersonResults:
-        """Process image and return YOLO person detection results.
-
-        Returns:
-            List of Results objects, typically one per image.
-            Each Results object contains:
-            - boxes: Boxes with xyxy, xywh, conf, cls tensors
-            - keypoints: Keypoints with xy, conf, xyn tensors
-            - names: {0: 'person'} class mapping
-            - orig_shape: original image dimensions
-            - speed: inference timing
-        """
-        return self.model(source=image.to_opencv())
+    def process_image(self, image: Image) -> ImageDetections2D:
+        """Process image and return detection results.
 
-    def detect_people(self, image: Image) -> List[Person]:
-        """Process image and return list of Person objects.
+        Args:
+            image: Input image
 
         Returns:
-            List of Person objects with pose keypoints
+            ImageDetections2D containing Detection2DPerson objects with pose keypoints
         """
-        results = self.process_image(image)
-
-        people = []
-        for result in results:
-            if result.keypoints is None or result.boxes is None:
-                continue
-
-            # Create Person object for each detection
-            num_detections = len(result.boxes.xyxy)
-            for i in range(num_detections):
-                person = Person.from_yolo(result, i, image)
-                people.append(person)
-
-        return people
-
-
-def main():
-    image = Image.from_file(get_data("cafe.jpg"))
-    detector = YoloPersonDetector()
-
-    # Get Person objects
-    people = detector.detect_people(image)
-
-    print(f"Detected {len(people)} people")
-    for i, person in enumerate(people):
-        print(f"\nPerson {i}:")
-        print(f"  Confidence: {person.confidence:.3f}")
-        print(f"  Bounding box: {person.bbox}")
-        cx, cy = person.center
-        print(f"  Center: ({cx:.1f}, {cy:.1f})")
-        print(f"  Size: {person.width:.1f} x {person.height:.1f}")
-
-        # Get specific keypoints
-        nose_xy, nose_conf = person.get_keypoint("nose")
-        print(f"  Nose: {nose_xy} (conf: {nose_conf:.3f})")
-
-        # Get all visible keypoints
-        visible = person.get_visible_keypoints(threshold=0.7)
-        print(f"  Visible keypoints (>0.7): {len(visible)}")
-        for name, xy, conf in visible[:3]:  # Show first 3
-            print(f"    {name}: {xy} (conf: {conf:.3f})")
-
-
-if __name__ == "__main__":
-    main()
+        results = self.model(source=image.to_opencv())
+        return ImageDetections2D.from_ultralytics_result(image, results)
diff --git a/dimos/perception/detection/detectors/test_yolo.py b/dimos/perception/detection/detectors/test_yolo.py
new file mode 100644
index 0000000000..27cfb8cb9d
--- /dev/null
+++ b/dimos/perception/detection/detectors/test_yolo.py
@@ -0,0 +1,159 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
+
+
+@pytest.fixture()
+def bboxes(bbox_detector, test_image):
+    """Get ImageDetections2D from bbox detector."""
+    return bbox_detector.process_image(test_image)
+
+
+@pytest.fixture()
+def bbox_list(bbox_detector, test_image):
+    """Get list of Detection2DBBox objects."""
+    detections = bbox_detector.process_image(test_image)
+    return detections.detections
+
+
+def test_bbox_detection(bbox_list):
+    """Test that we can detect objects with bounding boxes."""
+    assert len(bbox_list) > 0
+
+    # Check first detection
+    detection = bbox_list[0]
+    assert isinstance(detection, Detection2DBBox)
+    assert detection.confidence > 0
+    assert len(detection.bbox) == 4  # bbox is a tuple (x1, y1, x2, y2)
+    assert detection.class_id >= 0
+    assert detection.name is not None
+
+
+def test_bbox_properties(bbox_list):
+    """Test Detection2DBBox object properties and methods."""
+    detection = bbox_list[0]
+
+    # Test bounding box is valid
+    x1, y1, x2, y2 = detection.bbox
+    assert x2 > x1, "x2 should be greater than x1"
+    assert y2 > y1, "y2 should be greater than y1"
+    assert all(coord >= 0 for coord in detection.bbox), "Coordinates should be non-negative"
+
+    # Test bbox volume
+    volume = detection.bbox_2d_volume()
+    assert volume > 0
+    expected_volume = (x2 - x1) * (y2 - y1)
+    assert abs(volume - expected_volume) < 0.01
+
+    # Test center calculation
+    center_x, center_y, width, height = detection.get_bbox_center()
+    assert center_x == (x1 + x2) / 2.0
+    assert center_y == (y1 + y2) / 2.0
+    assert width == x2 - x1
+    assert height == y2 - y1
+
+
+def test_bbox_cropped_image(bbox_list, test_image):
+    """Test cropping image to detection bbox."""
+    detection = bbox_list[0]
+
+    # Test cropped image
+    cropped = detection.cropped_image(padding=20)
+    assert cropped is not None
+
+    # Cropped image should be smaller than original (usually)
+    if test_image.shape:
+        assert cropped.shape[0] <= test_image.shape[0]
+        assert cropped.shape[1] <= test_image.shape[1]
+
+
+def test_bbox_annotations(bbox_list):
+    """Test annotation generation for bboxes."""
+    detection = bbox_list[0]
+
+    # Test text annotations
+    text_annotations = detection.to_text_annotation()
+    assert len(text_annotations) == 2  # confidence and name/track_id
+
+    # Test points annotations (bounding box)
+    points_annotations = detection.to_points_annotation()
+    assert len(points_annotations) == 1  # Just the bbox polygon
+
+    # Test image annotations
+    annotations = detection.to_image_annotations()
+    assert annotations.texts_length == 2
+    assert annotations.points_length == 1
+
+
+def test_bbox_ros_conversion(bbox_list):
+    """Test conversion to ROS Detection2D message."""
+    detection = bbox_list[0]
+
+    ros_det = detection.to_ros_detection2d()
+
+    # Check bbox conversion
+    center_x, center_y, width, height = detection.get_bbox_center()
+    assert abs(ros_det.bbox.center.position.x - center_x) < 0.01
+    assert abs(ros_det.bbox.center.position.y - center_y) < 0.01
+    assert abs(ros_det.bbox.size_x - width) < 0.01
+    assert abs(ros_det.bbox.size_y - height) < 0.01
+
+    # Check confidence and class_id
+    assert len(ros_det.results) > 0
+    assert ros_det.results[0].hypothesis.score == detection.confidence
+    assert ros_det.results[0].hypothesis.class_id == detection.class_id
+
+
+def test_bbox_is_valid(bbox_list):
+    """Test bbox validation."""
+    detection = bbox_list[0]
+
+    # Detection from real detector should be valid
+    assert detection.is_valid()
+
+
+def test_image_detections2d_structure(bboxes):
+    """Test that process_image returns ImageDetections2D."""
+    assert isinstance(bboxes, ImageDetections2D)
+    assert len(bboxes.detections) > 0
+    assert all(isinstance(d, Detection2DBBox) for d in bboxes.detections)
+
+
+def test_multiple_detections(bboxes):
+    """Test that multiple objects can be detected."""
+    print(f"\nDetected {len(bboxes.detections)} objects in test image")
+
+    for i, detection in enumerate(bboxes.detections[:5]):  # Show first 5
+        print(f"\nDetection {i}:")
+        print(f"  Class: {detection.name} (id: {detection.class_id})")
+        print(f"  Confidence: {detection.confidence:.3f}")
+        print(
+            f"  Bbox: ({detection.bbox[0]:.1f}, {detection.bbox[1]:.1f}, {detection.bbox[2]:.1f}, {detection.bbox[3]:.1f})"
+        )
+        print(f"  Track ID: {detection.track_id}")
+
+
+def test_detection_string_representation(bbox_list):
+    """Test string representation of detections."""
+    detection = bbox_list[0]
+    str_repr = str(detection)
+
+    # Should contain class name
+    assert "Detection2DBBox" in str_repr
+
+    # Should show object name
+    assert detection.name in str_repr or f"class_{detection.class_id}" in str_repr
diff --git a/dimos/perception/detection/detectors/types.py b/dimos/perception/detection/detectors/types.py
index 6acbba601e..1a3b0b5471 100644
--- a/dimos/perception/detection/detectors/types.py
+++ b/dimos/perception/detection/detectors/types.py
@@ -15,11 +15,9 @@
 from abc import ABC, abstractmethod
 
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.type import (
-    InconvinientDetectionFormat,
-)
+from dimos.perception.detection.type import ImageDetections2D
 
 
 class Detector(ABC):
     @abstractmethod
-    def process_image(self, image: Image) -> InconvinientDetectionFormat: ...
+    def process_image(self, image: Image) -> ImageDetections2D: ...
diff --git a/dimos/perception/detection/detectors/yolo.py b/dimos/perception/detection/detectors/yolo.py
index 0f47ea246e..af457540cc 100644
--- a/dimos/perception/detection/detectors/yolo.py
+++ b/dimos/perception/detection/detectors/yolo.py
@@ -20,11 +20,7 @@
 
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.detectors.types import Detector
-from dimos.perception.detection2d.utils import (
-    extract_detection_results,
-    filter_detections,
-    plot_results,
-)
+from dimos.perception.detection.type import ImageDetections2D
 from dimos.utils.data import get_data
 from dimos.utils.gpu_utils import is_cuda_available
 from dimos.utils.logging_config import setup_logger
@@ -56,20 +52,15 @@ def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device="
             self.device = "cpu"
             logger.debug("Using CPU for YOLO 2d detector")
 
-    def process_image(self, image: Image):
+    def process_image(self, image: Image) -> ImageDetections2D:
         """
         Process an image and return detection results.
 
         Args:
-            image: Input image in BGR format (OpenCV)
+            image: Input image
 
         Returns:
-            tuple: (bboxes, track_ids, class_ids, confidences, names)
-                - bboxes: list of [x1, y1, x2, y2] coordinates
-                - track_ids: list of tracking IDs (or -1 if no tracking)
-                - class_ids: list of class indices
-                - confidences: list of detection confidences
-                - names: list of class names
+            ImageDetections2D containing all detected objects
         """
         results = self.model.track(
             source=image.to_opencv(),
@@ -81,29 +72,7 @@ def process_image(self, image: Image):
             tracker=self.tracker_config,
         )
 
-        if len(results) > 0:
-            # Extract detection results
-            bboxes, track_ids, class_ids, confidences, names = extract_detection_results(results[0])
-            return bboxes, track_ids, class_ids, confidences, names
-
-        return [], [], [], [], []
-
-    def visualize_results(self, image, bboxes, track_ids, class_ids, confidences, names):
-        """
-        Generate visualization of detection results.
-
-        Args:
-            image: Original input image
-            bboxes: List of bounding boxes
-            track_ids: List of tracking IDs
-            class_ids: List of class indices
-            confidences: List of detection confidences
-            names: List of class names
-
-        Returns:
-            Image with visualized detections
-        """
-        return plot_results(image, bboxes, track_ids, class_ids, confidences, names)
+        return ImageDetections2D.from_ultralytics_result(image, results)
 
     def stop(self):
         """
@@ -118,55 +87,3 @@ def stop(self):
                         if hasattr(gmc, "executor") and gmc.executor is not None:
                             gmc.executor.shutdown(wait=True)
             self.model.predictor = None
-
-
-def main():
-    """Example usage of the Yolo2DDetector class."""
-    # Initialize video capture
-    cap = cv2.VideoCapture(0)
-
-    # Initialize detector
-    detector = Yolo2DDetector()
-
-    enable_person_filter = True
-
-    try:
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-
-            # Process frame
-            bboxes, track_ids, class_ids, confidences, names = detector.process_image(frame)
-
-            # Apply person filtering if enabled
-            if enable_person_filter and len(bboxes) > 0:
-                # Person is class_id 0 in COCO dataset
-                bboxes, track_ids, class_ids, confidences, names = filter_detections(
-                    bboxes,
-                    track_ids,
-                    class_ids,
-                    confidences,
-                    names,
-                    class_filter=[0],  # 0 is the class_id for person
-                    name_filter=["person"],
-                )
-
-            # Visualize results
-            if len(bboxes) > 0:
-                frame = detector.visualize_results(
-                    frame, bboxes, track_ids, class_ids, confidences, names
-                )
-
-            # Display results
-            cv2.imshow("YOLO Detection", frame)
-            if cv2.waitKey(1) & 0xFF == ord("q"):
-                break
-
-    finally:
-        cap.release()
-        cv2.destroyAllWindows()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/dimos/perception/detection/module2D.py b/dimos/perception/detection/module2D.py
index eca73afa8e..1977362bae 100644
--- a/dimos/perception/detection/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -60,13 +60,7 @@ def __init__(self, *args, **kwargs):
         self.vlm_detections_subject = Subject()
 
     def process_image_frame(self, image: Image) -> ImageDetections2D:
-        # Use person detection specifically if it's a YoloPersonDetector
-        if isinstance(self.detector, YoloPersonDetector):
-            people = self.detector.detect_people(image)
-            return ImageDetections2D.from_pose_detector(image, people)
-        else:
-            # Fallback to generic dettection for other detectors
-            return ImageDetections2D.from_bbox_detector(image, self.detector.process_image(image))
+        return self.detector.process_image(image)
 
     @simple_mcache
     def sharp_image_stream(self) -> Observable[Image]:
diff --git a/dimos/perception/detection/type/__init__.py b/dimos/perception/detection/type/__init__.py
index 54147da975..74ab3dacab 100644
--- a/dimos/perception/detection/type/__init__.py
+++ b/dimos/perception/detection/type/__init__.py
@@ -13,4 +13,4 @@
     ImageDetections3DPC,
 )
 from dimos.perception.detection.type.imageDetections import ImageDetections, TableStr
-from dimos.perception.detection.type.person import Person
+from dimos.perception.detection.type.person import Detection2DPerson
diff --git a/dimos/perception/detection/type/detection2d.py b/dimos/perception/detection/type/detection2d.py
index 44dcf47153..e097728992 100644
--- a/dimos/perception/detection/type/detection2d.py
+++ b/dimos/perception/detection/type/detection2d.py
@@ -19,8 +19,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple
 
-from dimos.utils.decorators.decorators import simple_mcache
-
 from dimos_lcm.foxglove_msgs.ImageAnnotations import (
     PointsAnnotation,
     TextAnnotation,
@@ -38,6 +36,7 @@
 )
 from rich.console import Console
 from rich.text import Text
+from ultralytics.engine.results import Boxes, Keypoints, Results
 
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.foxglove_msgs.Color import Color
@@ -45,9 +44,10 @@
 from dimos.msgs.std_msgs import Header
 from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import Timestamped, to_ros_stamp, to_timestamp
+from dimos.utils.decorators.decorators import simple_mcache
 
 if TYPE_CHECKING:
-    from dimos.perception.detection.type.person import Person
+    from dimos.perception.detection.type.person import Detection2DPerson
 
 Bbox = Tuple[float, float, float, float]
 CenteredBbox = Tuple[float, float, float, float]
@@ -216,6 +216,53 @@ def from_detection(cls, raw_detection: Detection, **kwargs) -> "Detection2D":
             **kwargs,
         )
 
+    @classmethod
+    def from_ultralytics_result(cls, result: Results, idx: int, image: Image) -> "Detection2DBBox":
+        """Create Detection2DBBox from ultralytics Results object.
+
+        Args:
+            result: Ultralytics Results object containing detection data
+            idx: Index of the detection in the results
+            image: Source image
+
+        Returns:
+            Detection2DBBox instance
+        """
+        # Extract bounding box coordinates
+        bbox_array = result.boxes.xyxy[idx].cpu().numpy()
+        bbox: Bbox = (
+            float(bbox_array[0]),
+            float(bbox_array[1]),
+            float(bbox_array[2]),
+            float(bbox_array[3]),
+        )
+
+        # Extract confidence
+        confidence = float(result.boxes.conf[idx].cpu())
+
+        # Extract class ID and name
+        class_id = int(result.boxes.cls[idx].cpu())
+        name = (
+            result.names.get(class_id, f"class_{class_id}")
+            if hasattr(result, "names")
+            else f"class_{class_id}"
+        )
+
+        # Extract track ID if available
+        track_id = -1
+        if hasattr(result.boxes, "id") and result.boxes.id is not None:
+            track_id = int(result.boxes.id[idx].cpu())
+
+        return cls(
+            bbox=bbox,
+            track_id=track_id,
+            class_id=class_id,
+            confidence=confidence,
+            name=name,
+            ts=image.ts,
+            image=image,
+        )
+
     def get_bbox_center(self) -> CenteredBbox:
         x1, y1, x2, y2 = self.bbox
         center_x = (x1 + x2) / 2.0
@@ -359,6 +406,43 @@ def to_ros_detection2d(self) -> ROSDetection2D:
 
 
 class ImageDetections2D(ImageDetections[Detection2D]):
+    @classmethod
+    def from_ultralytics_result(
+        cls, image: Image, results: List[Results], **kwargs
+    ) -> "ImageDetections2D":
+        """Create ImageDetections2D from ultralytics Results.
+
+        Dispatches to appropriate Detection2D subclass based on result type:
+        - If keypoints present: creates Detection2DPerson
+        - Otherwise: creates Detection2DBBox
+
+        Args:
+            image: Source image
+            results: List of ultralytics Results objects
+            **kwargs: Additional arguments passed to detection constructors
+
+        Returns:
+            ImageDetections2D containing appropriate detection types
+        """
+        from dimos.perception.detection.type.person import Detection2DPerson
+
+        detections = []
+        for result in results:
+            if result.boxes is None:
+                continue
+
+            num_detections = len(result.boxes.xyxy)
+            for i in range(num_detections):
+                if result.keypoints is not None:
+                    # Pose detection with keypoints
+                    detection = Detection2DPerson.from_ultralytics_result(result, i, image)
+                else:
+                    # Regular bbox detection
+                    detection = Detection2DBBox.from_ultralytics_result(result, i, image)
+                detections.append(detection)
+
+        return cls(image=image, detections=detections)
+
     @classmethod
     def from_bbox_detector(
         cls, image: Image, raw_detections: InconvinientDetectionFormat, **kwargs
diff --git a/dimos/perception/detection/type/person.py b/dimos/perception/detection/type/person.py
index 22608b76e3..773217194b 100644
--- a/dimos/perception/detection/type/person.py
+++ b/dimos/perception/detection/type/person.py
@@ -31,7 +31,7 @@
 
 
 @dataclass
-class Person(Detection2DBBox):
+class Detection2DPerson(Detection2DBBox):
     """Represents a detected person with pose keypoints."""
 
     # Pose keypoints - additional fields beyond Detection2DBBox
@@ -68,16 +68,48 @@ class Person(Detection2DBBox):
     ]
 
     @classmethod
-    def from_yolo(cls, result: "Results", person_idx: int, image: Image) -> "Person":
-        """Create Person instance from YOLO results.
+    def from_ultralytics_result(
+        cls, result: "Results", idx: int, image: Image
+    ) -> "Detection2DPerson":
+        """Create Detection2DPerson from ultralytics Results object with pose keypoints.
 
         Args:
-            result: Single Results object from YOLO
-            person_idx: Index of the person in the detection results
-            image: Original image for the detection
+            result: Ultralytics Results object containing detection and keypoint data
+            idx: Index of the detection in the results
+            image: Source image
+
+        Returns:
+            Detection2DPerson instance
+
+        Raises:
+            ValueError: If the result doesn't contain keypoints or is not a person detection
         """
+        # Validate that this is a pose detection result
+        if not hasattr(result, "keypoints") or result.keypoints is None:
+            raise ValueError(
+                f"Cannot create Detection2DPerson from result without keypoints. "
+                f"This appears to be a regular detection result, not a pose detection. "
+                f"Use Detection2DBBox.from_ultralytics_result() instead."
+            )
+
+        if not hasattr(result, "boxes") or result.boxes is None:
+            raise ValueError("Cannot create Detection2DPerson from result without bounding boxes")
+
+        # Check if this is actually a person detection (class 0 in COCO)
+        class_id = int(result.boxes.cls[idx].cpu())
+        if class_id != 0:  # Person is class 0 in COCO
+            class_name = (
+                result.names.get(class_id, f"class_{class_id}")
+                if hasattr(result, "names")
+                else f"class_{class_id}"
+            )
+            raise ValueError(
+                f"Cannot create Detection2DPerson from non-person detection. "
+                f"Got class {class_id} ({class_name}), expected class 0 (person)."
+            )
+
         # Extract bounding box as tuple for Detection2DBBox
-        bbox_array = result.boxes.xyxy[person_idx].cpu().numpy()
+        bbox_array = result.boxes.xyxy[idx].cpu().numpy()
 
         bbox: Bbox = (
             float(bbox_array[0]),
@@ -87,31 +119,37 @@ def from_yolo(cls, result: "Results", person_idx: int, image: Image) -> "Person"
         )
 
         bbox_norm = (
-            result.boxes.xyxyn[person_idx].cpu().numpy() if hasattr(result.boxes, "xyxyn") else None
+            result.boxes.xyxyn[idx].cpu().numpy() if hasattr(result.boxes, "xyxyn") else None
         )
 
-        confidence = float(result.boxes.conf[person_idx].cpu())
-        class_id = int(result.boxes.cls[person_idx].cpu())
+        confidence = float(result.boxes.conf[idx].cpu())
+        class_id = int(result.boxes.cls[idx].cpu())
 
         # Extract keypoints
-        keypoints = result.keypoints.xy[person_idx].cpu().numpy()
-        keypoint_scores = result.keypoints.conf[person_idx].cpu().numpy()
+        keypoints = result.keypoints.xy[idx].cpu().numpy()
+        keypoint_scores = result.keypoints.conf[idx].cpu().numpy()
         keypoints_norm = (
-            result.keypoints.xyn[person_idx].cpu().numpy()
-            if hasattr(result.keypoints, "xyn")
-            else None
+            result.keypoints.xyn[idx].cpu().numpy() if hasattr(result.keypoints, "xyn") else None
         )
 
         # Get image dimensions
         height, width = result.orig_shape
 
+        # Extract track ID if available
+        track_id = idx  # Use index as default
+        if hasattr(result.boxes, "id") and result.boxes.id is not None:
+            track_id = int(result.boxes.id[idx].cpu())
+
+        # Get class name
+        name = result.names.get(class_id, "person") if hasattr(result, "names") else "person"
+
         return cls(
             # Detection2DBBox fields
             bbox=bbox,
-            track_id=person_idx,  # Use person index as track_id for now
+            track_id=track_id,
             class_id=class_id,
             confidence=confidence,
-            name="person",
+            name=name,
             ts=image.ts,
             image=image,
             # Person specific fields
@@ -123,6 +161,11 @@ def from_yolo(cls, result: "Results", person_idx: int, image: Image) -> "Person"
             image_height=height,
         )
 
+    @classmethod
+    def from_yolo(cls, result: "Results", idx: int, image: Image) -> "Detection2DPerson":
+        """Alias for from_ultralytics_result for backward compatibility."""
+        return cls.from_ultralytics_result(result, idx, image)
+
     def get_keypoint(self, name: str) -> Tuple[np.ndarray, float]:
         """Get specific keypoint by name.
         Returns:

From 638d81e8dbfcc6ae59f65c733024c2c2bd4e9d86 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 16:57:40 -0700
Subject: [PATCH 06/47] tests cleanup

---
 .../detectors/person/test_annotations.py      | 66 ----------------
 .../person/test_detection2d_conformance.py    | 76 -------------------
 .../person/test_imagedetections2d.py          | 48 ------------
 .../detection/detectors/person/test_yolo.py   | 65 ++++++++++++----
 .../detection/detectors/test_yolo.py          | 37 ++++-----
 5 files changed, 65 insertions(+), 227 deletions(-)
 delete mode 100644 dimos/perception/detection/detectors/person/test_annotations.py
 delete mode 100644 dimos/perception/detection/detectors/person/test_detection2d_conformance.py
 delete mode 100644 dimos/perception/detection/detectors/person/test_imagedetections2d.py

diff --git a/dimos/perception/detection/detectors/person/test_annotations.py b/dimos/perception/detection/detectors/person/test_annotations.py
deleted file mode 100644
index d3c06f9a29..0000000000
--- a/dimos/perception/detection/detectors/person/test_annotations.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2025 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test person annotations work correctly."""
-
-import sys
-
-from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
-from dimos.utils.data import get_data
-
-
-def test_person_annotations():
-    """Test that Person annotations include keypoints and skeleton."""
-    image = Image.from_file(get_data("cafe.jpg"))
-    detector = YoloPersonDetector()
-    detections = detector.process_image(image)
-
-    assert len(detections.detections) > 0
-    person = detections.detections[0]
-
-    # Test text annotations
-    text_anns = person.to_text_annotation()
-    print(f"\nText annotations: {len(text_anns)}")
-    for i, ann in enumerate(text_anns):
-        print(f"  {i}: {ann.text}")
-    assert len(text_anns) == 3  # confidence, name/track_id, keypoints count
-    assert any("keypoints:" in ann.text for ann in text_anns)
-
-    # Test points annotations
-    points_anns = person.to_points_annotation()
-    print(f"\nPoints annotations: {len(points_anns)}")
-
-    # Count different types (use actual LCM constants)
-    from dimos_lcm.foxglove_msgs.ImageAnnotations import PointsAnnotation
-
-    bbox_count = sum(1 for ann in points_anns if ann.type == PointsAnnotation.LINE_LOOP)  # 2
-    keypoint_count = sum(1 for ann in points_anns if ann.type == PointsAnnotation.POINTS)  # 1
-    skeleton_count = sum(1 for ann in points_anns if ann.type == PointsAnnotation.LINE_LIST)  # 4
-
-    print(f"  - Bounding boxes: {bbox_count}")
-    print(f"  - Keypoint circles: {keypoint_count}")
-    print(f"  - Skeleton lines: {skeleton_count}")
-
-    assert bbox_count >= 1  # At least the person bbox
-    assert keypoint_count >= 1  # At least some visible keypoints
-    assert skeleton_count >= 1  # At least some skeleton connections
-
-    # Test full image annotations
-    img_anns = person.to_image_annotations()
-    assert img_anns.texts_length == len(text_anns)
-    assert img_anns.points_length == len(points_anns)
-
-    print(f"\n✓ Person annotations working correctly!")
-    print(f"  - {len(person.get_visible_keypoints(0.5))}/17 visible keypoints")
diff --git a/dimos/perception/detection/detectors/person/test_detection2d_conformance.py b/dimos/perception/detection/detectors/person/test_detection2d_conformance.py
deleted file mode 100644
index 300d5da5fd..0000000000
--- a/dimos/perception/detection/detectors/person/test_detection2d_conformance.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2025 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
-from dimos.utils.data import get_data
-
-
-def test_person_detection2d_bbox_conformance():
-    """Test that Detection2DPerson conforms to Detection2DBBox interface."""
-    image = Image.from_file(get_data("cafe.jpg"))
-    detector = YoloPersonDetector()
-    detections = detector.process_image(image)
-
-    assert len(detections.detections) > 0
-    person = detections.detections[0]
-
-    # Test Detection2DBBox methods
-    # Test bbox operations
-    assert hasattr(person, "bbox")
-    assert len(person.bbox) == 4
-    assert all(isinstance(x, float) for x in person.bbox)
-
-    # Test inherited properties
-    assert hasattr(person, "get_bbox_center")
-    center_bbox = person.get_bbox_center()
-    assert len(center_bbox) == 4  # center_x, center_y, width, height
-
-    # Test volume calculation
-    volume = person.bbox_2d_volume()
-    assert volume > 0
-
-    # Test cropped image
-    cropped = person.cropped_image(padding=10)
-    assert isinstance(cropped, Image)
-
-    # Test annotation methods
-    text_annotations = person.to_text_annotation()
-    assert len(text_annotations) == 3  # confidence, name/track_id, and keypoints count
-
-    points_annotations = person.to_points_annotation()
-    # Should have: 1 bbox + 1 keypoints + multiple skeleton lines
-    assert len(points_annotations) > 1
-    print(f"  - Points annotations: {len(points_annotations)} (bbox + keypoints + skeleton)")
-
-    # Test image annotations
-    annotations = person.to_image_annotations()
-    assert annotations.texts_length == 3
-    assert annotations.points_length > 1
-
-    # Test ROS conversion
-    ros_det = person.to_ros_detection2d()
-    assert ros_det.bbox.size_x == person.width
-    assert ros_det.bbox.size_y == person.height
-
-    # Test string representation
-    str_repr = str(person)
-    assert "Detection2DPerson" in str_repr
-    assert "person" in str_repr  # name field
-
-    print("\n✓ Detection2DPerson class fully conforms to Detection2DBBox interface")
-    print(f"  - Detected {len(detections.detections)} people")
-    print(f"  - First person confidence: {person.confidence:.3f}")
-    print(f"  - Bbox volume: {volume:.1f}")
-    print(f"  - Has {len(person.get_visible_keypoints(0.5))} visible keypoints")
diff --git a/dimos/perception/detection/detectors/person/test_imagedetections2d.py b/dimos/perception/detection/detectors/person/test_imagedetections2d.py
deleted file mode 100644
index ce595a244b..0000000000
--- a/dimos/perception/detection/detectors/person/test_imagedetections2d.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright 2025 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test ImageDetections2D with pose detections."""
-
-from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
-from dimos.perception.detection.type import ImageDetections2D
-from dimos.utils.data import get_data
-
-
-def test_image_detections_2d_with_person():
-    """Test creating ImageDetections2D from person detector."""
-    # Load image and detect people
-    image = Image.from_file(get_data("cafe.jpg"))
-    detector = YoloPersonDetector()
-    image_detections = detector.process_image(image)
-
-    # Verify structure
-    assert image_detections.image is image
-    assert len(image_detections.detections) > 0
-
-    # Test image annotations (includes pose keypoints)
-    annotations = image_detections.to_foxglove_annotations()
-    num_people = len(image_detections.detections)
-    print(f"\nImageDetections2D created with {num_people} people")
-    print(f"Total text annotations: {annotations.texts_length}")
-    print(f"Total points annotations: {annotations.points_length}")
-
-    # Points should include: bounding boxes + keypoints + skeleton lines
-    # At least 3 annotations per person (bbox, keypoints, skeleton)
-    assert annotations.points_length >= num_people * 3
-
-    # Text annotations should include confidence, name/id, and keypoint count
-    assert annotations.texts_length >= num_people * 3
-
-    print("\n✓ ImageDetections2D from person detector working correctly!")
diff --git a/dimos/perception/detection/detectors/person/test_yolo.py b/dimos/perception/detection/detectors/person/test_yolo.py
index 2c70dc1232..de0bbf34e8 100644
--- a/dimos/perception/detection/detectors/person/test_yolo.py
+++ b/dimos/perception/detection/detectors/person/test_yolo.py
@@ -14,27 +14,25 @@
 
 import pytest
 
-from dimos.perception.detection.type import Detection2DBBox, Detection2DPerson, ImageDetections2D
+from dimos.perception.detection.type import Detection2DPerson, ImageDetections2D
 
 
 @pytest.fixture()
 def people(person_detector, test_image):
-    """Get ImageDetections2D from person detector."""
     return person_detector.process_image(test_image)
 
 
 @pytest.fixture()
-def people_list(people, test_image):
-    """Get list of Detection2DPerson objects."""
-    return people.detections
+def person(people):
+    return people[0]
 
 
-def test_person_detection(people_list):
+def test_person_detection(people):
     """Test that we can detect people with pose keypoints."""
-    assert len(people_list) > 0
+    assert len(people) > 0
 
     # Check first person
-    person = people_list[0]
+    person = people[0]
     assert isinstance(person, Detection2DPerson)
     assert person.confidence > 0
     assert len(person.bbox) == 4  # bbox is now a tuple
@@ -42,9 +40,9 @@ def test_person_detection(people_list):
     assert person.keypoint_scores.shape == (17,)
 
 
-def test_person_properties(people_list):
+def test_person_properties(people):
     """Test Detection2DPerson object properties and methods."""
-    person = people_list[0]
+    person = people[0]
 
     # Test bounding box properties
     assert person.width > 0
@@ -64,9 +62,9 @@ def test_person_properties(people_list):
     assert all(0 <= conf <= 1 for _, _, conf in visible)
 
 
-def test_person_normalized_coords(people_list):
+def test_person_normalized_coords(people):
     """Test normalized coordinates if available."""
-    person = people_list[0]
+    person = people[0]
 
     if person.keypoints_normalized is not None:
         assert person.keypoints_normalized.shape == (17, 2)
@@ -80,11 +78,11 @@ def test_person_normalized_coords(people_list):
         assert (person.bbox_normalized <= 1).all()
 
 
-def test_multiple_people(people_list):
+def test_multiple_people(people):
     """Test that multiple people can be detected."""
-    print(f"\nDetected {len(people_list)} people in test image")
+    print(f"\nDetected {len(people)} people in test image")
 
-    for i, person in enumerate(people_list[:3]):  # Show first 3
+    for i, person in enumerate(people[:3]):  # Show first 3
         print(f"\nPerson {i}:")
         print(f"  Confidence: {person.confidence:.3f}")
         print(f"  Size: {person.width:.1f} x {person.height:.1f}")
@@ -123,3 +121,40 @@ def test_invalid_keypoint(test_image):
 
     with pytest.raises(ValueError):
         person.get_keypoint("invalid_keypoint")
+
+
+def test_person_annotations(person):
+    # Test text annotations
+    text_anns = person.to_text_annotation()
+    print(f"\nText annotations: {len(text_anns)}")
+    for i, ann in enumerate(text_anns):
+        print(f"  {i}: {ann.text}")
+    assert len(text_anns) == 3  # confidence, name/track_id, keypoints count
+    assert any("keypoints:" in ann.text for ann in text_anns)
+
+    # Test points annotations
+    points_anns = person.to_points_annotation()
+    print(f"\nPoints annotations: {len(points_anns)}")
+
+    # Count different types (use actual LCM constants)
+    from dimos_lcm.foxglove_msgs.ImageAnnotations import PointsAnnotation
+
+    bbox_count = sum(1 for ann in points_anns if ann.type == PointsAnnotation.LINE_LOOP)  # 2
+    keypoint_count = sum(1 for ann in points_anns if ann.type == PointsAnnotation.POINTS)  # 1
+    skeleton_count = sum(1 for ann in points_anns if ann.type == PointsAnnotation.LINE_LIST)  # 4
+
+    print(f"  - Bounding boxes: {bbox_count}")
+    print(f"  - Keypoint circles: {keypoint_count}")
+    print(f"  - Skeleton lines: {skeleton_count}")
+
+    assert bbox_count >= 1  # At least the person bbox
+    assert keypoint_count >= 1  # At least some visible keypoints
+    assert skeleton_count >= 1  # At least some skeleton connections
+
+    # Test full image annotations
+    img_anns = person.to_image_annotations()
+    assert img_anns.texts_length == len(text_anns)
+    assert img_anns.points_length == len(points_anns)
+
+    print(f"\n✓ Person annotations working correctly!")
+    print(f"  - {len(person.get_visible_keypoints(0.5))}/17 visible keypoints")
diff --git a/dimos/perception/detection/detectors/test_yolo.py b/dimos/perception/detection/detectors/test_yolo.py
index 27cfb8cb9d..733c3c9c80 100644
--- a/dimos/perception/detection/detectors/test_yolo.py
+++ b/dimos/perception/detection/detectors/test_yolo.py
@@ -23,19 +23,12 @@ def bboxes(bbox_detector, test_image):
     return bbox_detector.process_image(test_image)
 
 
-@pytest.fixture()
-def bbox_list(bbox_detector, test_image):
-    """Get list of Detection2DBBox objects."""
-    detections = bbox_detector.process_image(test_image)
-    return detections.detections
-
-
-def test_bbox_detection(bbox_list):
+def test_bbox_detection(bboxes):
     """Test that we can detect objects with bounding boxes."""
-    assert len(bbox_list) > 0
+    assert len(bboxes) > 0
 
     # Check first detection
-    detection = bbox_list[0]
+    detection = bboxes[0]
     assert isinstance(detection, Detection2DBBox)
     assert detection.confidence > 0
     assert len(detection.bbox) == 4  # bbox is a tuple (x1, y1, x2, y2)
@@ -43,9 +36,9 @@ def test_bbox_detection(bbox_list):
     assert detection.name is not None
 
 
-def test_bbox_properties(bbox_list):
+def test_bbox_properties(bboxes):
     """Test Detection2DBBox object properties and methods."""
-    detection = bbox_list[0]
+    detection = bboxes[0]
 
     # Test bounding box is valid
     x1, y1, x2, y2 = detection.bbox
@@ -67,9 +60,9 @@ def test_bbox_properties(bbox_list):
     assert height == y2 - y1
 
 
-def test_bbox_cropped_image(bbox_list, test_image):
+def test_bbox_cropped_image(bboxes, test_image):
     """Test cropping image to detection bbox."""
-    detection = bbox_list[0]
+    detection = bboxes[0]
 
     # Test cropped image
     cropped = detection.cropped_image(padding=20)
@@ -81,9 +74,9 @@ def test_bbox_cropped_image(bbox_list, test_image):
         assert cropped.shape[1] <= test_image.shape[1]
 
 
-def test_bbox_annotations(bbox_list):
+def test_bbox_annotations(bboxes):
     """Test annotation generation for bboxes."""
-    detection = bbox_list[0]
+    detection = bboxes[0]
 
     # Test text annotations
     text_annotations = detection.to_text_annotation()
@@ -99,9 +92,9 @@ def test_bbox_annotations(bbox_list):
     assert annotations.points_length == 1
 
 
-def test_bbox_ros_conversion(bbox_list):
+def test_bbox_ros_conversion(bboxes):
     """Test conversion to ROS Detection2D message."""
-    detection = bbox_list[0]
+    detection = bboxes[0]
 
     ros_det = detection.to_ros_detection2d()
 
@@ -118,9 +111,9 @@ def test_bbox_ros_conversion(bbox_list):
     assert ros_det.results[0].hypothesis.class_id == detection.class_id
 
 
-def test_bbox_is_valid(bbox_list):
+def test_bbox_is_valid(bboxes):
     """Test bbox validation."""
-    detection = bbox_list[0]
+    detection = bboxes[0]
 
     # Detection from real detector should be valid
     assert detection.is_valid()
@@ -147,9 +140,9 @@ def test_multiple_detections(bboxes):
         print(f"  Track ID: {detection.track_id}")
 
 
-def test_detection_string_representation(bbox_list):
+def test_detection_string_representation(bboxes):
     """Test string representation of detections."""
-    detection = bbox_list[0]
+    detection = bboxes[0]
     str_repr = str(detection)
 
     # Should contain class name

From b68619853be45726119becdd7df8d8b028e769d5 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 17:00:51 -0700
Subject: [PATCH 07/47] detector grid testing

---
 ...{test_yolo.py => test_person_detectors.py} |  0
 .../{test_yolo.py => test_bbox_detectors.py}  | 82 ++++++++++---------
 2 files changed, 44 insertions(+), 38 deletions(-)
 rename dimos/perception/detection/detectors/person/{test_yolo.py => test_person_detectors.py} (100%)
 rename dimos/perception/detection/detectors/{test_yolo.py => test_bbox_detectors.py} (61%)

diff --git a/dimos/perception/detection/detectors/person/test_yolo.py b/dimos/perception/detection/detectors/person/test_person_detectors.py
similarity index 100%
rename from dimos/perception/detection/detectors/person/test_yolo.py
rename to dimos/perception/detection/detectors/person/test_person_detectors.py
diff --git a/dimos/perception/detection/detectors/test_yolo.py b/dimos/perception/detection/detectors/test_bbox_detectors.py
similarity index 61%
rename from dimos/perception/detection/detectors/test_yolo.py
rename to dimos/perception/detection/detectors/test_bbox_detectors.py
index 733c3c9c80..193238217e 100644
--- a/dimos/perception/detection/detectors/test_yolo.py
+++ b/dimos/perception/detection/detectors/test_bbox_detectors.py
@@ -14,31 +14,37 @@
 
 import pytest
 
-from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
+from dimos.perception.detection.type import Detection2D, ImageDetections2D
+
+
+@pytest.fixture(params=["bbox_detector", "person_detector"])
+def detector(request):
+    """Parametrized fixture that provides both bbox and person detectors."""
+    return request.getfixturevalue(request.param)
 
 
 @pytest.fixture()
-def bboxes(bbox_detector, test_image):
-    """Get ImageDetections2D from bbox detector."""
-    return bbox_detector.process_image(test_image)
+def detections(detector, test_image):
+    """Get ImageDetections2D from any detector."""
+    return detector.process_image(test_image)
 
 
-def test_bbox_detection(bboxes):
-    """Test that we can detect objects with bounding boxes."""
-    assert len(bboxes) > 0
+def test_detection_basic(detections):
+    """Test that we can detect objects with all detectors."""
+    assert len(detections.detections) > 0
 
     # Check first detection
-    detection = bboxes[0]
-    assert isinstance(detection, Detection2DBBox)
+    detection = detections.detections[0]
+    assert isinstance(detection, Detection2D)
     assert detection.confidence > 0
     assert len(detection.bbox) == 4  # bbox is a tuple (x1, y1, x2, y2)
     assert detection.class_id >= 0
     assert detection.name is not None
 
 
-def test_bbox_properties(bboxes):
-    """Test Detection2DBBox object properties and methods."""
-    detection = bboxes[0]
+def test_detection_bbox_properties(detections):
+    """Test Detection2D bbox properties work for all detectors."""
+    detection = detections.detections[0]
 
     # Test bounding box is valid
     x1, y1, x2, y2 = detection.bbox
@@ -60,9 +66,9 @@ def test_bbox_properties(bboxes):
     assert height == y2 - y1
 
 
-def test_bbox_cropped_image(bboxes, test_image):
+def test_detection_cropped_image(detections, test_image):
     """Test cropping image to detection bbox."""
-    detection = bboxes[0]
+    detection = detections.detections[0]
 
     # Test cropped image
     cropped = detection.cropped_image(padding=20)
@@ -74,27 +80,27 @@ def test_bbox_cropped_image(bboxes, test_image):
         assert cropped.shape[1] <= test_image.shape[1]
 
 
-def test_bbox_annotations(bboxes):
-    """Test annotation generation for bboxes."""
-    detection = bboxes[0]
+def test_detection_annotations(detections):
+    """Test annotation generation for detections."""
+    detection = detections.detections[0]
 
-    # Test text annotations
+    # Test text annotations - all detections should have at least 2
     text_annotations = detection.to_text_annotation()
-    assert len(text_annotations) == 2  # confidence and name/track_id
+    assert len(text_annotations) >= 2  # confidence and name/track_id (person has keypoints too)
 
-    # Test points annotations (bounding box)
+    # Test points annotations - at least bbox
     points_annotations = detection.to_points_annotation()
-    assert len(points_annotations) == 1  # Just the bbox polygon
+    assert len(points_annotations) >= 1  # At least the bbox polygon
 
     # Test image annotations
     annotations = detection.to_image_annotations()
-    assert annotations.texts_length == 2
-    assert annotations.points_length == 1
+    assert annotations.texts_length >= 2
+    assert annotations.points_length >= 1
 
 
-def test_bbox_ros_conversion(bboxes):
+def test_detection_ros_conversion(detections):
     """Test conversion to ROS Detection2D message."""
-    detection = bboxes[0]
+    detection = detections.detections[0]
 
     ros_det = detection.to_ros_detection2d()
 
@@ -111,26 +117,26 @@ def test_bbox_ros_conversion(bboxes):
     assert ros_det.results[0].hypothesis.class_id == detection.class_id
 
 
-def test_bbox_is_valid(bboxes):
+def test_detection_is_valid(detections):
     """Test bbox validation."""
-    detection = bboxes[0]
+    detection = detections.detections[0]
 
     # Detection from real detector should be valid
     assert detection.is_valid()
 
 
-def test_image_detections2d_structure(bboxes):
+def test_image_detections2d_structure(detections):
     """Test that process_image returns ImageDetections2D."""
-    assert isinstance(bboxes, ImageDetections2D)
-    assert len(bboxes.detections) > 0
-    assert all(isinstance(d, Detection2DBBox) for d in bboxes.detections)
+    assert isinstance(detections, ImageDetections2D)
+    assert len(detections.detections) > 0
+    assert all(isinstance(d, Detection2D) for d in detections.detections)
 
 
-def test_multiple_detections(bboxes):
+def test_multiple_detections(detections):
     """Test that multiple objects can be detected."""
-    print(f"\nDetected {len(bboxes.detections)} objects in test image")
+    print(f"\nDetected {len(detections.detections)} objects in test image")
 
-    for i, detection in enumerate(bboxes.detections[:5]):  # Show first 5
+    for i, detection in enumerate(detections.detections[:5]):  # Show first 5
         print(f"\nDetection {i}:")
         print(f"  Class: {detection.name} (id: {detection.class_id})")
         print(f"  Confidence: {detection.confidence:.3f}")
@@ -140,13 +146,13 @@ def test_multiple_detections(bboxes):
         print(f"  Track ID: {detection.track_id}")
 
 
-def test_detection_string_representation(bboxes):
+def test_detection_string_representation(detections):
     """Test string representation of detections."""
-    detection = bboxes[0]
+    detection = detections.detections[0]
     str_repr = str(detection)
 
-    # Should contain class name
-    assert "Detection2DBBox" in str_repr
+    # Should contain class name (either Detection2DBBox or Detection2DPerson)
+    assert "Detection2D" in str_repr
 
     # Should show object name
     assert detection.name in str_repr or f"class_{detection.class_id}" in str_repr

From 75a4abfff98f66b932ed699e63f29e62f63c821c Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 17:02:23 -0700
Subject: [PATCH 08/47] yolo person detector cuda

---
 .../detection/detectors/person/yolo.py        | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/dimos/perception/detection/detectors/person/yolo.py b/dimos/perception/detection/detectors/person/yolo.py
index a5bd211210..a4e764878c 100644
--- a/dimos/perception/detection/detectors/person/yolo.py
+++ b/dimos/perception/detection/detectors/person/yolo.py
@@ -12,21 +12,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import onnxruntime
 from ultralytics import YOLO
 
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.detectors.types import Detector
 from dimos.perception.detection.type import ImageDetections2D
 from dimos.utils.data import get_data
+from dimos.utils.gpu_utils import is_cuda_available
 from dimos.utils.logging_config import setup_logger
 
 logger = setup_logger("dimos.perception.detection.yolo.person")
 
 
 class YoloPersonDetector(Detector):
-    def __init__(self, model_path="models_yolo", model_name="yolo11n-pose.pt"):
+    def __init__(self, model_path="models_yolo", model_name="yolo11n-pose.pt", device="cpu"):
+        """Initialize the YOLO person detector.
+
+        Args:
+            model_path (str): Path to the YOLO model weights in tests/data LFS directory
+            model_name (str): Name of the YOLO model weights file
+            device (str): Device to run inference on ('cuda' or 'cpu')
+        """
+        self.device = device
         self.model = YOLO(get_data(model_path) / model_name, task="pose")
 
+        if is_cuda_available():
+            if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
+                onnxruntime.preload_dlls(cuda=True, cudnn=True)
+            self.device = "cuda"
+            logger.debug("Using CUDA for YOLO person detector")
+        else:
+            self.device = "cpu"
+            logger.debug("Using CPU for YOLO person detector")
+
     def process_image(self, image: Image) -> ImageDetections2D:
         """Process image and return detection results.
 
@@ -36,5 +55,5 @@ def process_image(self, image: Image) -> ImageDetections2D:
         Returns:
             ImageDetections2D containing Detection2DPerson objects with pose keypoints
         """
-        results = self.model(source=image.to_opencv())
+        results = self.model(source=image.to_opencv(), device=self.device)
         return ImageDetections2D.from_ultralytics_result(image, results)

From b6be8806ba72b14a9d0d669d94b25a214dbcdc50 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 18:23:32 -0700
Subject: [PATCH 09/47] vlm sketch

---
 dimos/agents2/temp/webcam_agent.py            |  8 -----
 dimos/perception/detection/module2D.py        |  2 +-
 dimos/perception/detection/module3D.py        | 31 +++++++++++++------
 .../unitree_webrtc/modular/ivan_unitree.py    |  4 +--
 4 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/dimos/agents2/temp/webcam_agent.py b/dimos/agents2/temp/webcam_agent.py
index fed01ed96f..deb5cce3e4 100644
--- a/dimos/agents2/temp/webcam_agent.py
+++ b/dimos/agents2/temp/webcam_agent.py
@@ -18,16 +18,11 @@
 This is the migrated version using the new LangChain-based agent system.
 """
 
-import asyncio  # Needed for event loop management in setup_agent
-import os
-import sys
 import time
-from pathlib import Path
 from threading import Thread
 
 import reactivex as rx
 import reactivex.operators as ops
-from dotenv import load_dotenv
 
 from dimos.agents2 import Agent, Output, Reducer, Stream, skill
 from dimos.agents2.cli.human import HumanInput
@@ -41,9 +36,6 @@
 # from dimos.hardware.webcam import ColorCameraModule, Webcam
 from dimos.msgs.sensor_msgs import CameraInfo, Image
 from dimos.protocol.skill.test_coordinator import SkillContainerTest
-from dimos.robot.unitree_webrtc.unitree_go2 import UnitreeGo2
-from dimos.robot.unitree_webrtc.unitree_skill_container import UnitreeSkillContainer
-from dimos.utils.logging_config import setup_logger
 from dimos.web.robot_web_interface import RobotWebInterface
 
 
diff --git a/dimos/perception/detection/module2D.py b/dimos/perception/detection/module2D.py
index 1977362bae..2b1263bb4a 100644
--- a/dimos/perception/detection/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -37,7 +37,7 @@
 @dataclass
 class Config:
     max_freq: float = 10  # hz
-    detector: Optional[Callable[[Any], Detector]] = lambda: Yolo2DDetector()
+    detector: Optional[Callable[[Any], Detector]] = lambda: YoloPersonDetector()
 
 
 class Detection2DModule(Module):
diff --git a/dimos/perception/detection/module3D.py b/dimos/perception/detection/module3D.py
index a94c73046c..91d64cde8e 100644
--- a/dimos/perception/detection/module3D.py
+++ b/dimos/perception/detection/module3D.py
@@ -17,12 +17,12 @@
 from reactivex import operators as ops
 from reactivex.observable import Observable
 
+from dimos.agents2 import skill
 from dimos.core import In, Out, rpc
 from dimos.msgs.geometry_msgs import Transform
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.perception.detection.module2D import Detection2DModule
 from dimos.perception.detection.type import (
-    Detection2D,
     ImageDetections2D,
     ImageDetections3D,
     ImageDetections3DPC,
@@ -77,7 +77,27 @@ def process_frame(
 
         return ImageDetections3D(detections.image, detection3d_list)
 
-    def process_detection(self, detections: ImageDetections2D) -> ImageDetections3DPC: ...
+    @skill
+    def ask_vlm(self, question: str):
+        """
+        query visual model about the view in front of the camera
+        you can ask to mark objects like:
+
+        "red cup on the table left of the pencil"
+        "laptop on the desk"
+        "a person wearing a red shirt"
+        """
+        from dimos.models.vl.qwen import QwenVLModel
+
+        model = QwenVLModel()
+        detections: ImageDetections2D = model.query(self.image.get_next(), question)
+
+        if not detections or not len(detections):
+            return "No detections"
+
+        pc = self.pointcloud.get_next()
+        transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0)
+        return self.process_frame(detections, pc, transform)
 
     @rpc
     def start(self):
@@ -88,7 +108,6 @@ def detection2d_to_3d(args):
             transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0)
             return self.process_frame(detections, pc, transform)
 
-        # does align message timestamps
         self.detection_stream_3d = align_timestamped(
             backpressure(self.detection_stream_2d()),
             self.pointcloud.observable(),
@@ -96,12 +115,6 @@ def detection2d_to_3d(args):
             buffer_size=20.0,
         ).pipe(ops.map(detection2d_to_3d))
 
-        # doesn't align message timestamps
-        #
-        # self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
-        #    ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
-        # )
-
         self.detection_stream_3d.subscribe(self._publish_detections)
 
     def _publish_detections(self, detections: ImageDetections3D):
diff --git a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
index 73927cf248..e892ad35dc 100644
--- a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
+++ b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
@@ -15,7 +15,6 @@
 import logging
 import time
 
-from dimos_lcm.sensor_msgs import CameraInfo
 from lcm_msgs.foxglove_msgs import SceneUpdate
 
 from dimos.agents2.spec import Model, Provider
@@ -25,8 +24,7 @@
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection2d import Detection3DModule
-from dimos.perception.detection2d.moduleDB import ObjectDBModule
+from dimos.perception.detection.moduleDB import ObjectDBModule
 from dimos.protocol.pubsub import lcm
 from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation
 from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule

From 8056b249c5f4477a5f5103d363472f5d3b1e90f4 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 19:57:38 -0700
Subject: [PATCH 10/47] completely removed dep on old detection format

---
 dimos/models/vl/base.py                       | 53 +++++++++++----
 dimos/perception/detection/type/__init__.py   |  1 -
 .../perception/detection/type/detection2d.py  | 65 +++----------------
 dimos/protocol/service/lcmservice.py          |  2 +-
 dimos/robot/unitree_webrtc/modular/detect.py  |  4 +-
 dimos/robot/unitree_webrtc/unitree_g1.py      |  5 +-
 6 files changed, 52 insertions(+), 78 deletions(-)

diff --git a/dimos/models/vl/base.py b/dimos/models/vl/base.py
index a46611b206..f5e7a335e5 100644
--- a/dimos/models/vl/base.py
+++ b/dimos/models/vl/base.py
@@ -1,35 +1,63 @@
 import json
+import logging
 from abc import ABC, abstractmethod
 
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
-from dimos.perception.detection.type.detection2d import Detection
 from dimos.utils.decorators import retry
 from dimos.utils.llm_utils import extract_json
 
+logger = logging.getLogger(__name__)
 
-def vlm_detection_to_yolo(vlm_detection: list, track_id: int) -> Detection | None:
-    """Convert a single VLM detection [label, x1, y1, x2, y2] to Detection tuple.
+
+def vlm_detection_to_detection2d(
+    vlm_detection: list, track_id: int, image: Image
+) -> Detection2DBBox | None:
+    """Convert a single VLM detection [label, x1, y1, x2, y2] to Detection2DBBox.
 
     Args:
         vlm_detection: Single detection list containing [label, x1, y1, x2, y2]
         track_id: Track ID to assign to this detection
+        image: Source image for the detection
 
     Returns:
-        Detection tuple (bbox, track_id, class_id, confidence, name) or None if invalid
+        Detection2DBBox instance or None if invalid
     """
+    # Validate list structure
+    if not isinstance(vlm_detection, list):
+        logger.debug(f"VLM detection is not a list: {type(vlm_detection)}")
+        return None
+
     if len(vlm_detection) != 5:
+        logger.debug(
+            f"Invalid VLM detection length: {len(vlm_detection)}, expected 5. Got: {vlm_detection}"
+        )
         return None
 
+    # Extract label
     name = str(vlm_detection[0])
+
+    # Validate and convert coordinates
     try:
-        bbox = tuple(map(float, vlm_detection[1:]))
-        # Use -1 for class_id since VLM doesn't provide it
-        # confidence defaults to 1.0 for VLM
-        return (bbox, track_id, -1, 1.0, name)
-    except (ValueError, TypeError):
+        coords = [float(x) for x in vlm_detection[1:]]
+    except (ValueError, TypeError) as e:
+        logger.debug(f"Invalid VLM detection coordinates: {vlm_detection[1:]}. Error: {e}")
         return None
 
+    bbox = tuple(coords)
+
+    # Use -1 for class_id since VLM doesn't provide it
+    # confidence defaults to 1.0 for VLM
+    return Detection2DBBox(
+        bbox=bbox,
+        track_id=track_id,
+        class_id=-1,
+        confidence=1.0,
+        name=name,
+        ts=image.ts,
+        image=image,
+    )
+
 
 class VlModel(ABC):
     @abstractmethod
@@ -63,11 +91,8 @@ def query_detections(self, image: Image, query: str) -> ImageDetections2D:
             return image_detections
 
         for track_id, detection_tuple in enumerate(detection_tuples):
-            detection = vlm_detection_to_yolo(detection_tuple, track_id)
-            if detection is None:
-                continue
-            detection2d = Detection2DBBox.from_detection(detection, ts=image.ts, image=image)
-            if detection2d.is_valid():
+            detection2d = vlm_detection_to_detection2d(detection_tuple, track_id, image)
+            if detection2d is not None and detection2d.is_valid():
                 image_detections.detections.append(detection2d)
 
         return image_detections
diff --git a/dimos/perception/detection/type/__init__.py b/dimos/perception/detection/type/__init__.py
index 74ab3dacab..c368fcac0f 100644
--- a/dimos/perception/detection/type/__init__.py
+++ b/dimos/perception/detection/type/__init__.py
@@ -2,7 +2,6 @@
     Detection2D,
     Detection2DBBox,
     ImageDetections2D,
-    InconvinientDetectionFormat,
 )
 from dimos.perception.detection.type.detection3d import (
     Detection3D,
diff --git a/dimos/perception/detection/type/detection2d.py b/dimos/perception/detection/type/detection2d.py
index e097728992..b4b1d149ed 100644
--- a/dimos/perception/detection/type/detection2d.py
+++ b/dimos/perception/detection/type/detection2d.py
@@ -15,10 +15,13 @@
 from __future__ import annotations
 
 import hashlib
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple
 
+if TYPE_CHECKING:
+    from dimos.perception.detection.type.person import Detection2DPerson
+
 from dimos_lcm.foxglove_msgs.ImageAnnotations import (
     PointsAnnotation,
     TextAnnotation,
@@ -36,7 +39,7 @@
 )
 from rich.console import Console
 from rich.text import Text
-from ultralytics.engine.results import Boxes, Keypoints, Results
+from ultralytics.engine.results import Results
 
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.foxglove_msgs.Color import Color
@@ -46,18 +49,9 @@
 from dimos.types.timestamped import Timestamped, to_ros_stamp, to_timestamp
 from dimos.utils.decorators.decorators import simple_mcache
 
-if TYPE_CHECKING:
-    from dimos.perception.detection.type.person import Detection2DPerson
-
 Bbox = Tuple[float, float, float, float]
 CenteredBbox = Tuple[float, float, float, float]
 
-# yolo and detic have bad output formats
-InconvinientDetectionFormat = Tuple[List[Bbox], List[int], List[int], List[float], List[str]]
-
-Detection = Tuple[Bbox, int, int, float, str]
-Detections = List[Detection]
-
 
 def _hash_to_color(name: str) -> str:
     """Generate a consistent color for a given name using hash."""
@@ -85,17 +79,6 @@ def _hash_to_color(name: str) -> str:
     return colors[hash_value % len(colors)]
 
 
-# yolo and detic have bad formats this translates into list of detections
-def better_detection_format(inconvinient_detections: InconvinientDetectionFormat) -> Detections:
-    bboxes, track_ids, class_ids, confidences, names = inconvinient_detections
-    return [
-        (bbox, track_id, class_id, confidence, name if name else "")
-        for bbox, track_id, class_id, confidence, name in zip(
-            bboxes, track_ids, class_ids, confidences, names
-        )
-    ]
-
-
 class Detection2D(Timestamped):
     @abstractmethod
     def cropped_image(self, padding: int = 20) -> Image: ...
@@ -195,27 +178,6 @@ def is_valid(self) -> bool:
 
         return True
 
-    @classmethod
-    def from_detector(
-        cls, raw_detections: InconvinientDetectionFormat, **kwargs
-    ) -> List["Detection2D"]:
-        return [
-            cls.from_detection(raw, **kwargs) for raw in better_detection_format(raw_detections)
-        ]
-
-    @classmethod
-    def from_detection(cls, raw_detection: Detection, **kwargs) -> "Detection2D":
-        bbox, track_id, class_id, confidence, name = raw_detection
-
-        return cls(
-            bbox=bbox,
-            track_id=track_id,
-            class_id=class_id,
-            confidence=confidence,
-            name=name,
-            **kwargs,
-        )
-
     @classmethod
     def from_ultralytics_result(cls, result: Results, idx: int, image: Image) -> "Detection2DBBox":
         """Create Detection2DBBox from ultralytics Results object.
@@ -443,27 +405,18 @@ def from_ultralytics_result(
 
         return cls(image=image, detections=detections)
 
-    @classmethod
-    def from_bbox_detector(
-        cls, image: Image, raw_detections: InconvinientDetectionFormat, **kwargs
-    ) -> "ImageDetections2D":
-        return cls(
-            image=image,
-            detections=Detection2DBBox.from_detector(raw_detections, image=image, ts=image.ts),
-        )
-
     @classmethod
     def from_pose_detector(
-        cls, image: Image, people: List["Person"], **kwargs
+        cls, image: Image, people: List["Detection2DPerson"], **kwargs
     ) -> "ImageDetections2D":
-        """Create ImageDetections2D from a list of Person detections.
+        """Create ImageDetections2D from a list of Detection2DPerson detections.
         Args:
             image: Source image
-            people: List of Person objects with pose keypoints
+            people: List of Detection2DPerson objects with pose keypoints
         Returns:
             ImageDetections2D containing the pose detections
         """
         return cls(
             image=image,
-            detections=people,  # Person objects are already Detection2D subclasses
+            detections=people,  # Detection2DPerson objects are already Detection2D subclasses
         )
diff --git a/dimos/protocol/service/lcmservice.py b/dimos/protocol/service/lcmservice.py
index bc3f7317b7..2228a671fc 100644
--- a/dimos/protocol/service/lcmservice.py
+++ b/dimos/protocol/service/lcmservice.py
@@ -21,7 +21,7 @@
 import traceback
 from dataclasses import dataclass
 from functools import cache
-from typing import Any, Callable, Optional, Protocol, runtime_checkable
+from typing import Optional, Protocol, runtime_checkable
 
 import lcm
 
diff --git a/dimos/robot/unitree_webrtc/modular/detect.py b/dimos/robot/unitree_webrtc/modular/detect.py
index 7d0ded7ac8..3f6c2c04b2 100644
--- a/dimos/robot/unitree_webrtc/modular/detect.py
+++ b/dimos/robot/unitree_webrtc/modular/detect.py
@@ -135,7 +135,7 @@ def broadcast(
 
 def process_data():
     from dimos.msgs.sensor_msgs import Image
-    from dimos.perception.detection2d.module import Detect2DModule, build_imageannotations
+    from dimos.perception.detection.module2D import Detection2DModule, build_imageannotations
     from dimos.robot.unitree_webrtc.type.lidar import LidarMessage
     from dimos.robot.unitree_webrtc.type.odometry import Odometry
     from dimos.utils.data import get_data
@@ -155,7 +155,7 @@ def attach_frame_id(image: Image) -> Image:
     video_frame = attach_frame_id(video_store.find_closest(target, tolerance=1))
     odom_frame = odom_store.find_closest(target, tolerance=1)
 
-    detector = Detect2DModule()
+    detector = Detection2DModule()
     detections = detector.detect(video_frame)
     annotations = build_imageannotations(detections)
 
diff --git a/dimos/robot/unitree_webrtc/unitree_g1.py b/dimos/robot/unitree_webrtc/unitree_g1.py
index 08a23bc2dc..a57323896d 100644
--- a/dimos/robot/unitree_webrtc/unitree_g1.py
+++ b/dimos/robot/unitree_webrtc/unitree_g1.py
@@ -27,7 +27,6 @@
 from geometry_msgs.msg import PoseStamped as ROSPoseStamped
 from geometry_msgs.msg import TwistStamped as ROSTwistStamped
 from nav_msgs.msg import Odometry as ROSOdometry
-from sensor_msgs.msg import Image as ROSImage
 from sensor_msgs.msg import Joy as ROSJoy
 from sensor_msgs.msg import PointCloud2 as ROSPointCloud2
 from tf2_msgs.msg import TFMessage as ROSTFMessage
@@ -55,8 +54,7 @@
 from dimos.msgs.std_msgs.Bool import Bool
 from dimos.msgs.tf2_msgs.TFMessage import TFMessage
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection2d import Detection3DModule
-from dimos.perception.detection2d.moduleDB import ObjectDBModule
+from dimos.perception.detection.moduleDB import ObjectDBModule
 from dimos.perception.spatial_perception import SpatialMemory
 from dimos.protocol import pubsub
 from dimos.protocol.pubsub.lcmpubsub import LCM
@@ -410,7 +408,6 @@ def _deploy_ros_bridge(self):
             "/tf", TFMessage, ROSTFMessage, direction=BridgeDirection.ROS_TO_DIMOS
         )
 
-        from geometry_msgs.msg import PoseStamped as ROSPoseStamped
         from std_msgs.msg import Bool as ROSBool
 
         from dimos.msgs.std_msgs import Bool

From e25689c98d2db653b13ed2a2a1fa9fd237155d71 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 20:06:36 -0700
Subject: [PATCH 11/47] tests fix, module config fix

---
 dimos/perception/detection/conftest.py | 8 ++++++--
 dimos/perception/detection/module2D.py | 8 +++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index 1f3bd55486..8a30334ced 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -162,7 +162,9 @@ def detection3dpc(get_moment_3dpc) -> Detection3DPC:
 
 @pytest.fixture
 def get_moment_2d(get_moment) -> Callable[[], Moment2D]:
-    module = Detection2DModule()
+    from dimos.perception.detection.detectors import Yolo2DDetector
+
+    module = Detection2DModule(detector=Yolo2DDetector)
 
     def moment_provider(**kwargs) -> Moment2D:
         moment = get_moment(**kwargs)
@@ -206,7 +208,9 @@ def moment_provider(**kwargs) -> Moment2D:
 @pytest.fixture
 def object_db_module(get_moment):
     """Create and populate an ObjectDBModule with detections from multiple frames."""
-    module2d = Detection2DModule()
+    from dimos.perception.detection.detectors import Yolo2DDetector
+
+    module2d = Detection2DModule(detector=Yolo2DDetector)
     module3d = Detection3DModule(camera_info=ConnectionModule._camera_info())
     moduleDB = ObjectDBModule(
         camera_info=ConnectionModule._camera_info(),
diff --git a/dimos/perception/detection/module2D.py b/dimos/perception/detection/module2D.py
index 2b1263bb4a..50c3010d4b 100644
--- a/dimos/perception/detection/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -22,10 +22,11 @@
 from reactivex.subject import Subject
 
 from dimos.core import In, Module, Out, rpc
+from dimos.core.module import ModuleConfig
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.sensor_msgs.Image import sharpness_barrier
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection.detectors import Detector, Yolo2DDetector
+from dimos.perception.detection.detectors import Detector
 from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
 from dimos.perception.detection.type import (
     ImageDetections2D,
@@ -35,12 +36,13 @@
 
 
 @dataclass
-class Config:
+class Config(ModuleConfig):
     max_freq: float = 10  # hz
-    detector: Optional[Callable[[Any], Detector]] = lambda: YoloPersonDetector()
+    detector: Optional[Callable[[Any], Detector]] = YoloPersonDetector
 
 
 class Detection2DModule(Module):
+    default_config = Config
     config: Config
     detector: Detector
 

From ea238d8a4490a37027c197572c1d3398fd3ca5c2 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 20:36:10 -0700
Subject: [PATCH 12/47] detection3d split into bbox and pc

---
 dimos/perception/detection/conftest.py        |   3 +-
 dimos/perception/detection/module3D.py        |   9 +-
 dimos/perception/detection/moduleDB.py        |  25 +--
 dimos/perception/detection/type/__init__.py   |   2 +-
 .../perception/detection/type/detection2d.py  |  15 +-
 .../perception/detection/type/detection3d.py  | 192 +++--------------
 .../detection/type/detection3dpc.py           | 195 ++++++++++++++++--
 .../detection/type/test_object3d.py           |   6 +-
 8 files changed, 236 insertions(+), 211 deletions(-)

diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index 8a30334ced..e902f88b6a 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -31,7 +31,6 @@
     Detection3D,
     Detection3DPC,
     ImageDetections2D,
-    ImageDetections3D,
     ImageDetections3DPC,
 )
 from dimos.protocol.tf import TF
@@ -60,7 +59,7 @@ class Moment2D(Moment):
 
 
 class Moment3D(Moment):
-    detections3dpc: ImageDetections3D
+    detections3dpc: ImageDetections3DPC
 
 
 @pytest.fixture
diff --git a/dimos/perception/detection/module3D.py b/dimos/perception/detection/module3D.py
index 91d64cde8e..ce0c19af89 100644
--- a/dimos/perception/detection/module3D.py
+++ b/dimos/perception/detection/module3D.py
@@ -24,7 +24,6 @@
 from dimos.perception.detection.module2D import Detection2DModule
 from dimos.perception.detection.type import (
     ImageDetections2D,
-    ImageDetections3D,
     ImageDetections3DPC,
 )
 from dimos.perception.detection.type.detection3dpc import Detection3DPC
@@ -60,9 +59,9 @@ def process_frame(
         detections: ImageDetections2D,
         pointcloud: PointCloud2,
         transform: Transform,
-    ) -> ImageDetections3D:
+    ) -> ImageDetections3DPC:
         if not transform:
-            return ImageDetections3D(detections.image, [])
+            return ImageDetections3DPC(detections.image, [])
 
         detection3d_list = []
         for detection in detections:
@@ -75,7 +74,7 @@ def process_frame(
             if detection3d is not None:
                 detection3d_list.append(detection3d)
 
-        return ImageDetections3D(detections.image, detection3d_list)
+        return ImageDetections3DPC(detections.image, detection3d_list)
 
     @skill
     def ask_vlm(self, question: str):
@@ -117,7 +116,7 @@ def detection2d_to_3d(args):
 
         self.detection_stream_3d.subscribe(self._publish_detections)
 
-    def _publish_detections(self, detections: ImageDetections3D):
+    def _publish_detections(self, detections: ImageDetections3DPC):
         if not detections:
             return
 
diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py
index 56203b2f5c..4a274f0e26 100644
--- a/dimos/perception/detection/moduleDB.py
+++ b/dimos/perception/detection/moduleDB.py
@@ -26,15 +26,16 @@
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection.module3D import Detection3DModule
-from dimos.perception.detection.type import Detection3D, ImageDetections3D, TableStr
+from dimos.perception.detection.type import Detection3D, ImageDetections3DPC, TableStr
+from dimos.perception.detection.type.detection3dpc import Detection3DPC
 from dimos.protocol.skill.skill import skill
 from dimos.protocol.skill.type import Output, Reducer, Stream
 from dimos.types.timestamped import to_datetime
 
 
 # Represents an object in space, as collection of 3d detections over time
-class Object3D(Detection3D):
-    best_detection: Detection3D = None
+class Object3D(Detection3DPC):
+    best_detection: Detection3DPC = None
     center: Vector3 = None
     track_id: str = None
     detections: int = 0
@@ -46,7 +47,7 @@ def to_repr_dict(self) -> Dict[str, Any]:
             "center": "[" + ", ".join(list(map(lambda n: f"{n:1f}", self.center.to_list()))) + "]",
         }
 
-    def __init__(self, track_id: str, detection: Optional[Detection3D] = None, *args, **kwargs):
+    def __init__(self, track_id: str, detection: Optional[Detection3DPC] = None, *args, **kwargs):
         if detection is None:
             return
         self.ts = detection.ts
@@ -62,7 +63,7 @@ def __init__(self, track_id: str, detection: Optional[Detection3D] = None, *args
         self.detections = self.detections + 1
         self.best_detection = detection
 
-    def __add__(self, detection: Detection3D) -> "Object3D":
+    def __add__(self, detection: Detection3DPC) -> "Object3D":
         new_object = Object3D(self.track_id)
         new_object.bbox = detection.bbox
         new_object.confidence = max(self.confidence, detection.confidence)
@@ -156,7 +157,7 @@ def __init__(self, goto: Callable[[PoseStamped], Any], *args, **kwargs):
         self.objects = {}
         self.remembered_locations = {}
 
-    def closest_object(self, detection: Detection3D) -> Optional[Object3D]:
+    def closest_object(self, detection: Detection3DPC) -> Optional[Object3D]:
         # Filter objects to only those with matching names
         matching_objects = [obj for obj in self.objects.values() if obj.name == detection.name]
 
@@ -168,12 +169,12 @@ def closest_object(self, detection: Detection3D) -> Optional[Object3D]:
 
         return distances[0]
 
-    def add_detections(self, detections: List[Detection3D]) -> List[Object3D]:
+    def add_detections(self, detections: List[Detection3DPC]) -> List[Object3D]:
         return [
             detection for detection in map(self.add_detection, detections) if detection is not None
         ]
 
-    def add_detection(self, detection: Detection3D):
+    def add_detection(self, detection: Detection3DPC):
         """Add detection to existing object or create new one."""
         closest = self.closest_object(detection)
         if closest and closest.bounding_box_intersects(detection):
@@ -181,12 +182,12 @@ def add_detection(self, detection: Detection3D):
         else:
             return self.create_new_object(detection)
 
-    def add_to_object(self, closest: Object3D, detection: Detection3D):
+    def add_to_object(self, closest: Object3D, detection: Detection3DPC):
         new_object = closest + detection
         self.objects[closest.track_id] = new_object
         return new_object
 
-    def create_new_object(self, detection: Detection3D):
+    def create_new_object(self, detection: Detection3DPC):
         new_object = Object3D(f"obj_{self.cnt}", detection)
         self.objects[new_object.track_id] = new_object
         self.cnt += 1
@@ -295,7 +296,7 @@ def navigate_to_object_by_id(self, object_id: str):
         self.nav_to(target_pose)
         return f"Navigating to f{object_id} f{target_obj.name}"
 
-    def lookup(self, label: str) -> List[Detection3D]:
+    def lookup(self, label: str) -> List[Detection3DPC]:
         """Look up a detection by label."""
         return []
 
@@ -303,7 +304,7 @@ def lookup(self, label: str) -> List[Detection3D]:
     def start(self):
         Detection3DModule.start(self)
 
-        def update_objects(imageDetections: ImageDetections3D):
+        def update_objects(imageDetections: ImageDetections3DPC):
             for detection in imageDetections.detections:
                 # print(detection)
                 return self.add_detection(detection)
diff --git a/dimos/perception/detection/type/__init__.py b/dimos/perception/detection/type/__init__.py
index c368fcac0f..4be15e0f78 100644
--- a/dimos/perception/detection/type/__init__.py
+++ b/dimos/perception/detection/type/__init__.py
@@ -5,7 +5,7 @@
 )
 from dimos.perception.detection.type.detection3d import (
     Detection3D,
-    ImageDetections3D,
+    Detection3DBBox,
 )
 from dimos.perception.detection.type.detection3dpc import (
     Detection3DPC,
diff --git a/dimos/perception/detection/type/detection2d.py b/dimos/perception/detection/type/detection2d.py
index b4b1d149ed..e032355749 100644
--- a/dimos/perception/detection/type/detection2d.py
+++ b/dimos/perception/detection/type/detection2d.py
@@ -17,7 +17,7 @@
 import hashlib
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union
 
 if TYPE_CHECKING:
     from dimos.perception.detection.type.person import Detection2DPerson
@@ -190,6 +190,9 @@ def from_ultralytics_result(cls, result: Results, idx: int, image: Image) -> "De
         Returns:
             Detection2DBBox instance
         """
+        if result.boxes is None:
+            raise ValueError("Result has no boxes")
+
         # Extract bounding box coordinates
         bbox_array = result.boxes.xyxy[idx].cpu().numpy()
         bbox: Bbox = (
@@ -388,13 +391,14 @@ def from_ultralytics_result(
         """
         from dimos.perception.detection.type.person import Detection2DPerson
 
-        detections = []
+        detections: List[Detection2D] = []
         for result in results:
             if result.boxes is None:
                 continue
 
             num_detections = len(result.boxes.xyxy)
             for i in range(num_detections):
+                detection: Detection2D
                 if result.keypoints is not None:
                     # Pose detection with keypoints
                     detection = Detection2DPerson.from_ultralytics_result(result, i, image)
@@ -407,16 +411,17 @@ def from_ultralytics_result(
 
     @classmethod
     def from_pose_detector(
-        cls, image: Image, people: List["Detection2DPerson"], **kwargs
+        cls, image: Image, people: Sequence["Detection2DPerson"], **kwargs
     ) -> "ImageDetections2D":
         """Create ImageDetections2D from a list of Detection2DPerson detections.
         Args:
             image: Source image
-            people: List of Detection2DPerson objects with pose keypoints
+            people: Sequence of Detection2DPerson objects with pose keypoints
         Returns:
             ImageDetections2D containing the pose detections
         """
+        detections: List[Detection2D] = list(people)
         return cls(
             image=image,
-            detections=people,  # Detection2DPerson objects are already Detection2D subclasses
+            detections=detections,
         )
diff --git a/dimos/perception/detection/type/detection3d.py b/dimos/perception/detection/type/detection3d.py
index 5a0f09f570..e1f7fe3b6d 100644
--- a/dimos/perception/detection/type/detection3d.py
+++ b/dimos/perception/detection/type/detection3d.py
@@ -34,62 +34,35 @@
 
 
 @dataclass
-class Detection3D(Detection2DBBox):
-    transform: Transform
-    frame_id: str
+class Detection3DBBox(Detection2DBBox):
+    """3D bounding box detection with center, size, and orientation.
 
-    @classmethod
-    def from_2d(
-        cls,
-        det: Detection2D,
-        distance: float,
-        camera_info: CameraInfo,
-        world_to_optical_transform: Transform,
-    ) -> Optional["Detection3D"]:
-        raise NotImplementedError()
+    Represents a 3D detection as an oriented bounding box in world space.
+    """
 
-    @functools.cached_property
-    def center(self) -> Vector3:
-        return Vector3(*self.pointcloud.center)
+    transform: Transform  # Camera to world transform
+    frame_id: str  # Frame ID (e.g., "world", "map")
+    center: Vector3  # Center point in world frame
+    size: Vector3  # Width, height, depth
+    orientation: tuple[float, float, float, float]  # Quaternion (x, y, z, w)
 
     @functools.cached_property
     def pose(self) -> PoseStamped:
-        """Convert detection to a PoseStamped using pointcloud center.
+        """Convert detection to a PoseStamped using bounding box center.
 
-        Returns pose in world frame with identity rotation.
-        The pointcloud is already in world frame.
+        Returns pose in world frame with the detection's orientation.
         """
         return PoseStamped(
             ts=self.ts,
             frame_id=self.frame_id,
             position=self.center,
-            orientation=(0.0, 0.0, 0.0, 1.0),  # Identity quaternion
+            orientation=self.orientation,
         )
 
-    def get_bounding_box(self):
-        """Get axis-aligned bounding box of the detection's pointcloud."""
-        return self.pointcloud.get_axis_aligned_bounding_box()
-
-    def get_oriented_bounding_box(self):
-        """Get oriented bounding box of the detection's pointcloud."""
-        return self.pointcloud.get_oriented_bounding_box()
-
-    def get_bounding_box_dimensions(self) -> tuple[float, float, float]:
-        """Get dimensions (width, height, depth) of the detection's bounding box."""
-        return self.pointcloud.get_bounding_box_dimensions()
-
-    def bounding_box_intersects(self, other: "Detection3D") -> bool:
-        """Check if this detection's bounding box intersects with another's."""
-        return self.pointcloud.bounding_box_intersects(other.pointcloud)
-
     def to_repr_dict(self) -> Dict[str, Any]:
         # Calculate distance from camera
-        # The pointcloud is in world frame, and transform gives camera position in world
-        center_world = self.center
-        # Camera position in world frame is the translation part of the transform
         camera_pos = self.transform.translation
-        # Use Vector3 subtraction and magnitude
-        distance = (center_world - camera_pos).magnitude()
+        distance = (self.center - camera_pos).magnitude()
 
         parent_dict = super().to_repr_dict()
         # Remove bbox key if present
@@ -98,132 +71,23 @@ def to_repr_dict(self) -> Dict[str, Any]:
         return {
             **parent_dict,
             "dist": f"{distance:.2f}m",
-            "points": str(len(self.pointcloud)),
+            "size": f"[{self.size.x:.2f},{self.size.y:.2f},{self.size.z:.2f}]",
         }
 
-    def to_foxglove_scene_entity(self, entity_id: str = None) -> "SceneEntity":
-        """Convert detection to a Foxglove SceneEntity with cube primitive and text label.
-
-        Args:
-            entity_id: Optional custom entity ID. If None, generates one from name and hash.
 
-        Returns:
-            SceneEntity with cube bounding box and text label
-        """
-
-        # Create a cube primitive for the bounding box
-        cube = CubePrimitive()
-
-        # Get the axis-aligned bounding box
-        aabb = self.get_bounding_box()
-
-        # Set pose from axis-aligned bounding box
-        cube.pose = Pose()
-        cube.pose.position = Point()
-        # Get center of the axis-aligned bounding box
-        aabb_center = aabb.get_center()
-        cube.pose.position.x = aabb_center[0]
-        cube.pose.position.y = aabb_center[1]
-        cube.pose.position.z = aabb_center[2]
-
-        # For axis-aligned box, use identity quaternion (no rotation)
-        cube.pose.orientation = Quaternion()
-        cube.pose.orientation.x = 0
-        cube.pose.orientation.y = 0
-        cube.pose.orientation.z = 0
-        cube.pose.orientation.w = 1
-
-        # Set size from axis-aligned bounding box
-        cube.size = LCMVector3()
-        aabb_extent = aabb.get_extent()
-        cube.size.x = aabb_extent[0]  # width
-        cube.size.y = aabb_extent[1]  # height
-        cube.size.z = aabb_extent[2]  # depth
-
-        # Set color based on name hash
-        cube.color = Color.from_string(self.name, alpha=0.2)
-
-        # Create text label
-        text = TextPrimitive()
-        text.pose = Pose()
-        text.pose.position = Point()
-        text.pose.position.x = aabb_center[0]
-        text.pose.position.y = aabb_center[1]
-        text.pose.position.z = aabb_center[2] + aabb_extent[2] / 2 + 0.1  # Above the box
-        text.pose.orientation = Quaternion()
-        text.pose.orientation.x = 0
-        text.pose.orientation.y = 0
-        text.pose.orientation.z = 0
-        text.pose.orientation.w = 1
-        text.billboard = True
-        text.font_size = 20.0
-        text.scale_invariant = True
-        text.color = Color()
-        text.color.r = 1.0
-        text.color.g = 1.0
-        text.color.b = 1.0
-        text.color.a = 1.0
-        text.text = self.scene_entity_label()
-
-        # Create scene entity
-        entity = SceneEntity()
-        entity.timestamp = to_ros_stamp(self.ts)
-        entity.frame_id = self.frame_id
-        entity.id = str(self.track_id)
-        entity.lifetime = Duration()
-        entity.lifetime.sec = 0  # Persistent
-        entity.lifetime.nanosec = 0
-        entity.frame_locked = False
-
-        # Initialize all primitive arrays
-        entity.metadata_length = 0
-        entity.metadata = []
-        entity.arrows_length = 0
-        entity.arrows = []
-        entity.cubes_length = 1
-        entity.cubes = [cube]
-        entity.spheres_length = 0
-        entity.spheres = []
-        entity.cylinders_length = 0
-        entity.cylinders = []
-        entity.lines_length = 0
-        entity.lines = []
-        entity.triangles_length = 0
-        entity.triangles = []
-        entity.texts_length = 1
-        entity.texts = [text]
-        entity.models_length = 0
-        entity.models = []
-
-        return entity
-
-    def scene_entity_label(self) -> str:
-        return f"{self.track_id}/{self.name} ({self.confidence:.0%})"
-
-
-T = TypeVar("T", bound="Detection2D")
-
-
-class ImageDetections3D(ImageDetections[Detection3D]):
-    """Specialized class for 3D detections in an image."""
-
-    def to_foxglove_scene_update(self) -> "SceneUpdate":
-        """Convert all detections to a Foxglove SceneUpdate message.
-
-        Returns:
-            SceneUpdate containing SceneEntity objects for all detections
-        """
-
-        # Create SceneUpdate message with all detections
-        scene_update = SceneUpdate()
-        scene_update.deletions_length = 0
-        scene_update.deletions = []
-        scene_update.entities = []
+@dataclass
+class Detection3D(Detection2DBBox):
+    """Base class for 3D detections (deprecated, use Detection3DBBox or Detection3DPC)."""
 
-        # Process each detection
-        for i, detection in enumerate(self.detections):
-            entity = detection.to_foxglove_scene_entity(entity_id=f"detection_{detection.name}_{i}")
-            scene_update.entities.append(entity)
+    transform: Transform
+    frame_id: str
 
-        scene_update.entities_length = len(scene_update.entities)
-        return scene_update
+    @classmethod
+    def from_2d(
+        cls,
+        det: Detection2DBBox,
+        distance: float,
+        camera_info: CameraInfo,
+        world_to_optical_transform: Transform,
+    ) -> Optional["Detection3D"]:
+        raise NotImplementedError()
diff --git a/dimos/perception/detection/type/detection3dpc.py b/dimos/perception/detection/type/detection3dpc.py
index e7ca16c290..9fa0c53db6 100644
--- a/dimos/perception/detection/type/detection3dpc.py
+++ b/dimos/perception/detection/type/detection3dpc.py
@@ -28,23 +28,24 @@
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import PointCloud2
-from dimos.perception.detection.type.detection2d import Detection2D
+from dimos.perception.detection.type.detection2d import Detection2D, Detection2DBBox
 from dimos.perception.detection.type.detection3d import Detection3D
 from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import to_ros_stamp
 
-Detection3DPCFilter = Callable[
-    [Detection2D, PointCloud2, CameraInfo, Transform], Optional["Detection3DPC"]
+# Filters take Detection2DBBox, PointCloud2, CameraInfo, Transform and return filtered PointCloud2 or None
+PointCloudFilter = Callable[
+    [Detection2DBBox, PointCloud2, CameraInfo, Transform], Optional[PointCloud2]
 ]
 
 
-def height_filter(height=0.1) -> Detection3DPCFilter:
+def height_filter(height=0.1) -> PointCloudFilter:
     return lambda det, pc, ci, tf: pc.filter_by_height(height)
 
 
-def statistical(nb_neighbors=40, std_ratio=0.5) -> Detection3DPCFilter:
+def statistical(nb_neighbors=40, std_ratio=0.5) -> PointCloudFilter:
     def filter_func(
-        det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform
+        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
     ) -> Optional[PointCloud2]:
         try:
             statistical, removed = pc.pointcloud.remove_statistical_outlier(
@@ -58,9 +59,9 @@ def filter_func(
     return filter_func
 
 
-def raycast() -> Detection3DPCFilter:
+def raycast() -> PointCloudFilter:
     def filter_func(
-        det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform
+        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
     ) -> Optional[PointCloud2]:
         try:
             camera_pos = tf.inverse().translation
@@ -75,14 +76,14 @@ def filter_func(
     return filter_func
 
 
-def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> Detection3DPCFilter:
+def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> PointCloudFilter:
     """
     Remove isolated points: keep only points that have at least `min_neighbors`
     neighbors within `radius` meters (same units as your point cloud).
     """
 
     def filter_func(
-        det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform
+        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
     ) -> Optional[PointCloud2]:
         filtered_pcd, removed = pc.pointcloud.remove_radius_outlier(
             nb_points=min_neighbors, radius=radius
@@ -96,21 +97,168 @@ def filter_func(
 class Detection3DPC(Detection3D):
     pointcloud: PointCloud2
 
+    @functools.cached_property
+    def center(self) -> Vector3:
+        return Vector3(*self.pointcloud.center)
+
+    @functools.cached_property
+    def pose(self) -> PoseStamped:
+        """Convert detection to a PoseStamped using pointcloud center.
+
+        Returns pose in world frame with identity rotation.
+        The pointcloud is already in world frame.
+        """
+        return PoseStamped(
+            ts=self.ts,
+            frame_id=self.frame_id,
+            position=self.center,
+            orientation=(0.0, 0.0, 0.0, 1.0),  # Identity quaternion
+        )
+
+    def get_bounding_box(self):
+        """Get axis-aligned bounding box of the detection's pointcloud."""
+        return self.pointcloud.get_axis_aligned_bounding_box()
+
+    def get_oriented_bounding_box(self):
+        """Get oriented bounding box of the detection's pointcloud."""
+        return self.pointcloud.get_oriented_bounding_box()
+
+    def get_bounding_box_dimensions(self) -> tuple[float, float, float]:
+        """Get dimensions (width, height, depth) of the detection's bounding box."""
+        return self.pointcloud.get_bounding_box_dimensions()
+
+    def bounding_box_intersects(self, other: "Detection3DPC") -> bool:
+        """Check if this detection's bounding box intersects with another's."""
+        return self.pointcloud.bounding_box_intersects(other.pointcloud)
+
+    def to_repr_dict(self) -> Dict[str, Any]:
+        # Calculate distance from camera
+        # The pointcloud is in world frame, and transform gives camera position in world
+        center_world = self.center
+        # Camera position in world frame is the translation part of the transform
+        camera_pos = self.transform.translation
+        # Use Vector3 subtraction and magnitude
+        distance = (center_world - camera_pos).magnitude()
+
+        parent_dict = super().to_repr_dict()
+        # Remove bbox key if present
+        parent_dict.pop("bbox", None)
+
+        return {
+            **parent_dict,
+            "dist": f"{distance:.2f}m",
+            "points": str(len(self.pointcloud)),
+        }
+
+    def to_foxglove_scene_entity(self, entity_id: Optional[str] = None) -> "SceneEntity":
+        """Convert detection to a Foxglove SceneEntity with cube primitive and text label.
+
+        Args:
+            entity_id: Optional custom entity ID. If None, generates one from name and hash.
+
+        Returns:
+            SceneEntity with cube bounding box and text label
+        """
+
+        # Create a cube primitive for the bounding box
+        cube = CubePrimitive()
+
+        # Get the axis-aligned bounding box
+        aabb = self.get_bounding_box()
+
+        # Set pose from axis-aligned bounding box
+        cube.pose = Pose()
+        cube.pose.position = Point()
+        # Get center of the axis-aligned bounding box
+        aabb_center = aabb.get_center()
+        cube.pose.position.x = aabb_center[0]
+        cube.pose.position.y = aabb_center[1]
+        cube.pose.position.z = aabb_center[2]
+
+        # For axis-aligned box, use identity quaternion (no rotation)
+        cube.pose.orientation = Quaternion()
+        cube.pose.orientation.x = 0
+        cube.pose.orientation.y = 0
+        cube.pose.orientation.z = 0
+        cube.pose.orientation.w = 1
+
+        # Set size from axis-aligned bounding box
+        cube.size = LCMVector3()
+        aabb_extent = aabb.get_extent()
+        cube.size.x = aabb_extent[0]  # width
+        cube.size.y = aabb_extent[1]  # height
+        cube.size.z = aabb_extent[2]  # depth
+
+        # Set color based on name hash
+        cube.color = Color.from_string(self.name, alpha=0.2)
+
+        # Create text label
+        text = TextPrimitive()
+        text.pose = Pose()
+        text.pose.position = Point()
+        text.pose.position.x = aabb_center[0]
+        text.pose.position.y = aabb_center[1]
+        text.pose.position.z = aabb_center[2] + aabb_extent[2] / 2 + 0.1  # Above the box
+        text.pose.orientation = Quaternion()
+        text.pose.orientation.x = 0
+        text.pose.orientation.y = 0
+        text.pose.orientation.z = 0
+        text.pose.orientation.w = 1
+        text.billboard = True
+        text.font_size = 20.0
+        text.scale_invariant = True
+        text.color = Color()
+        text.color.r = 1.0
+        text.color.g = 1.0
+        text.color.b = 1.0
+        text.color.a = 1.0
+        text.text = self.scene_entity_label()
+
+        # Create scene entity
+        entity = SceneEntity()
+        entity.timestamp = to_ros_stamp(self.ts)
+        entity.frame_id = self.frame_id
+        entity.id = str(self.track_id)
+        entity.lifetime = Duration()
+        entity.lifetime.sec = 0  # Persistent
+        entity.lifetime.nanosec = 0
+        entity.frame_locked = False
+
+        # Initialize all primitive arrays
+        entity.metadata_length = 0
+        entity.metadata = []
+        entity.arrows_length = 0
+        entity.arrows = []
+        entity.cubes_length = 1
+        entity.cubes = [cube]
+        entity.spheres_length = 0
+        entity.spheres = []
+        entity.cylinders_length = 0
+        entity.cylinders = []
+        entity.lines_length = 0
+        entity.lines = []
+        entity.triangles_length = 0
+        entity.triangles = []
+        entity.texts_length = 1
+        entity.texts = [text]
+        entity.models_length = 0
+        entity.models = []
+
+        return entity
+
+    def scene_entity_label(self) -> str:
+        return f"{self.track_id}/{self.name} ({self.confidence:.0%})"
+
     @classmethod
-    def from_2d(
+    def from_2d(  # type: ignore[override]
         cls,
-        det: Detection2D,
+        det: Detection2DBBox,
         world_pointcloud: PointCloud2,
         camera_info: CameraInfo,
         world_to_optical_transform: Transform,
         # filters are to be adjusted based on the sensor noise characteristics if feeding
         # sensor data directly
-        filters: list[Callable[[PointCloud2], PointCloud2]] = [
-            # height_filter(0.1),
-            raycast(),
-            radius_outlier(),
-            statistical(),
-        ],
+        filters: Optional[list[PointCloudFilter]] = None,
     ) -> Optional["Detection3D"]:
         """Create a Detection3D from a 2D detection by projecting world pointcloud.
 
@@ -129,6 +277,15 @@ def from_2d(
         Returns:
             Detection3D with filtered pointcloud, or None if no valid points
         """
+        # Set default filters if none provided
+        if filters is None:
+            filters = [
+                # height_filter(0.1),
+                raycast(),
+                radius_outlier(),
+                statistical(),
+            ]
+
         # Extract camera parameters
         fx, fy = camera_info.K[0], camera_info.K[4]
         cx, cy = camera_info.K[2], camera_info.K[5]
@@ -195,7 +352,7 @@ def from_2d(
             timestamp=world_pointcloud.ts,
         )
 
-        # Apply filters - each filter needs all 4 arguments
+        # Apply filters - each filter gets all arguments
         detection_pc = initial_pc
         for filter_func in filters:
             result = filter_func(det, detection_pc, camera_info, world_to_optical_transform)
diff --git a/dimos/perception/detection/type/test_object3d.py b/dimos/perception/detection/type/test_object3d.py
index eb7b963a4e..d23477200b 100644
--- a/dimos/perception/detection/type/test_object3d.py
+++ b/dimos/perception/detection/type/test_object3d.py
@@ -17,7 +17,7 @@
 from dimos.perception.detection.module2D import Detection2DModule
 from dimos.perception.detection.module3D import Detection3DModule
 from dimos.perception.detection.moduleDB import Object3D, ObjectDBModule
-from dimos.perception.detection.type.detection3d import ImageDetections3D
+from dimos.perception.detection.type.detection3dpc import ImageDetections3DPC
 from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
 
 
@@ -158,7 +158,7 @@ def test_objectdb_module(object_db_module):
     assert combined.center is not None
 
     # def test_image_detections3d_scene_update(object_db_module):
-    """Test ImageDetections3D to Foxglove scene update conversion."""
+    """Test ImageDetections3DPC to Foxglove scene update conversion."""
     # Get some detections
     objects = list(object_db_module.objects.values())
     if not objects:
@@ -166,7 +166,7 @@ def test_objectdb_module(object_db_module):
 
     detections = [obj.best_detection for obj in objects[:3]]  # Take up to 3
 
-    image_detections = ImageDetections3D(image=detections[0].image, detections=detections)
+    image_detections = ImageDetections3DPC(image=detections[0].image, detections=detections)
 
     scene_update = image_detections.to_foxglove_scene_update()
 

From 928c76c6eca4548b58e4eacf1d5adec469f4914b Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 20:53:27 -0700
Subject: [PATCH 13/47] big detection restructure

---
 dimos/perception/detection/module3D.py        |  2 +-
 dimos/perception/detection/moduleDB.py        |  2 +-
 dimos/perception/detection/type/__init__.py   | 31 +++++++++++++--
 .../detection/type/detection2d/__init__.py    | 27 +++++++++++++
 .../detection.py}                             |  4 +-
 .../type/{ => detection2d}/person.py          |  2 +-
 .../detection/type/detection3d/__init__.py    | 39 +++++++++++++++++++
 .../detection.py}                             |  0
 .../detection_pc.py}                          |  2 +-
 .../detection/type/test_object3d.py           |  2 +-
 10 files changed, 101 insertions(+), 10 deletions(-)
 create mode 100644 dimos/perception/detection/type/detection2d/__init__.py
 rename dimos/perception/detection/type/{detection2d.py => detection2d/detection.py} (98%)
 rename dimos/perception/detection/type/{ => detection2d}/person.py (99%)
 create mode 100644 dimos/perception/detection/type/detection3d/__init__.py
 rename dimos/perception/detection/type/{detection3d.py => detection3d/detection.py} (100%)
 rename dimos/perception/detection/type/{detection3dpc.py => detection3d/detection_pc.py} (99%)

diff --git a/dimos/perception/detection/module3D.py b/dimos/perception/detection/module3D.py
index ce0c19af89..2c393b586e 100644
--- a/dimos/perception/detection/module3D.py
+++ b/dimos/perception/detection/module3D.py
@@ -26,7 +26,7 @@
     ImageDetections2D,
     ImageDetections3DPC,
 )
-from dimos.perception.detection.type.detection3dpc import Detection3DPC
+from dimos.perception.detection.type.detection3d import Detection3DPC
 from dimos.types.timestamped import align_timestamped
 from dimos.utils.reactive import backpressure
 
diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py
index 4a274f0e26..6239ddf921 100644
--- a/dimos/perception/detection/moduleDB.py
+++ b/dimos/perception/detection/moduleDB.py
@@ -27,7 +27,7 @@
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection.module3D import Detection3DModule
 from dimos.perception.detection.type import Detection3D, ImageDetections3DPC, TableStr
-from dimos.perception.detection.type.detection3dpc import Detection3DPC
+from dimos.perception.detection.type.detection3d import Detection3DPC
 from dimos.protocol.skill.skill import skill
 from dimos.protocol.skill.type import Output, Reducer, Stream
 from dimos.types.timestamped import to_datetime
diff --git a/dimos/perception/detection/type/__init__.py b/dimos/perception/detection/type/__init__.py
index 4be15e0f78..41f4b2a194 100644
--- a/dimos/perception/detection/type/__init__.py
+++ b/dimos/perception/detection/type/__init__.py
@@ -1,15 +1,40 @@
 from dimos.perception.detection.type.detection2d import (
     Detection2D,
     Detection2DBBox,
+    Detection2DPerson,
     ImageDetections2D,
 )
 from dimos.perception.detection.type.detection3d import (
     Detection3D,
     Detection3DBBox,
-)
-from dimos.perception.detection.type.detection3dpc import (
     Detection3DPC,
     ImageDetections3DPC,
+    PointCloudFilter,
+    height_filter,
+    radius_outlier,
+    raycast,
+    statistical,
 )
 from dimos.perception.detection.type.imageDetections import ImageDetections, TableStr
-from dimos.perception.detection.type.person import Detection2DPerson
+
+__all__ = [
+    # 2D Detection types
+    "Detection2D",
+    "Detection2DBBox",
+    "Detection2DPerson",
+    "ImageDetections2D",
+    # 3D Detection types
+    "Detection3D",
+    "Detection3DBBox",
+    "Detection3DPC",
+    "ImageDetections3DPC",
+    # Point cloud filters
+    "PointCloudFilter",
+    "height_filter",
+    "radius_outlier",
+    "raycast",
+    "statistical",
+    # Base types
+    "ImageDetections",
+    "TableStr",
+]
diff --git a/dimos/perception/detection/type/detection2d/__init__.py b/dimos/perception/detection/type/detection2d/__init__.py
new file mode 100644
index 0000000000..2f08316ed0
--- /dev/null
+++ b/dimos/perception/detection/type/detection2d/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dimos.perception.detection.type.detection2d.detection import (
+    Detection2D,
+    Detection2DBBox,
+    ImageDetections2D,
+)
+from dimos.perception.detection.type.detection2d.person import Detection2DPerson
+
+__all__ = [
+    "Detection2D",
+    "Detection2DBBox",
+    "ImageDetections2D",
+    "Detection2DPerson",
+]
diff --git a/dimos/perception/detection/type/detection2d.py b/dimos/perception/detection/type/detection2d/detection.py
similarity index 98%
rename from dimos/perception/detection/type/detection2d.py
rename to dimos/perception/detection/type/detection2d/detection.py
index e032355749..3d3e7abd99 100644
--- a/dimos/perception/detection/type/detection2d.py
+++ b/dimos/perception/detection/type/detection2d/detection.py
@@ -20,7 +20,7 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union
 
 if TYPE_CHECKING:
-    from dimos.perception.detection.type.person import Detection2DPerson
+    from dimos.perception.detection.type.detection2d.person import Detection2DPerson
 
 from dimos_lcm.foxglove_msgs.ImageAnnotations import (
     PointsAnnotation,
@@ -389,7 +389,7 @@ def from_ultralytics_result(
         Returns:
             ImageDetections2D containing appropriate detection types
         """
-        from dimos.perception.detection.type.person import Detection2DPerson
+        from dimos.perception.detection.type.detection2d.person import Detection2DPerson
 
         detections: List[Detection2D] = []
         for result in results:
diff --git a/dimos/perception/detection/type/person.py b/dimos/perception/detection/type/detection2d/person.py
similarity index 99%
rename from dimos/perception/detection/type/person.py
rename to dimos/perception/detection/type/detection2d/person.py
index 773217194b..fb2d18a17b 100644
--- a/dimos/perception/detection/type/person.py
+++ b/dimos/perception/detection/type/detection2d/person.py
@@ -23,7 +23,7 @@
 
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.type.detection2d import Bbox, Detection2DBBox
+from dimos.perception.detection.type.detection2d.detection import Bbox, Detection2DBBox
 from dimos.types.timestamped import to_ros_stamp
 
 if TYPE_CHECKING:
diff --git a/dimos/perception/detection/type/detection3d/__init__.py b/dimos/perception/detection/type/detection3d/__init__.py
new file mode 100644
index 0000000000..010cd981d2
--- /dev/null
+++ b/dimos/perception/detection/type/detection3d/__init__.py
@@ -0,0 +1,39 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dimos.perception.detection.type.detection3d.detection import (
+    Detection3D,
+    Detection3DBBox,
+)
+from dimos.perception.detection.type.detection3d.detection_pc import (
+    Detection3DPC,
+    ImageDetections3DPC,
+    PointCloudFilter,
+    height_filter,
+    raycast,
+    radius_outlier,
+    statistical,
+)
+
+__all__ = [
+    "Detection3D",
+    "Detection3DBBox",
+    "Detection3DPC",
+    "ImageDetections3DPC",
+    "PointCloudFilter",
+    "height_filter",
+    "raycast",
+    "radius_outlier",
+    "statistical",
+]
diff --git a/dimos/perception/detection/type/detection3d.py b/dimos/perception/detection/type/detection3d/detection.py
similarity index 100%
rename from dimos/perception/detection/type/detection3d.py
rename to dimos/perception/detection/type/detection3d/detection.py
diff --git a/dimos/perception/detection/type/detection3dpc.py b/dimos/perception/detection/type/detection3d/detection_pc.py
similarity index 99%
rename from dimos/perception/detection/type/detection3dpc.py
rename to dimos/perception/detection/type/detection3d/detection_pc.py
index 9fa0c53db6..66fb8318e0 100644
--- a/dimos/perception/detection/type/detection3dpc.py
+++ b/dimos/perception/detection/type/detection3d/detection_pc.py
@@ -29,7 +29,7 @@
 from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import PointCloud2
 from dimos.perception.detection.type.detection2d import Detection2D, Detection2DBBox
-from dimos.perception.detection.type.detection3d import Detection3D
+from dimos.perception.detection.type.detection3d.detection import Detection3D
 from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import to_ros_stamp
 
diff --git a/dimos/perception/detection/type/test_object3d.py b/dimos/perception/detection/type/test_object3d.py
index d23477200b..c032664b46 100644
--- a/dimos/perception/detection/type/test_object3d.py
+++ b/dimos/perception/detection/type/test_object3d.py
@@ -17,7 +17,7 @@
 from dimos.perception.detection.module2D import Detection2DModule
 from dimos.perception.detection.module3D import Detection3DModule
 from dimos.perception.detection.moduleDB import Object3D, ObjectDBModule
-from dimos.perception.detection.type.detection3dpc import ImageDetections3DPC
+from dimos.perception.detection.type.detection3d import ImageDetections3DPC
 from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
 
 

From 4e82fa9c10182ac0023121855b5ba5c325d6b739 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 21:10:55 -0700
Subject: [PATCH 14/47] restructure, mypy

---
 dimos/perception/detection/detectors/detic.py |  6 +++---
 .../detection/type/detection2d/__init__.py    |  7 ++-----
 .../detection2d/{detection.py => bbox.py}     | 11 ++---------
 .../detection/type/detection2d/person.py      |  2 +-
 .../detection/type/detection3d/__init__.py    | 10 ++++------
 .../detection3d/{detection.py => bbox.py}     | 19 +------------------
 .../{detection_pc.py => pointcloud.py}        |  2 +-
 .../detection/type/imageDetections.py         |  8 ++++++--
 8 files changed, 20 insertions(+), 45 deletions(-)
 rename dimos/perception/detection/type/detection2d/{detection.py => bbox.py} (98%)
 rename dimos/perception/detection/type/detection3d/{detection.py => bbox.py} (86%)
 rename dimos/perception/detection/type/detection3d/{detection_pc.py => pointcloud.py} (99%)

diff --git a/dimos/perception/detection/detectors/detic.py b/dimos/perception/detection/detectors/detic.py
index 57a459f750..db2d8bb634 100644
--- a/dimos/perception/detection/detectors/detic.py
+++ b/dimos/perception/detection/detectors/detic.py
@@ -25,9 +25,9 @@
 from dimos.constants import DIMOS_PROJECT_ROOT
 
 detic_path = DIMOS_PROJECT_ROOT / "dimos/models/Detic"
-if detic_path not in sys.path:
-    sys.path.append(detic_path)
-    sys.path.append(os.path.join(detic_path, "third_party/CenterNet2"))
+if str(detic_path) not in sys.path:
+    sys.path.append(str(detic_path))
+    sys.path.append(str(detic_path / "third_party/CenterNet2"))
 
 # PIL patch for compatibility
 import PIL.Image
diff --git a/dimos/perception/detection/type/detection2d/__init__.py b/dimos/perception/detection/type/detection2d/__init__.py
index 2f08316ed0..3a5cb27dce 100644
--- a/dimos/perception/detection/type/detection2d/__init__.py
+++ b/dimos/perception/detection/type/detection2d/__init__.py
@@ -12,11 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dimos.perception.detection.type.detection2d.detection import (
-    Detection2D,
-    Detection2DBBox,
-    ImageDetections2D,
-)
+from dimos.perception.detection.type.detection2d.base import Detection2D
+from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox, ImageDetections2D
 from dimos.perception.detection.type.detection2d.person import Detection2DPerson
 
 __all__ = [
diff --git a/dimos/perception/detection/type/detection2d/detection.py b/dimos/perception/detection/type/detection2d/bbox.py
similarity index 98%
rename from dimos/perception/detection/type/detection2d/detection.py
rename to dimos/perception/detection/type/detection2d/bbox.py
index 3d3e7abd99..1bec4a55d4 100644
--- a/dimos/perception/detection/type/detection2d/detection.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -45,8 +45,9 @@
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.std_msgs import Header
+from dimos.perception.detection.type.detection2d.base import Detection2D
 from dimos.perception.detection.type.imageDetections import ImageDetections
-from dimos.types.timestamped import Timestamped, to_ros_stamp, to_timestamp
+from dimos.types.timestamped import to_ros_stamp, to_timestamp
 from dimos.utils.decorators.decorators import simple_mcache
 
 Bbox = Tuple[float, float, float, float]
@@ -79,14 +80,6 @@ def _hash_to_color(name: str) -> str:
     return colors[hash_value % len(colors)]
 
 
-class Detection2D(Timestamped):
-    @abstractmethod
-    def cropped_image(self, padding: int = 20) -> Image: ...
-
-    @abstractmethod
-    def to_image_annotations(self) -> ImageAnnotations: ...
-
-
 @dataclass
 class Detection2DBBox(Detection2D):
     bbox: Bbox
diff --git a/dimos/perception/detection/type/detection2d/person.py b/dimos/perception/detection/type/detection2d/person.py
index fb2d18a17b..ef8b243297 100644
--- a/dimos/perception/detection/type/detection2d/person.py
+++ b/dimos/perception/detection/type/detection2d/person.py
@@ -23,7 +23,7 @@
 
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.type.detection2d.detection import Bbox, Detection2DBBox
+from dimos.perception.detection.type.detection2d.bbox import Bbox, Detection2DBBox
 from dimos.types.timestamped import to_ros_stamp
 
 if TYPE_CHECKING:
diff --git a/dimos/perception/detection/type/detection3d/__init__.py b/dimos/perception/detection/type/detection3d/__init__.py
index 010cd981d2..e9e1950abf 100644
--- a/dimos/perception/detection/type/detection3d/__init__.py
+++ b/dimos/perception/detection/type/detection3d/__init__.py
@@ -12,17 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dimos.perception.detection.type.detection3d.detection import (
-    Detection3D,
-    Detection3DBBox,
-)
-from dimos.perception.detection.type.detection3d.detection_pc import (
+from dimos.perception.detection.type.detection3d.base import Detection3D
+from dimos.perception.detection.type.detection3d.bbox import Detection3DBBox
+from dimos.perception.detection.type.detection3d.pointcloud import (
     Detection3DPC,
     ImageDetections3DPC,
     PointCloudFilter,
     height_filter,
-    raycast,
     radius_outlier,
+    raycast,
     statistical,
 )
 
diff --git a/dimos/perception/detection/type/detection3d/detection.py b/dimos/perception/detection/type/detection3d/bbox.py
similarity index 86%
rename from dimos/perception/detection/type/detection3d/detection.py
rename to dimos/perception/detection/type/detection3d/bbox.py
index e1f7fe3b6d..2bc0c1c541 100644
--- a/dimos/perception/detection/type/detection3d/detection.py
+++ b/dimos/perception/detection/type/detection3d/bbox.py
@@ -29,6 +29,7 @@
 from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import PointCloud2
 from dimos.perception.detection.type.detection2d import Detection2D, Detection2DBBox
+from dimos.perception.detection.type.detection3d.base import Detection3D
 from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import to_ros_stamp
 
@@ -73,21 +74,3 @@ def to_repr_dict(self) -> Dict[str, Any]:
             "dist": f"{distance:.2f}m",
             "size": f"[{self.size.x:.2f},{self.size.y:.2f},{self.size.z:.2f}]",
         }
-
-
-@dataclass
-class Detection3D(Detection2DBBox):
-    """Base class for 3D detections (deprecated, use Detection3DBBox or Detection3DPC)."""
-
-    transform: Transform
-    frame_id: str
-
-    @classmethod
-    def from_2d(
-        cls,
-        det: Detection2DBBox,
-        distance: float,
-        camera_info: CameraInfo,
-        world_to_optical_transform: Transform,
-    ) -> Optional["Detection3D"]:
-        raise NotImplementedError()
diff --git a/dimos/perception/detection/type/detection3d/detection_pc.py b/dimos/perception/detection/type/detection3d/pointcloud.py
similarity index 99%
rename from dimos/perception/detection/type/detection3d/detection_pc.py
rename to dimos/perception/detection/type/detection3d/pointcloud.py
index 66fb8318e0..1949541830 100644
--- a/dimos/perception/detection/type/detection3d/detection_pc.py
+++ b/dimos/perception/detection/type/detection3d/pointcloud.py
@@ -29,7 +29,7 @@
 from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import PointCloud2
 from dimos.perception.detection.type.detection2d import Detection2D, Detection2DBBox
-from dimos.perception.detection.type.detection3d.detection import Detection3D
+from dimos.perception.detection.type.detection3d.base import Detection3D
 from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import to_ros_stamp
 
diff --git a/dimos/perception/detection/type/imageDetections.py b/dimos/perception/detection/type/imageDetections.py
index c09d7cb052..6513e4fe07 100644
--- a/dimos/perception/detection/type/imageDetections.py
+++ b/dimos/perception/detection/type/imageDetections.py
@@ -28,9 +28,13 @@
 from dimos.types.timestamped import to_timestamp
 
 if TYPE_CHECKING:
-    from dimos.perception.detection.type.detection2d import Detection2D
+    from dimos.perception.detection.type.detection2d.base import Detection2D
 
-T = TypeVar("T", bound="Detection2D")
+    T = TypeVar("T", bound=Detection2D)
+else:
+    from dimos.perception.detection.type.detection2d.base import Detection2D
+
+    T = TypeVar("T", bound=Detection2D)
 
 
 def _hash_to_color(name: str) -> str:

From 415eb6486eae9b8174cc1bf8e24984cca813274f Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 21:20:10 -0700
Subject: [PATCH 15/47] base.py for types, extracted table rendering to utils

---
 dimos/perception/detection/type/__init__.py   |   3 +-
 .../detection/type/detection2d/base.py        |  52 +++++++++
 .../detection/type/detection3d/base.py        |  44 ++++++++
 .../detection/type/imageDetections.py         |  87 +--------------
 dimos/perception/detection/type/utils.py      | 101 ++++++++++++++++++
 5 files changed, 201 insertions(+), 86 deletions(-)
 create mode 100644 dimos/perception/detection/type/detection2d/base.py
 create mode 100644 dimos/perception/detection/type/detection3d/base.py
 create mode 100644 dimos/perception/detection/type/utils.py

diff --git a/dimos/perception/detection/type/__init__.py b/dimos/perception/detection/type/__init__.py
index 41f4b2a194..d8f36d79dc 100644
--- a/dimos/perception/detection/type/__init__.py
+++ b/dimos/perception/detection/type/__init__.py
@@ -15,7 +15,8 @@
     raycast,
     statistical,
 )
-from dimos.perception.detection.type.imageDetections import ImageDetections, TableStr
+from dimos.perception.detection.type.imageDetections import ImageDetections
+from dimos.perception.detection.type.utils import TableStr
 
 __all__ = [
     # 2D Detection types
diff --git a/dimos/perception/detection/type/detection2d/base.py b/dimos/perception/detection/type/detection2d/base.py
new file mode 100644
index 0000000000..e89bf65409
--- /dev/null
+++ b/dimos/perception/detection/type/detection2d/base.py
@@ -0,0 +1,52 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from typing import List
+
+from dimos_lcm.foxglove_msgs.ImageAnnotations import PointsAnnotation, TextAnnotation
+from dimos_lcm.vision_msgs import Detection2D as ROSDetection2D
+
+from dimos.msgs.foxglove_msgs import ImageAnnotations
+from dimos.msgs.sensor_msgs import Image
+from dimos.types.timestamped import Timestamped
+
+
+class Detection2D(Timestamped):
+    """Abstract base class for 2D detections."""
+
+    @abstractmethod
+    def cropped_image(self, padding: int = 20) -> Image:
+        """Return a cropped version of the image focused on the detection area."""
+        ...
+
+    @abstractmethod
+    def to_image_annotations(self) -> ImageAnnotations:
+        """Convert detection to Foxglove ImageAnnotations for visualization."""
+        ...
+
+    @abstractmethod
+    def to_text_annotation(self) -> List[TextAnnotation]:
+        """Return text annotations for visualization."""
+        ...
+
+    @abstractmethod
+    def to_points_annotation(self) -> List[PointsAnnotation]:
+        """Return points/shape annotations for visualization."""
+        ...
+
+    @abstractmethod
+    def to_ros_detection2d(self) -> ROSDetection2D:
+        """Convert detection to ROS Detection2D message."""
+        ...
diff --git a/dimos/perception/detection/type/detection3d/base.py b/dimos/perception/detection/type/detection3d/base.py
new file mode 100644
index 0000000000..a82a50d474
--- /dev/null
+++ b/dimos/perception/detection/type/detection3d/base.py
@@ -0,0 +1,44 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+
+from dimos_lcm.sensor_msgs import CameraInfo
+
+from dimos.msgs.geometry_msgs import Transform
+from dimos.perception.detection.type.detection2d import Detection2DBBox
+
+
+@dataclass
+class Detection3D(Detection2DBBox):
+    """Abstract base class for 3D detections."""
+
+    transform: Transform
+    frame_id: str
+
+    @classmethod
+    @abstractmethod
+    def from_2d(
+        cls,
+        det: Detection2DBBox,
+        distance: float,
+        camera_info: CameraInfo,
+        world_to_optical_transform: Transform,
+    ) -> Optional["Detection3D"]:
+        """Create a 3D detection from a 2D detection."""
+        ...
diff --git a/dimos/perception/detection/type/imageDetections.py b/dimos/perception/detection/type/imageDetections.py
index 6513e4fe07..4431b028ff 100644
--- a/dimos/perception/detection/type/imageDetections.py
+++ b/dimos/perception/detection/type/imageDetections.py
@@ -14,18 +14,13 @@
 
 from __future__ import annotations
 
-import hashlib
-from typing import TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, TypeVar
-
-from rich.console import Console
-from rich.table import Table
-from rich.text import Text
+from typing import TYPE_CHECKING, Generic, List, Optional, TypeVar
 
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.std_msgs import Header
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.types.timestamped import to_timestamp
+from dimos.perception.detection.type.utils import TableStr
 
 if TYPE_CHECKING:
     from dimos.perception.detection.type.detection2d.base import Detection2D
@@ -37,84 +32,6 @@
     T = TypeVar("T", bound=Detection2D)
 
 
-def _hash_to_color(name: str) -> str:
-    """Generate a consistent color for a given name using hash."""
-    # List of rich colors to choose from
-    colors = [
-        "cyan",
-        "magenta",
-        "yellow",
-        "blue",
-        "green",
-        "red",
-        "bright_cyan",
-        "bright_magenta",
-        "bright_yellow",
-        "bright_blue",
-        "bright_green",
-        "bright_red",
-        "purple",
-        "white",
-        "pink",
-    ]
-
-    # Hash the name and pick a color
-    hash_value = hashlib.md5(name.encode()).digest()[0]
-    return colors[hash_value % len(colors)]
-
-
-class TableStr:
-    def __str__(self):
-        console = Console(force_terminal=True, legacy_windows=False)
-
-        # Create a table for detections
-        table = Table(
-            title=f"{self.__class__.__name__} [{len(self.detections)} detections @ {to_timestamp(self.image.ts):.3f}]",
-            show_header=True,
-            show_edge=True,
-        )
-
-        # Dynamically build columns based on the first detection's dict keys
-        if not self.detections:
-            return (
-                f"   {self.__class__.__name__} [0 detections @ {to_timestamp(self.image.ts):.3f}]"
-            )
-
-        # Cache all repr_dicts to avoid double computation
-        detection_dicts = [det.to_repr_dict() for det in self]
-
-        first_dict = detection_dicts[0]
-        table.add_column("#", style="dim")
-        for col in first_dict.keys():
-            color = _hash_to_color(col)
-            table.add_column(col.title(), style=color)
-
-        # Add each detection to the table
-        for i, d in enumerate(detection_dicts):
-            row = [str(i)]
-
-            for key in first_dict.keys():
-                if key == "conf":
-                    # Color-code confidence
-                    conf_color = (
-                        "green"
-                        if float(d[key]) > 0.8
-                        else "yellow"
-                        if float(d[key]) > 0.5
-                        else "red"
-                    )
-                    row.append(Text(f"{d[key]}", style=conf_color))
-                elif key == "points" and d.get(key) == "None":
-                    row.append(Text(d.get(key, ""), style="dim"))
-                else:
-                    row.append(str(d.get(key, "")))
-            table.add_row(*row)
-
-        with console.capture() as capture:
-            console.print(table)
-        return capture.get().strip()
-
-
 class ImageDetections(Generic[T], TableStr):
     image: Image
     detections: List[T]
diff --git a/dimos/perception/detection/type/utils.py b/dimos/perception/detection/type/utils.py
new file mode 100644
index 0000000000..f1e2187015
--- /dev/null
+++ b/dimos/perception/detection/type/utils.py
@@ -0,0 +1,101 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import hashlib
+
+from rich.console import Console
+from rich.table import Table
+from rich.text import Text
+
+from dimos.types.timestamped import to_timestamp
+
+
+def _hash_to_color(name: str) -> str:
+    """Generate a consistent color for a given name using hash."""
+    # List of rich colors to choose from
+    colors = [
+        "cyan",
+        "magenta",
+        "yellow",
+        "blue",
+        "green",
+        "red",
+        "bright_cyan",
+        "bright_magenta",
+        "bright_yellow",
+        "bright_blue",
+        "bright_green",
+        "bright_red",
+        "purple",
+        "white",
+        "pink",
+    ]
+
+    # Hash the name and pick a color
+    hash_value = hashlib.md5(name.encode()).digest()[0]
+    return colors[hash_value % len(colors)]
+
+
+class TableStr:
+    """Mixin class that provides table-based string representation for detection collections."""
+
+    def __str__(self):
+        console = Console(force_terminal=True, legacy_windows=False)
+
+        # Create a table for detections
+        table = Table(
+            title=f"{self.__class__.__name__} [{len(self.detections)} detections @ {to_timestamp(self.image.ts):.3f}]",
+            show_header=True,
+            show_edge=True,
+        )
+
+        # Dynamically build columns based on the first detection's dict keys
+        if not self.detections:
+            return (
+                f"   {self.__class__.__name__} [0 detections @ {to_timestamp(self.image.ts):.3f}]"
+            )
+
+        # Cache all repr_dicts to avoid double computation
+        detection_dicts = [det.to_repr_dict() for det in self]
+
+        first_dict = detection_dicts[0]
+        table.add_column("#", style="dim")
+        for col in first_dict.keys():
+            color = _hash_to_color(col)
+            table.add_column(col.title(), style=color)
+
+        # Add each detection to the table
+        for i, d in enumerate(detection_dicts):
+            row = [str(i)]
+
+            for key in first_dict.keys():
+                if key == "conf":
+                    # Color-code confidence
+                    conf_color = (
+                        "green"
+                        if float(d[key]) > 0.8
+                        else "yellow"
+                        if float(d[key]) > 0.5
+                        else "red"
+                    )
+                    row.append(Text(f"{d[key]}", style=conf_color))
+                elif key == "points" and d.get(key) == "None":
+                    row.append(Text(d.get(key, ""), style="dim"))
+                else:
+                    row.append(str(d.get(key, "")))
+            table.add_row(*row)
+
+        with console.capture() as capture:
+            console.print(table)
+        return capture.get().strip()

From a0a17d63d85f4baeab8c2188710474dc42adcbde Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 21:27:36 -0700
Subject: [PATCH 16/47] conftest typing

---
 dimos/perception/detection/conftest.py | 98 ++++++++++++++++++--------
 1 file changed, 68 insertions(+), 30 deletions(-)

diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index e902f88b6a..6d0fabbceb 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Callable, Optional, TypedDict, Union
+from typing import Callable, Generator, Optional, TypedDict, Union
 
 import pytest
 from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations
@@ -77,26 +77,40 @@ def moment_provider(**kwargs) -> Moment:
         data_dir = "unitree_go2_lidar_corrected"
         get_data(data_dir)
 
-        lidar_frame = TimedSensorReplay(f"{data_dir}/lidar").find_closest_seek(seek)
+        lidar_frame_result = TimedSensorReplay(f"{data_dir}/lidar").find_closest_seek(seek)
+        if lidar_frame_result is None:
+            raise ValueError("No lidar frame found")
+        lidar_frame: LidarMessage = lidar_frame_result
 
         image_frame = TimedSensorReplay(
             f"{data_dir}/video",
         ).find_closest(lidar_frame.ts)
 
+        if image_frame is None:
+            raise ValueError("No image frame found")
+
         image_frame.frame_id = "camera_optical"
 
         odom_frame = TimedSensorReplay(f"{data_dir}/odom", autocast=Odometry.from_msg).find_closest(
             lidar_frame.ts
         )
 
+        if odom_frame is None:
+            raise ValueError("No odom frame found")
+
         transforms = ConnectionModule._odom_to_tf(odom_frame)
 
         tf.receive_transform(*transforms)
+        camera_info_out = ConnectionModule._camera_info()
+        # ConnectionModule._camera_info() returns Out[CameraInfo], extract the value
+        from typing import cast
+
+        camera_info = cast(CameraInfo, camera_info_out)
         return {
             "odom_frame": odom_frame,
             "lidar_frame": lidar_frame,
             "image_frame": image_frame,
-            "camera_info": ConnectionModule._camera_info(),
+            "camera_info": camera_info,
             "transforms": transforms,
             "tf": tf,
         }
@@ -107,37 +121,53 @@ def moment_provider(**kwargs) -> Moment:
 @pytest.fixture
 def publish_moment():
     def publisher(moment: Moment | Moment2D | Moment3D):
-        if moment.get("detections2d"):
+        detections2d_val = moment.get("detections2d")
+        if detections2d_val:
             # 2d annotations
-            annotations = LCMTransport("/annotations", ImageAnnotations)
-            annotations.publish(moment.get("detections2d").to_foxglove_annotations())
+            annotations: LCMTransport[ImageAnnotations] = LCMTransport(
+                "/annotations", ImageAnnotations
+            )
+            assert isinstance(detections2d_val, ImageDetections2D)
+            annotations.publish(detections2d_val.to_foxglove_annotations())
 
-            detections = LCMTransport("/detections", Detection2DArray)
-            detections.publish(moment.get("detections2d").to_ros_detection2d_array())
+            detections: LCMTransport[Detection2DArray] = LCMTransport(
+                "/detections", Detection2DArray
+            )
+            detections.publish(detections2d_val.to_ros_detection2d_array())
 
             annotations.lcm.stop()
             detections.lcm.stop()
 
-        if moment.get("detections3dpc"):
-            scene_update = LCMTransport("/scene_update", SceneUpdate)
+        detections3dpc_val = moment.get("detections3dpc")
+        if detections3dpc_val:
+            scene_update: LCMTransport[SceneUpdate] = LCMTransport("/scene_update", SceneUpdate)
             # 3d scene update
-            scene_update.publish(moment.get("detections3dpc").to_foxglove_scene_update())
+            assert isinstance(detections3dpc_val, ImageDetections3DPC)
+            scene_update.publish(detections3dpc_val.to_foxglove_scene_update())
             scene_update.lcm.stop()
 
-        lidar = LCMTransport("/lidar", PointCloud2)
-        lidar.publish(moment.get("lidar_frame"))
-        lidar.lcm.stop()
+        lidar_frame = moment.get("lidar_frame")
+        if lidar_frame:
+            lidar: LCMTransport[PointCloud2] = LCMTransport("/lidar", PointCloud2)
+            lidar.publish(lidar_frame)
+            lidar.lcm.stop()
 
-        image = LCMTransport("/image", Image)
-        image.publish(moment.get("image_frame"))
-        image.lcm.stop()
+        image_frame = moment.get("image_frame")
+        if image_frame:
+            image: LCMTransport[Image] = LCMTransport("/image", Image)
+            image.publish(image_frame)
+            image.lcm.stop()
 
-        camera_info = LCMTransport("/camera_info", CameraInfo)
-        camera_info.publish(moment.get("camera_info"))
-        camera_info.lcm.stop()
+        camera_info_val = moment.get("camera_info")
+        if camera_info_val:
+            camera_info: LCMTransport[CameraInfo] = LCMTransport("/camera_info", CameraInfo)
+            camera_info.publish(camera_info_val)
+            camera_info.lcm.stop()
 
         tf = moment.get("tf")
-        tf.publish(*moment.get("transforms"))
+        transforms = moment.get("transforms")
+        if tf is not None and transforms is not None:
+            tf.publish(*transforms)
 
     # moduleDB.scene_update.transport = LCMTransport("/scene_update", SceneUpdate)
     # moduleDB.target.transport = LCMTransport("/target", PoseStamped)
@@ -160,7 +190,7 @@ def detection3dpc(get_moment_3dpc) -> Detection3DPC:
 
 
 @pytest.fixture
-def get_moment_2d(get_moment) -> Callable[[], Moment2D]:
+def get_moment_2d(get_moment) -> Generator[Callable[[], Moment2D], None, None]:
     from dimos.perception.detection.detectors import Yolo2DDetector
 
     module = Detection2DModule(detector=Yolo2DDetector)
@@ -179,29 +209,37 @@ def moment_provider(**kwargs) -> Moment2D:
 
 
 @pytest.fixture
-def get_moment_3dpc(get_moment_2d) -> Callable[[], Moment2D]:
-    module = None
+def get_moment_3dpc(get_moment_2d) -> Generator[Callable[[], Moment3D], None, None]:
+    module: Optional[Detection3DModule] = None
 
-    def moment_provider(**kwargs) -> Moment2D:
+    def moment_provider(**kwargs) -> Moment3D:
         nonlocal module
         moment = get_moment_2d(**kwargs)
 
         if not module:
             module = Detection3DModule(camera_info=moment["camera_info"])
 
-        camera_transform = moment["tf"].get("camera_optical", moment.get("lidar_frame").frame_id)
+        lidar_frame = moment.get("lidar_frame")
+        if lidar_frame is None:
+            raise ValueError("No lidar frame found")
+
+        camera_transform = moment["tf"].get("camera_optical", lidar_frame.frame_id)
         if camera_transform is None:
             raise ValueError("No camera_optical transform in tf")
+
+        detections3dpc = module.process_frame(
+            moment["detections2d"], moment["lidar_frame"], camera_transform
+        )
+
         return {
             **moment,
-            "detections3dpc": module.process_frame(
-                moment["detections2d"], moment["lidar_frame"], camera_transform
-            ),
+            "detections3dpc": detections3dpc,
         }
 
     yield moment_provider
     print("Closing 3D detection module", module)
-    module._close_module()
+    if module is not None:
+        module._close_module()
 
 
 @pytest.fixture

From fb5f22f1a67a13ee50a6cc70d6572b30ff08ff35 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sat, 11 Oct 2025 21:42:22 -0700
Subject: [PATCH 17/47] all mypy resolved

---
 dimos/perception/detection/module3D.py        |  19 ++--
 dimos/perception/detection/moduleDB.py        | 101 +++++-------------
 .../detection/type/detection2d/person.py      |   7 +-
 .../detection/type/detection3d/pointcloud.py  |   2 +-
 .../detection/type/test_object3d.py           |   6 +-
 5 files changed, 47 insertions(+), 88 deletions(-)

diff --git a/dimos/perception/detection/module3D.py b/dimos/perception/detection/module3D.py
index 2c393b586e..a09cdb0e74 100644
--- a/dimos/perception/detection/module3D.py
+++ b/dimos/perception/detection/module3D.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 
+from typing import Optional
+
 from dimos_lcm.sensor_msgs import CameraInfo
 from reactivex import operators as ops
 from reactivex.observable import Observable
@@ -48,7 +50,7 @@ class Detection3DModule(Detection2DModule):
     detected_image_1: Out[Image] = None  # type: ignore
     detected_image_2: Out[Image] = None  # type: ignore
 
-    detection_3d_stream: Observable[ImageDetections3DPC] = None
+    detection_3d_stream: Optional[Observable[ImageDetections3DPC]] = None
 
     def __init__(self, camera_info: CameraInfo, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -63,7 +65,7 @@ def process_frame(
         if not transform:
             return ImageDetections3DPC(detections.image, [])
 
-        detection3d_list = []
+        detection3d_list: list[Detection3DPC] = []
         for detection in detections:
             detection3d = Detection3DPC.from_2d(
                 detection,
@@ -76,8 +78,8 @@ def process_frame(
 
         return ImageDetections3DPC(detections.image, detection3d_list)
 
-    @skill
-    def ask_vlm(self, question: str):
+    @skill  # type: ignore[arg-type]
+    def ask_vlm(self, question: str) -> str | ImageDetections3DPC:
         """
         query visual model about the view in front of the camera
         you can ask to mark objects like:
@@ -86,14 +88,15 @@ def ask_vlm(self, question: str):
         "laptop on the desk"
         "a person wearing a red shirt"
         """
-        from dimos.models.vl.qwen import QwenVLModel
+        from dimos.models.vl.qwen import QwenVlModel
 
-        model = QwenVLModel()
-        detections: ImageDetections2D = model.query(self.image.get_next(), question)
+        model = QwenVlModel()
+        result = model.query(self.image.get_next(), question)
 
-        if not detections or not len(detections):
+        if isinstance(result, str) or not result or not len(result):
             return "No detections"
 
+        detections: ImageDetections2D = result
         pc = self.pointcloud.get_next()
         transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0)
         return self.process_frame(detections, pc, transform)
diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py
index 6239ddf921..ccc14d96f5 100644
--- a/dimos/perception/detection/moduleDB.py
+++ b/dimos/perception/detection/moduleDB.py
@@ -35,16 +35,22 @@
 
 # Represents an object in space, as collection of 3d detections over time
 class Object3D(Detection3DPC):
-    best_detection: Detection3DPC = None
-    center: Vector3 = None
-    track_id: str = None
+    best_detection: Optional[Detection3DPC] = None  # type: ignore
+    center: Optional[Vector3] = None  # type: ignore
+    track_id: Optional[str] = None  # type: ignore
     detections: int = 0
 
     def to_repr_dict(self) -> Dict[str, Any]:
+        if self.center is None:
+            center_str = "None"
+        else:
+            center_str = (
+                "[" + ", ".join(list(map(lambda n: f"{n:1f}", self.center.to_list()))) + "]"
+            )
         return {
             "object_id": self.track_id,
             "detections": self.detections,
-            "center": "[" + ", ".join(list(map(lambda n: f"{n:1f}", self.center.to_list()))) + "]",
+            "center": center_str,
         }
 
     def __init__(self, track_id: str, detection: Optional[Detection3DPC] = None, *args, **kwargs):
@@ -64,6 +70,8 @@ def __init__(self, track_id: str, detection: Optional[Detection3DPC] = None, *ar
         self.best_detection = detection
 
     def __add__(self, detection: Detection3DPC) -> "Object3D":
+        if self.track_id is None:
+            raise ValueError("Cannot add detection to object with None track_id")
         new_object = Object3D(self.track_id)
         new_object.bbox = detection.bbox
         new_object.confidence = max(self.confidence, detection.confidence)
@@ -84,9 +92,8 @@ def __add__(self, detection: Detection3DPC) -> "Object3D":
 
         return new_object
 
-    @property
-    def image(self) -> Image:
-        return self.best_detection.image
+    def get_image(self) -> Optional[Image]:
+        return self.best_detection.image if self.best_detection else None
 
     def scene_entity_label(self) -> str:
         return f"{self.name} ({self.detections})"
@@ -101,6 +108,9 @@ def agent_encode(self):
         }
 
     def to_pose(self) -> PoseStamped:
+        if self.best_detection is None or self.center is None:
+            raise ValueError("Cannot compute pose without best_detection and center")
+
         optical_inverse = Transform(
             translation=Vector3(0.0, 0.0, 0.0),
             rotation=Quaternion(-0.5, 0.5, -0.5, 0.5),
@@ -127,9 +137,9 @@ def to_pose(self) -> PoseStamped:
 class ObjectDBModule(Detection3DModule, TableStr):
     cnt: int = 0
     objects: dict[str, Object3D]
-    object_stream: Observable[Object3D] = None
+    object_stream: Optional[Observable[Object3D]] = None
 
-    goto: Callable[[PoseStamped], Any] = None
+    goto: Optional[Callable[[PoseStamped], Any]] = None
 
     image: In[Image] = None  # type: ignore
     pointcloud: In[PointCloud2] = None  # type: ignore
@@ -184,16 +194,18 @@ def add_detection(self, detection: Detection3DPC):
 
     def add_to_object(self, closest: Object3D, detection: Detection3DPC):
         new_object = closest + detection
-        self.objects[closest.track_id] = new_object
+        if closest.track_id is not None:
+            self.objects[closest.track_id] = new_object
         return new_object
 
     def create_new_object(self, detection: Detection3DPC):
         new_object = Object3D(f"obj_{self.cnt}", detection)
-        self.objects[new_object.track_id] = new_object
+        if new_object.track_id is not None:
+            self.objects[new_object.track_id] = new_object
         self.cnt += 1
         return new_object
 
-    def agent_encode(self) -> List[Any]:
+    def agent_encode(self) -> str:
         ret = []
         for obj in copy(self.objects).values():
             # we need at least 3 detectieons to consider it a valid object
@@ -205,8 +217,8 @@ def agent_encode(self) -> List[Any]:
             return "No objects detected yet."
         return "\n".join(ret)
 
-    def vlm_query(self, description: str) -> str:
-        imageDetections2D = super().vlm_query(description)
+    def vlm_query(self, description: str) -> Optional[Object3D]:  # type: ignore[override]
+        imageDetections2D = super().ask_vlm(description)
         print("VLM query found", imageDetections2D, "detections")
         time.sleep(3)
 
@@ -235,67 +247,6 @@ def vlm_query(self, description: str) -> str:
 
         return ret[0] if ret else None
 
-    @skill()
-    def remember_location(self, name: str) -> str:
-        """Remember the current location with a name."""
-        transform = self.tf.get("map", "sensor", time_point=time.time(), time_tolerance=1.0)
-        if not transform:
-            return f"Could not get current location transform from map to sensor"
-
-        pose = transform.to_pose()
-        pose.frame_id = "map"
-        self.remembered_locations[name] = pose
-        return f"Location '{name}' saved at position: {pose.position}"
-
-    @skill()
-    def goto_remembered_location(self, name: str) -> str:
-        """Go to a remembered location by name."""
-        pose = self.remembered_locations.get(name, None)
-        if not pose:
-            return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}"
-        self.goto(pose)
-        return f"Navigating to remembered location {name} and pose {pose}"
-
-    @skill()
-    def list_remembered_locations(self) -> List[str]:
-        """List all remembered locations."""
-        return str(list(self.remembered_locations.keys()))
-
-    def nav_to(self, target_pose) -> str:
-        target_pose.orientation = Quaternion(0.0, 0.0, 0.0, 0.0)
-        self.target.publish(target_pose)
-        time.sleep(0.1)
-        self.target.publish(target_pose)
-        self.goto(target_pose)
-
-    @skill()
-    def navigate_to_object_in_view(self, query: str) -> str:
-        """Navigate to an object in your current image view via natural language query using vision-language model to find it."""
-        target_obj = self.vlm_query(query)
-        if not target_obj:
-            return f"No objects found matching '{query}'"
-        return self.navigate_to_object_by_id(target_obj.track_id)
-
-    @skill(reducer=Reducer.all)
-    def list_objects(self):
-        """List all detected objects that the system remembers and can navigate to."""
-        data = self.agent_encode()
-        return data
-
-    @skill()
-    def navigate_to_object_by_id(self, object_id: str):
-        """Navigate to an object by an object id"""
-        target_obj = self.objects.get(object_id, None)
-        if not target_obj:
-            return f"Object {object_id} not found\nHere are the known objects:\n{str(self.agent_encode())}"
-        target_pose = target_obj.to_pose()
-        target_pose.frame_id = "map"
-        self.target.publish(target_pose)
-        time.sleep(0.1)
-        self.target.publish(target_pose)
-        self.nav_to(target_pose)
-        return f"Navigating to f{object_id} f{target_obj.name}"
-
     def lookup(self, label: str) -> List[Detection3DPC]:
         """Look up a detection by label."""
         return []
diff --git a/dimos/perception/detection/type/detection2d/person.py b/dimos/perception/detection/type/detection2d/person.py
index ef8b243297..d339dff39d 100644
--- a/dimos/perception/detection/type/detection2d/person.py
+++ b/dimos/perception/detection/type/detection2d/person.py
@@ -126,10 +126,15 @@ def from_ultralytics_result(
         class_id = int(result.boxes.cls[idx].cpu())
 
         # Extract keypoints
+        if result.keypoints.xy is None or result.keypoints.conf is None:
+            raise ValueError("Keypoints xy or conf data is missing from the result")
+
         keypoints = result.keypoints.xy[idx].cpu().numpy()
         keypoint_scores = result.keypoints.conf[idx].cpu().numpy()
         keypoints_norm = (
-            result.keypoints.xyn[idx].cpu().numpy() if hasattr(result.keypoints, "xyn") else None
+            result.keypoints.xyn[idx].cpu().numpy()
+            if hasattr(result.keypoints, "xyn") and result.keypoints.xyn is not None
+            else None
         )
 
         # Get image dimensions
diff --git a/dimos/perception/detection/type/detection3d/pointcloud.py b/dimos/perception/detection/type/detection3d/pointcloud.py
index 1949541830..6f9e4c2e05 100644
--- a/dimos/perception/detection/type/detection3d/pointcloud.py
+++ b/dimos/perception/detection/type/detection3d/pointcloud.py
@@ -259,7 +259,7 @@ def from_2d(  # type: ignore[override]
         # filters are to be adjusted based on the sensor noise characteristics if feeding
         # sensor data directly
         filters: Optional[list[PointCloudFilter]] = None,
-    ) -> Optional["Detection3D"]:
+    ) -> Optional["Detection3DPC"]:
         """Create a Detection3D from a 2D detection by projecting world pointcloud.
 
         This method handles:
diff --git a/dimos/perception/detection/type/test_object3d.py b/dimos/perception/detection/type/test_object3d.py
index c032664b46..1dc3cb6bd0 100644
--- a/dimos/perception/detection/type/test_object3d.py
+++ b/dimos/perception/detection/type/test_object3d.py
@@ -86,9 +86,9 @@ def test_object3d_repr_dict(first_object):
     assert encoded["last_seen"].endswith("s ago")
 
     # def test_object3d_image_property(first_object):
-    """Test image property returns best_detection's image."""
-    assert first_object.image is not None
-    assert first_object.image is first_object.best_detection.image
+    """Test get_image method returns best_detection's image."""
+    assert first_object.get_image() is not None
+    assert first_object.get_image() is first_object.best_detection.image
 
 
 def test_all_objeects(all_objects):

From 84541f1780d82e0cb27b9a27773f5c327398cd92 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 10:23:51 -0700
Subject: [PATCH 18/47] session level fixtures

---
 dimos/conftest.py                             | 78 +++++++++++++++----
 dimos/perception/detection/conftest.py        | 30 ++++---
 .../detection/detectors/conftest.py           |  6 +-
 .../detectors/person/test_person_detectors.py |  4 +-
 .../detectors/test_bbox_detectors.py          |  4 +-
 .../detection/type/test_detection3d.py        |  2 +-
 .../detection/type/test_detection3dpc.py      |  2 +-
 7 files changed, 91 insertions(+), 35 deletions(-)

diff --git a/dimos/conftest.py b/dimos/conftest.py
index e2a8a3ec36..d63736e5a7 100644
--- a/dimos/conftest.py
+++ b/dimos/conftest.py
@@ -24,12 +24,41 @@ def event_loop():
     loop.close()
 
 
+_session_threads = set()
 _seen_threads = set()
 _seen_threads_lock = threading.RLock()
+_before_test_threads = {}  # Map test name to set of thread IDs before test
 
 _skip_for = ["lcm", "heavy", "ros"]
 
 
+@pytest.fixture(scope="session", autouse=True)
+def track_session_threads():
+    """Track threads that exist at session start - these are not leaks."""
+    # Capture initial threads before any tests run
+    initial = threading.enumerate()
+    with _seen_threads_lock:
+        for t in initial:
+            if t.ident is not None:
+                _session_threads.add(t.ident)
+
+    yield
+
+    # Check for session-level thread leaks at teardown
+    final_threads = [
+        t
+        for t in threading.enumerate()
+        if t.name != "MainThread" and t.ident not in _session_threads
+    ]
+
+    if final_threads:
+        thread_info = [f"{t.name} (daemon={t.daemon})" for t in final_threads]
+        pytest.fail(
+            f"\n{len(final_threads)} thread(s) leaked during test session: {thread_info}\n"
+            "Session-scoped fixtures must clean up all threads in their teardown."
+        )
+
+
 @pytest.fixture(autouse=True)
 def monitor_threads(request):
     # Skip monitoring for tests marked with specified markers
@@ -37,24 +66,45 @@ def monitor_threads(request):
         yield
         return
 
+    # Capture threads before test runs
+    test_name = request.node.nodeid
+    with _seen_threads_lock:
+        _before_test_threads[test_name] = {
+            t.ident for t in threading.enumerate() if t.ident is not None
+        }
+
     yield
 
-    threads = [t for t in threading.enumerate() if t.name != "MainThread"]
+    # Only check for threads created BY THIS TEST, not existing ones
+    with _seen_threads_lock:
+        before = _before_test_threads.get(test_name, set())
+        current = {t.ident for t in threading.enumerate() if t.ident is not None}
 
-    if not threads:
-        return
+        # New threads are ones that exist now but didn't exist before this test
+        new_thread_ids = current - before
 
-    with _seen_threads_lock:
-        new_leaks = [t for t in threads if t.ident not in _seen_threads]
-        for t in threads:
-            _seen_threads.add(t.ident)
+        if not new_thread_ids:
+            return
 
-    if not new_leaks:
-        return
+        # Get the actual thread objects for new threads
+        new_threads = [
+            t for t in threading.enumerate() if t.ident in new_thread_ids and t.name != "MainThread"
+        ]
+
+        # Filter out threads we've already seen (from previous tests)
+        truly_new = [t for t in new_threads if t.ident not in _seen_threads]
+
+        # Mark all new threads as seen
+        for t in new_threads:
+            if t.ident is not None:
+                _seen_threads.add(t.ident)
+
+        if not truly_new:
+            return
 
-    thread_names = [t.name for f in new_leaks]
+        thread_names = [t.name for t in truly_new]
 
-    pytest.fail(
-        f"Non-closed threads before or during this test. The thread names: {thread_names}. "
-        "Please look at the first test that fails and fix that."
-    )
+        pytest.fail(
+            f"Non-closed threads created during this test. Thread names: {thread_names}. "
+            "Please look at the first test that fails and fix that."
+        )
diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index 6d0fabbceb..de0e0d21b6 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import functools
 from typing import Callable, Generator, Optional, TypedDict, Union
 
 import pytest
@@ -62,16 +63,18 @@ class Moment3D(Moment):
     detections3dpc: ImageDetections3DPC
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def tf():
     t = TF()
     yield t
     t.stop()
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def get_moment(tf):
+    @functools.lru_cache(maxsize=1)
     def moment_provider(**kwargs) -> Moment:
+        print("MOMENT PROVIDER ARGS:", kwargs)
         seek = kwargs.get("seek", 10.0)
 
         data_dir = "unitree_go2_lidar_corrected"
@@ -118,7 +121,7 @@ def moment_provider(**kwargs) -> Moment:
     return moment_provider
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def publish_moment():
     def publisher(moment: Moment | Moment2D | Moment3D):
         detections2d_val = moment.get("detections2d")
@@ -175,26 +178,27 @@ def publisher(moment: Moment | Moment2D | Moment3D):
     return publisher
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def detection2d(get_moment_2d) -> Detection2D:
-    moment = get_moment_2d(seek=10.0)
+    moment = get_moment_2d()
     assert len(moment["detections2d"]) > 0, "No detections found in the moment"
     return moment["detections2d"][0]
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def detection3dpc(get_moment_3dpc) -> Detection3DPC:
     moment = get_moment_3dpc(seek=10.0)
     assert len(moment["detections3dpc"]) > 0, "No detections found in the moment"
     return moment["detections3dpc"][0]
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def get_moment_2d(get_moment) -> Generator[Callable[[], Moment2D], None, None]:
     from dimos.perception.detection.detectors import Yolo2DDetector
 
     module = Detection2DModule(detector=Yolo2DDetector)
 
+    @functools.lru_cache(maxsize=1)
     def moment_provider(**kwargs) -> Moment2D:
         moment = get_moment(**kwargs)
         detections = module.process_image_frame(moment.get("image_frame"))
@@ -205,13 +209,15 @@ def moment_provider(**kwargs) -> Moment2D:
         }
 
     yield moment_provider
+
     module._close_module()
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def get_moment_3dpc(get_moment_2d) -> Generator[Callable[[], Moment3D], None, None]:
     module: Optional[Detection3DModule] = None
 
+    @functools.lru_cache(maxsize=1)
     def moment_provider(**kwargs) -> Moment3D:
         nonlocal module
         moment = get_moment_2d(**kwargs)
@@ -237,12 +243,11 @@ def moment_provider(**kwargs) -> Moment3D:
         }
 
     yield moment_provider
-    print("Closing 3D detection module", module)
     if module is not None:
         module._close_module()
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def object_db_module(get_moment):
     """Create and populate an ObjectDBModule with detections from multiple frames."""
     from dimos.perception.detection.detectors import Yolo2DDetector
@@ -274,12 +279,13 @@ def object_db_module(get_moment):
         moduleDB.add_detections(imageDetections3d)
 
     yield moduleDB
+
     module2d._close_module()
     module3d._close_module()
     moduleDB._close_module()
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def first_object(object_db_module):
     """Get the first object from the database."""
     objects = list(object_db_module.objects.values())
@@ -287,7 +293,7 @@ def first_object(object_db_module):
     return objects[0]
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def all_objects(object_db_module):
     """Get all objects from the database."""
     return list(object_db_module.objects.values())
diff --git a/dimos/perception/detection/detectors/conftest.py b/dimos/perception/detection/detectors/conftest.py
index cf4b1712e3..7caca818c9 100644
--- a/dimos/perception/detection/detectors/conftest.py
+++ b/dimos/perception/detection/detectors/conftest.py
@@ -20,19 +20,19 @@
 from dimos.utils.data import get_data
 
 
-@pytest.fixture()
+@pytest.fixture(scope="session")
 def test_image():
     """Load the test image used for detector tests."""
     return Image.from_file(get_data("cafe.jpg"))
 
 
-@pytest.fixture()
+@pytest.fixture(scope="session")
 def person_detector():
     """Create a YoloPersonDetector instance."""
     return YoloPersonDetector()
 
 
-@pytest.fixture()
+@pytest.fixture(scope="session")
 def bbox_detector():
     """Create a Yolo2DDetector instance for general object detection."""
     return Yolo2DDetector()
diff --git a/dimos/perception/detection/detectors/person/test_person_detectors.py b/dimos/perception/detection/detectors/person/test_person_detectors.py
index de0bbf34e8..bca39acbcd 100644
--- a/dimos/perception/detection/detectors/person/test_person_detectors.py
+++ b/dimos/perception/detection/detectors/person/test_person_detectors.py
@@ -17,12 +17,12 @@
 from dimos.perception.detection.type import Detection2DPerson, ImageDetections2D
 
 
-@pytest.fixture()
+@pytest.fixture(scope="session")
 def people(person_detector, test_image):
     return person_detector.process_image(test_image)
 
 
-@pytest.fixture()
+@pytest.fixture(scope="session")
 def person(people):
     return people[0]
 
diff --git a/dimos/perception/detection/detectors/test_bbox_detectors.py b/dimos/perception/detection/detectors/test_bbox_detectors.py
index 193238217e..d246ded8a3 100644
--- a/dimos/perception/detection/detectors/test_bbox_detectors.py
+++ b/dimos/perception/detection/detectors/test_bbox_detectors.py
@@ -17,13 +17,13 @@
 from dimos.perception.detection.type import Detection2D, ImageDetections2D
 
 
-@pytest.fixture(params=["bbox_detector", "person_detector"])
+@pytest.fixture(params=["bbox_detector", "person_detector"], scope="session")
 def detector(request):
     """Parametrized fixture that provides both bbox and person detectors."""
     return request.getfixturevalue(request.param)
 
 
-@pytest.fixture()
+@pytest.fixture(scope="session")
 def detections(detector, test_image):
     """Get ImageDetections2D from any detector."""
     return detector.process_image(test_image)
diff --git a/dimos/perception/detection/type/test_detection3d.py b/dimos/perception/detection/type/test_detection3d.py
index 2188583464..44413df1fe 100644
--- a/dimos/perception/detection/type/test_detection3d.py
+++ b/dimos/perception/detection/type/test_detection3d.py
@@ -18,7 +18,7 @@
 
 
 def test_guess_projection(get_moment_2d, publish_moment):
-    moment = get_moment_2d(seek=10.0)
+    moment = get_moment_2d()
     for key, value in moment.items():
         print(key, "====================================")
         print(value)
diff --git a/dimos/perception/detection/type/test_detection3dpc.py b/dimos/perception/detection/type/test_detection3dpc.py
index a25e27d458..c840f266f4 100644
--- a/dimos/perception/detection/type/test_detection3dpc.py
+++ b/dimos/perception/detection/type/test_detection3dpc.py
@@ -58,7 +58,7 @@ def test_detection3dpc(detection3dpc):
     # def test_point_cloud_properties(detection3dpc):
     """Test point cloud data and boundaries."""
     pc_points = detection3dpc.pointcloud.points()
-    assert len(pc_points) in [69, 70]
+    assert len(pc_points) > 60
     assert detection3dpc.pointcloud.frame_id == "world", (
         f"Expected frame_id 'world', got '{detection3dpc.pointcloud.frame_id}'"
     )

From 3d599d1be1a4c0a0141158fdd2f9567b60c6bd72 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 10:24:20 -0700
Subject: [PATCH 19/47] moondream integrated, generic huggingface model
 integration

---
 dimos/models/vl/base.py        |   4 +-
 dimos/models/vl/moondream.py   | 136 +++++++++++++++++++++++++++++++++
 dimos/models/vl/test_models.py |  68 +++++++++++++++++
 3 files changed, 206 insertions(+), 2 deletions(-)
 create mode 100644 dimos/models/vl/moondream.py
 create mode 100644 dimos/models/vl/test_models.py

diff --git a/dimos/models/vl/base.py b/dimos/models/vl/base.py
index f5e7a335e5..c7cb6457b3 100644
--- a/dimos/models/vl/base.py
+++ b/dimos/models/vl/base.py
@@ -61,7 +61,7 @@ def vlm_detection_to_detection2d(
 
 class VlModel(ABC):
     @abstractmethod
-    def query(self, image: Image, query: str) -> str: ...
+    def query(self, image: Image, query: str, **kwargs) -> str: ...
 
     # requery once if JSON parsing fails
     @retry(max_retries=2, on_exception=json.JSONDecodeError, delay=0.0)
@@ -69,7 +69,7 @@ def query_json(self, image: Image, query: str) -> dict:
         response = self.query(image, query)
         return extract_json(response)
 
-    def query_detections(self, image: Image, query: str) -> ImageDetections2D:
+    def query_detections(self, image: Image, query: str, **kwargs) -> ImageDetections2D:
         full_query = f"""show me bounding boxes in pixels for this query: `{query}`
 
         format should be:
diff --git a/dimos/models/vl/moondream.py b/dimos/models/vl/moondream.py
new file mode 100644
index 0000000000..1647f979fd
--- /dev/null
+++ b/dimos/models/vl/moondream.py
@@ -0,0 +1,136 @@
+import warnings
+from functools import cached_property
+from typing import Optional
+
+import numpy as np
+import torch
+from PIL import Image as PILImage
+from transformers import AutoModelForCausalLM
+
+from dimos.models.vl.base import VlModel
+from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
+
+
+class MoondreamVlModel(VlModel):
+    _model_name: str
+    _device: str
+    _dtype: torch.dtype
+
+    def __init__(
+        self,
+        model_name: str = "vikhyatk/moondream2",
+        device: Optional[str] = None,
+        dtype: torch.dtype = torch.bfloat16,
+    ):
+        self._model_name = model_name
+        self._device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self._dtype = dtype
+
+    @cached_property
+    def _model(self) -> AutoModelForCausalLM:
+        model = AutoModelForCausalLM.from_pretrained(
+            self._model_name,
+            trust_remote_code=True,
+            torch_dtype=self._dtype,
+        )
+        model = model.to(self._device)
+        model.compile()
+        return model
+
+    def query(self, image: Image | np.ndarray, query: str, **kwargs) -> str:
+        if isinstance(image, np.ndarray):
+            warnings.warn(
+                "MoondreamVlModel.query should receive standard dimos Image type, not a numpy array",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            image = Image.from_numpy(image)
+
+        # Convert dimos Image to PIL Image
+        # dimos Image stores data in RGB/BGR format, convert to RGB for PIL
+        rgb_image = image.to_rgb()
+        pil_image = PILImage.fromarray(rgb_image.data)
+
+        # Query the model
+        result = self._model.query(image=pil_image, question=query, reasoning=False)
+
+        # Handle both dict and string responses
+        if isinstance(result, dict):
+            return result.get("answer", str(result))
+
+        return str(result)
+
+    def query_detections(
+        self, image: Image, query: str, max_objects: int = 10
+    ) -> ImageDetections2D:
+        """Detect objects using Moondream's native detect method.
+
+        Args:
+            image: Input image
+            query: Object query (e.g., "person", "car")
+            max_objects: Maximum number of objects to detect
+
+        Returns:
+            ImageDetections2D containing detected bounding boxes
+        """
+        pil_image = PILImage.fromarray(image.data)
+
+        settings = {"max_objects": max_objects}
+        result = self._model.detect(pil_image, query, settings=settings)
+
+        # Convert to ImageDetections2D
+        image_detections = ImageDetections2D(image)
+
+        # Get image dimensions for converting normalized coords to pixels
+        height, width = image.height, image.width
+
+        for track_id, obj in enumerate(result.get("objects", [])):
+            # Convert normalized coordinates (0-1) to pixel coordinates
+            x_min_norm = obj["x_min"]
+            y_min_norm = obj["y_min"]
+            x_max_norm = obj["x_max"]
+            y_max_norm = obj["y_max"]
+
+            x1 = x_min_norm * width
+            y1 = y_min_norm * height
+            x2 = x_max_norm * width
+            y2 = y_max_norm * height
+
+            bbox = (x1, y1, x2, y2)
+
+            detection = Detection2DBBox(
+                bbox=bbox,
+                track_id=track_id,
+                class_id=-1,  # Moondream doesn't provide class IDs
+                confidence=1.0,  # Moondream doesn't provide confidence scores
+                name=query,  # Use the query as the object name
+                ts=image.ts,
+                image=image,
+            )
+
+            if detection.is_valid():
+                image_detections.detections.append(detection)
+
+        return image_detections
+
+
+if __name__ == "__main__":
+    from dimos.utils.data import get_data
+
+    # Load test image
+    image = Image.from_file(get_data("cafe.jpg"))
+
+    # Initialize the model
+    print("Loading Moondream model...")
+    model = MoondreamVlModel()
+
+    # Test text query
+    #    print("\nQuerying: 'What's in this image?'")
+    #    answer = model.query(image, "What's in this image?")
+    #    print(f"Answer: {answer}")
+
+    # Test detection query
+    print(model.query_detections(image, "person", max_objects=5))
+    print("detect glass")
+    print(model.query_detections(image, "glass", max_objects=5))
diff --git a/dimos/models/vl/test_models.py b/dimos/models/vl/test_models.py
new file mode 100644
index 0000000000..d5cd795929
--- /dev/null
+++ b/dimos/models/vl/test_models.py
@@ -0,0 +1,68 @@
+import time
+
+import pytest
+from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations
+
+from dimos.core import LCMTransport
+from dimos.models.vl.base import VlModel
+from dimos.models.vl.moondream import MoondreamVlModel
+from dimos.models.vl.qwen import QwenVlModel
+from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.type import ImageDetections2D
+from dimos.utils.data import get_data
+
+
+@pytest.mark.parametrize(
+    "model_class,model_name",
+    [
+        (MoondreamVlModel, "Moondream"),
+        (QwenVlModel, "Qwen"),
+    ],
+    ids=["moondream", "qwen"],
+)
+@pytest.mark.heavy
+def test_vlm(model_class, model_name):
+    image = Image.from_file(get_data("cafe.jpg")).to_rgb()
+
+    print(f"\n{'=' * 60}")
+    print(f"Testing {model_name}")
+    print(f"{'=' * 60}")
+
+    # Initialize model
+    print(f"Loading {model_name} model...")
+    model: VlModel = model_class()
+
+    queries = ["glasses", "blue shirt", "lightbulbs", "dog", "flowers on the table", "shoes"]
+
+    all_detections = ImageDetections2D(image)
+    query_times = []
+
+    for query in queries:
+        print(f"\nQuerying for: {query}")
+        start_time = time.time()
+        detections = model.query_detections(image, query, max_objects=5)
+        query_time = time.time() - start_time
+        query_times.append(query_time)
+
+        print(f"  Found {len(detections)} detections in {query_time:.3f}s")
+        all_detections.detections.extend(detections.detections)
+
+    avg_time = sum(query_times) / len(query_times) if query_times else 0
+    print(f"\n{model_name} Results:")
+    print(f"  Average query time: {avg_time:.3f}s")
+    print(f"  Total detections: {len(all_detections)}")
+    print(all_detections)
+
+    # Publish to LCM with model-specific channel names
+    annotations_transport: LCMTransport[ImageAnnotations] = LCMTransport(
+        "/annotations", ImageAnnotations
+    )
+    annotations_transport.publish(all_detections.to_foxglove_annotations())
+
+    image_transport: LCMTransport[Image] = LCMTransport("/image", Image)
+    image_transport.publish(image)
+
+    annotations_transport.lcm.stop()
+    image_transport.lcm.stop()
+
+    print(f"Published {model_name} annotations and image to LCM")

From 4d007eaa11d35f48a847c2f79aa23bc90982585d Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 11:05:13 -0700
Subject: [PATCH 20/47] slightly nicer bounding boxes, slightly better vlm
 tests

---
 dimos/models/vl/test_models.py                | 11 ++++-
 .../detection/type/detection2d/bbox.py        | 42 ++++++++++++++-----
 2 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/dimos/models/vl/test_models.py b/dimos/models/vl/test_models.py
index d5cd795929..0c313fc14e 100644
--- a/dimos/models/vl/test_models.py
+++ b/dimos/models/vl/test_models.py
@@ -32,7 +32,16 @@ def test_vlm(model_class, model_name):
     print(f"Loading {model_name} model...")
     model: VlModel = model_class()
 
-    queries = ["glasses", "blue shirt", "lightbulbs", "dog", "flowers on the table", "shoes"]
+    queries = [
+        "glasses",
+        "blue shirt",
+        "bulb",
+        "dog",
+        "flowers on the left table",
+        "shoes",
+        "leftmost persons ear",
+        "rightmost arm",
+    ]
 
     all_detections = ImageDetections2D(image)
     query_times = []
diff --git a/dimos/perception/detection/type/detection2d/bbox.py b/dimos/perception/detection/type/detection2d/bbox.py
index 1bec4a55d4..5b7e77f3ea 100644
--- a/dimos/perception/detection/type/detection2d/bbox.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -248,34 +248,54 @@ def to_text_annotation(self) -> List[TextAnnotation]:
 
         font_size = 20
 
-        return [
-            TextAnnotation(
-                timestamp=to_ros_stamp(self.ts),
-                position=Point2(x=x1, y=y2 + font_size),
-                text=f"confidence: {self.confidence:.3f}",
-                font_size=font_size,
-                text_color=Color(r=1.0, g=1.0, b=1.0, a=1),
-                background_color=Color(r=0, g=0, b=0, a=1),
-            ),
+        # Build label text - exclude class_id if it's -1 (VLM detection)
+        if self.class_id == -1:
+            label_text = f"{self.name}_{self.track_id}"
+        else:
+            label_text = f"{self.name}_{self.class_id}_{self.track_id}"
+
+        annotations = [
             TextAnnotation(
                 timestamp=to_ros_stamp(self.ts),
                 position=Point2(x=x1, y=y1),
-                text=f"{self.name}_{self.class_id}_{self.track_id}",
+                text=label_text,
                 font_size=font_size,
                 text_color=Color(r=1.0, g=1.0, b=1.0, a=1),
                 background_color=Color(r=0, g=0, b=0, a=1),
             ),
         ]
 
+        # Only show confidence if it's not 1.0
+        if self.confidence != 1.0:
+            annotations.append(
+                TextAnnotation(
+                    timestamp=to_ros_stamp(self.ts),
+                    position=Point2(x=x1, y=y2 + font_size),
+                    text=f"confidence: {self.confidence:.3f}",
+                    font_size=font_size,
+                    text_color=Color(r=1.0, g=1.0, b=1.0, a=1),
+                    background_color=Color(r=0, g=0, b=0, a=1),
+                )
+            )
+
+        return annotations
+
     def to_points_annotation(self) -> List[PointsAnnotation]:
         x1, y1, x2, y2 = self.bbox
 
         thickness = 1
 
+        # Use bright green for confidence 1.0, black otherwise
+        outline_color = (
+            Color(r=0.0, g=1.0, b=0.0, a=1.0)
+            if self.confidence == 1.0
+            else Color(r=0.0, g=0.0, b=0.0, a=1.0)
+        )
+
         return [
             PointsAnnotation(
                 timestamp=to_ros_stamp(self.ts),
-                outline_color=Color(r=0.0, g=0.0, b=0.0, a=1.0),
+                outline_color=outline_color,
                 fill_color=Color.from_string(self.name, alpha=0.15),
                 thickness=thickness,
                 points_length=4,

From ce6a923bc582468abcaba53dfbc349794cd0c83d Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 11:23:34 -0700
Subject: [PATCH 21/47] intelligent annotation font size, model warmup function

---
 dimos/models/vl/base.py                       |  8 +++++
 dimos/models/vl/moondream.py                  |  1 +
 dimos/models/vl/test_models.py                | 33 ++++++++++++-------
 .../detection/type/detection2d/bbox.py        |  3 +-
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/dimos/models/vl/base.py b/dimos/models/vl/base.py
index c7cb6457b3..cde41bd8fc 100644
--- a/dimos/models/vl/base.py
+++ b/dimos/models/vl/base.py
@@ -4,6 +4,7 @@
 
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
+from dimos.utils.data import get_data
 from dimos.utils.decorators import retry
 from dimos.utils.llm_utils import extract_json
 
@@ -63,6 +64,13 @@ class VlModel(ABC):
     @abstractmethod
     def query(self, image: Image, query: str, **kwargs) -> str: ...
 
+    def warmup(self) -> None:
+        try:
+            image = Image.from_file(get_data("cafe-smol.jpg")).to_rgb()
+            self._model.detect(image, "person", settings={"max_objects": 1})
+        except Exception:
+            pass
+
     # requery once if JSON parsing fails
     @retry(max_retries=2, on_exception=json.JSONDecodeError, delay=0.0)
     def query_json(self, image: Image, query: str) -> dict:
diff --git a/dimos/models/vl/moondream.py b/dimos/models/vl/moondream.py
index 1647f979fd..c147869778 100644
--- a/dimos/models/vl/moondream.py
+++ b/dimos/models/vl/moondream.py
@@ -36,6 +36,7 @@ def _model(self) -> AutoModelForCausalLM:
         )
         model = model.to(self._device)
         model.compile()
+
         return model
 
     def query(self, image: Image | np.ndarray, query: str, **kwargs) -> str:
diff --git a/dimos/models/vl/test_models.py b/dimos/models/vl/test_models.py
index 0c313fc14e..d8cf1ef819 100644
--- a/dimos/models/vl/test_models.py
+++ b/dimos/models/vl/test_models.py
@@ -8,6 +8,7 @@
 from dimos.models.vl.moondream import MoondreamVlModel
 from dimos.models.vl.qwen import QwenVlModel
 from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.detectors.yolo import Yolo2DDetector
 from dimos.perception.detection.type import ImageDetections2D
 from dimos.utils.data import get_data
 
@@ -24,19 +25,26 @@
 def test_vlm(model_class, model_name):
     image = Image.from_file(get_data("cafe.jpg")).to_rgb()
 
-    print(f"\n{'=' * 60}")
     print(f"Testing {model_name}")
-    print(f"{'=' * 60}")
 
     # Initialize model
     print(f"Loading {model_name} model...")
     model: VlModel = model_class()
 
+    model.warmup()
+
+    # Publish to LCM with model-specific channel names
+    annotations_transport: LCMTransport[ImageAnnotations] = LCMTransport(
+        "/annotations", ImageAnnotations
+    )
+    image_transport: LCMTransport[Image] = LCMTransport("/image", Image)
+    image_transport.publish(image)
+
     queries = [
         "glasses",
         "blue shirt",
         "bulb",
-        "dog",
+        "old man's face",
         "flowers on the left table",
         "shoes",
         "leftmost persons ear",
@@ -46,6 +54,15 @@ def test_vlm(model_class, model_name):
     all_detections = ImageDetections2D(image)
     query_times = []
 
+    # # First, run YOLO detection
+    # print("\nRunning YOLO detection...")
+    # yolo_detector = Yolo2DDetector()
+    # yolo_detections = yolo_detector.process_image(image)
+    # print(f"  YOLO found {len(yolo_detections.detections)} objects")
+    # all_detections.detections.extend(yolo_detections.detections)
+    # annotations_transport.publish(all_detections.to_foxglove_annotations())
+
+    # Then run VLM queries
     for query in queries:
         print(f"\nQuerying for: {query}")
         start_time = time.time()
@@ -55,6 +72,7 @@ def test_vlm(model_class, model_name):
 
         print(f"  Found {len(detections)} detections in {query_time:.3f}s")
         all_detections.detections.extend(detections.detections)
+        annotations_transport.publish(all_detections.to_foxglove_annotations())
 
     avg_time = sum(query_times) / len(query_times) if query_times else 0
     print(f"\n{model_name} Results:")
@@ -62,16 +80,7 @@ def test_vlm(model_class, model_name):
     print(f"  Total detections: {len(all_detections)}")
     print(all_detections)
 
-    # Publish to LCM with model-specific channel names
-    annotations_transport: LCMTransport[ImageAnnotations] = LCMTransport(
-        "/annotations", ImageAnnotations
-    )
     annotations_transport.publish(all_detections.to_foxglove_annotations())
 
-    image_transport: LCMTransport[Image] = LCMTransport("/image", Image)
-    image_transport.publish(image)
-
     annotations_transport.lcm.stop()
     image_transport.lcm.stop()
-
-    print(f"Published {model_name} annotations and image to LCM")
diff --git a/dimos/perception/detection/type/detection2d/bbox.py b/dimos/perception/detection/type/detection2d/bbox.py
index 5b7e77f3ea..4039157399 100644
--- a/dimos/perception/detection/type/detection2d/bbox.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -15,7 +15,6 @@
 from __future__ import annotations
 
 import hashlib
-from abc import abstractmethod
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union
 
@@ -246,7 +245,7 @@ def lcm_encode(self):
     def to_text_annotation(self) -> List[TextAnnotation]:
         x1, y1, x2, y2 = self.bbox
 
-        font_size = 20
+        font_size = self.image.width / 80
 
         # Build label text - exclude class_id if it's -1 (VLM detection)
         if self.class_id == -1:

From 484ae0d5a24869a3c91bccb6e0e00693ff93236c Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 11:33:10 -0700
Subject: [PATCH 22/47] messing with detections

---
 dimos/models/vl/test_models.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dimos/models/vl/test_models.py b/dimos/models/vl/test_models.py
index d8cf1ef819..4a75f4d0b0 100644
--- a/dimos/models/vl/test_models.py
+++ b/dimos/models/vl/test_models.py
@@ -30,7 +30,6 @@ def test_vlm(model_class, model_name):
     # Initialize model
     print(f"Loading {model_name} model...")
     model: VlModel = model_class()
-
     model.warmup()
 
     # Publish to LCM with model-specific channel names
@@ -44,7 +43,9 @@ def test_vlm(model_class, model_name):
         "glasses",
         "blue shirt",
         "bulb",
-        "old man's face",
+        "cigarette",
+        "reflection of a car",
+        "knee",
         "flowers on the left table",
         "shoes",
         "leftmost persons ear",

From d0fb0c0fe4938e98e391fd056627727cbdaedbd0 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 11:56:11 -0700
Subject: [PATCH 23/47] color brightness for from_string

---
 dimos/msgs/foxglove_msgs/Color.py             | 22 +++++++++++++++----
 .../detection/type/detection2d/bbox.py        |  8 ++-----
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/dimos/msgs/foxglove_msgs/Color.py b/dimos/msgs/foxglove_msgs/Color.py
index 30362f837a..59d60ccc35 100644
--- a/dimos/msgs/foxglove_msgs/Color.py
+++ b/dimos/msgs/foxglove_msgs/Color.py
@@ -22,12 +22,13 @@ class Color(LCMColor):
     """Color with convenience methods."""
 
     @classmethod
-    def from_string(cls, name: str, alpha: float = 0.2) -> Color:
+    def from_string(cls, name: str, alpha: float = 0.2, brightness: float = 1.0) -> Color:
         """Generate a consistent color from a string using hash function.
 
         Args:
             name: String to generate color from
             alpha: Transparency value (0.0-1.0)
+            brightness: Brightness multiplier (0.0-2.0). Values > 1.0 lighten towards white.
 
         Returns:
             Color instance with deterministic RGB values
@@ -41,10 +42,23 @@ def from_string(cls, name: str, alpha: float = 0.2) -> Color:
         g = hash_bytes[1] / 255.0
         b = hash_bytes[2] / 255.0
 
+        # Apply brightness adjustment
+        # If brightness > 1.0, mix with white to lighten
+        if brightness > 1.0:
+            mix_factor = brightness - 1.0  # 0.0 to 1.0
+            r = r + (1.0 - r) * mix_factor
+            g = g + (1.0 - g) * mix_factor
+            b = b + (1.0 - b) * mix_factor
+        else:
+            # If brightness < 1.0, darken by scaling
+            r *= brightness
+            g *= brightness
+            b *= brightness
+
         # Create and return color instance
         color = cls()
-        color.r = r
-        color.g = g
-        color.b = b
+        color.r = min(1.0, r)
+        color.g = min(1.0, g)
+        color.b = min(1.0, b)
         color.a = alpha
         return color
diff --git a/dimos/perception/detection/type/detection2d/bbox.py b/dimos/perception/detection/type/detection2d/bbox.py
index 4039157399..859ca21dee 100644
--- a/dimos/perception/detection/type/detection2d/bbox.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -284,12 +284,8 @@ def to_points_annotation(self) -> List[PointsAnnotation]:
 
         thickness = 1
 
-        # Use bright green for confidence 1.0, black otherwise
-        outline_color = (
-            Color(r=0.0, g=1.0, b=0.0, a=1.0)
-            if self.confidence == 1.0
-            else Color(r=0.0, g=0.0, b=0.0, a=1.0)
-        )
+        # Use consistent color based on object name, brighter for outline
+        outline_color = Color.from_string(self.name, alpha=1.0, brightness=1.25)
 
         return [
             PointsAnnotation(

From 33df73881ecc91dcdba2bbb22289f212e4200597 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 22:45:22 -0700
Subject: [PATCH 24/47] mobileclip for reid, transforms for detections

---
 data/.lfs/models_mobileclip.tar.gz            |   3 +
 data/.lfs/models_yolo.tar.gz                  |   4 +-
 dimos/models/vl/moondream.py                  |  27 +---
 dimos/models/vl/test_models.py                |  16 ++-
 .../detection/detectors/person/yolo.py        |  37 +++--
 dimos/perception/detection/module2D.py        |  85 ++++++++++-
 dimos/perception/detection/module3D.py        |  19 ++-
 dimos/perception/detection/reid/mobileclip.py |  48 +++++++
 .../detection/reid/test_mobileclip.py         | 136 ++++++++++++++++++
 .../detection/type/detection2d/bbox.py        |  30 ++--
 .../detection/type/detection2d/person.py      |   6 +
 .../modular/connection_module.py              |  26 +++-
 .../unitree_webrtc/modular/ivan_unitree.py    |  35 +++--
 13 files changed, 377 insertions(+), 95 deletions(-)
 create mode 100644 data/.lfs/models_mobileclip.tar.gz
 create mode 100644 dimos/perception/detection/reid/mobileclip.py
 create mode 100644 dimos/perception/detection/reid/test_mobileclip.py

diff --git a/data/.lfs/models_mobileclip.tar.gz b/data/.lfs/models_mobileclip.tar.gz
new file mode 100644
index 0000000000..874c94de07
--- /dev/null
+++ b/data/.lfs/models_mobileclip.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8022e365d9e456dcbd3913d36bf8c68a4cd086eb777c92a773c8192cd8235d
+size 277814612
diff --git a/data/.lfs/models_yolo.tar.gz b/data/.lfs/models_yolo.tar.gz
index aca0915dfd..650d4617ca 100644
--- a/data/.lfs/models_yolo.tar.gz
+++ b/data/.lfs/models_yolo.tar.gz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0ed4a5160d4edfda145b6752b5c49ad22bc2887b66b9b9c38bd8c35fb5ffaf8f
-size 9315806
+oid sha256:01796d5884cf29258820cf0e617bf834e9ffb63d8a4c7a54eea802e96fe6a818
+size 72476992
diff --git a/dimos/models/vl/moondream.py b/dimos/models/vl/moondream.py
index c147869778..a3b9f5fcca 100644
--- a/dimos/models/vl/moondream.py
+++ b/dimos/models/vl/moondream.py
@@ -62,9 +62,7 @@ def query(self, image: Image | np.ndarray, query: str, **kwargs) -> str:
 
         return str(result)
 
-    def query_detections(
-        self, image: Image, query: str, max_objects: int = 10
-    ) -> ImageDetections2D:
+    def query_detections(self, image: Image, query: str, **kwargs) -> ImageDetections2D:
         """Detect objects using Moondream's native detect method.
 
         Args:
@@ -77,7 +75,7 @@ def query_detections(
         """
         pil_image = PILImage.fromarray(image.data)
 
-        settings = {"max_objects": max_objects}
+        settings = {"max_objects": kwargs.get("max_objects", 5)}
         result = self._model.detect(pil_image, query, settings=settings)
 
         # Convert to ImageDetections2D
@@ -114,24 +112,3 @@ def query_detections(
                 image_detections.detections.append(detection)
 
         return image_detections
-
-
-if __name__ == "__main__":
-    from dimos.utils.data import get_data
-
-    # Load test image
-    image = Image.from_file(get_data("cafe.jpg"))
-
-    # Initialize the model
-    print("Loading Moondream model...")
-    model = MoondreamVlModel()
-
-    # Test text query
-    #    print("\nQuerying: 'What's in this image?'")
-    #    answer = model.query(image, "What's in this image?")
-    #    print(f"Answer: {answer}")
-
-    # Test detection query
-    print(model.query_detections(image, "person", max_objects=5))
-    print("detect glass")
-    print(model.query_detections(image, "glass", max_objects=5))
diff --git a/dimos/models/vl/test_models.py b/dimos/models/vl/test_models.py
index 4a75f4d0b0..66c6a2326a 100644
--- a/dimos/models/vl/test_models.py
+++ b/dimos/models/vl/test_models.py
@@ -32,13 +32,6 @@ def test_vlm(model_class, model_name):
     model: VlModel = model_class()
     model.warmup()
 
-    # Publish to LCM with model-specific channel names
-    annotations_transport: LCMTransport[ImageAnnotations] = LCMTransport(
-        "/annotations", ImageAnnotations
-    )
-    image_transport: LCMTransport[Image] = LCMTransport("/image", Image)
-    image_transport.publish(image)
-
     queries = [
         "glasses",
         "blue shirt",
@@ -63,6 +56,15 @@ def test_vlm(model_class, model_name):
     # all_detections.detections.extend(yolo_detections.detections)
     # annotations_transport.publish(all_detections.to_foxglove_annotations())
 
+    # Publish to LCM with model-specific channel names
+    annotations_transport: LCMTransport[ImageAnnotations] = LCMTransport(
+        "/annotations", ImageAnnotations
+    )
+
+    image_transport: LCMTransport[Image] = LCMTransport("/image", Image)
+
+    image_transport.publish(image)
+
     # Then run VLM queries
     for query in queries:
         print(f"\nQuerying for: {query}")
diff --git a/dimos/perception/detection/detectors/person/yolo.py b/dimos/perception/detection/detectors/person/yolo.py
index a4e764878c..72c1d92348 100644
--- a/dimos/perception/detection/detectors/person/yolo.py
+++ b/dimos/perception/detection/detectors/person/yolo.py
@@ -26,7 +26,7 @@
 
 
 class YoloPersonDetector(Detector):
-    def __init__(self, model_path="models_yolo", model_name="yolo11n-pose.pt", device="cpu"):
+    def __init__(self, model_path="models_yolo", model_name="yolo11s-pose.pt", device: str = None):
         """Initialize the YOLO person detector.
 
         Args:
@@ -34,17 +34,24 @@ def __init__(self, model_path="models_yolo", model_name="yolo11n-pose.pt", devic
             model_name (str): Name of the YOLO model weights file
             device (str): Device to run inference on ('cuda' or 'cpu')
         """
-        self.device = device
-        self.model = YOLO(get_data(model_path) / model_name, task="pose")
-
-        if is_cuda_available():
-            if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
-                onnxruntime.preload_dlls(cuda=True, cudnn=True)
-            self.device = "cuda"
-            logger.debug("Using CUDA for YOLO person detector")
+        self.model = YOLO(
+            get_data(model_path) / model_name,
+            task="track",
+        )
+        self.tracker = get_data(model_path) / "botsort.yaml"
+
+        if device:
+            self.device = device
+            return
         else:
-            self.device = "cpu"
-            logger.debug("Using CPU for YOLO person detector")
+            if is_cuda_available():
+                if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
+                    onnxruntime.preload_dlls(cuda=True, cudnn=True)
+                self.device = "cuda"
+                logger.info("Using CUDA for YOLO person detector")
+            else:
+                self.device = "cpu"
+                logger.info("Using CPU for YOLO person detector")
 
     def process_image(self, image: Image) -> ImageDetections2D:
         """Process image and return detection results.
@@ -55,5 +62,11 @@ def process_image(self, image: Image) -> ImageDetections2D:
         Returns:
             ImageDetections2D containing Detection2DPerson objects with pose keypoints
         """
-        results = self.model(source=image.to_opencv(), device=self.device)
+        results = self.model.track(
+            source=image.to_opencv(),
+            verbose=False,
+            conf=0.5,
+            tracker=self.tracker,
+            persist=True,
+        )
         return ImageDetections2D.from_ultralytics_result(image, results)
diff --git a/dimos/perception/detection/module2D.py b/dimos/perception/detection/module2D.py
index 50c3010d4b..ec87107fce 100644
--- a/dimos/perception/detection/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -12,22 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Tuple
 
 from dimos_lcm.foxglove_msgs.ImageAnnotations import (
     ImageAnnotations,
 )
+from dimos_lcm.sensor_msgs import CameraInfo
 from reactivex import operators as ops
 from reactivex.observable import Observable
 from reactivex.subject import Subject
 
 from dimos.core import In, Module, Out, rpc
 from dimos.core.module import ModuleConfig
+from dimos.msgs.geometry_msgs import Transform, Vector3
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.sensor_msgs.Image import sharpness_barrier
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection.detectors import Detector
 from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
+from dimos.perception.detection.detectors.yolo import Yolo2DDetector
 from dimos.perception.detection.type import (
     ImageDetections2D,
 )
@@ -39,6 +42,7 @@
 class Config(ModuleConfig):
     max_freq: float = 10  # hz
     detector: Optional[Callable[[Any], Detector]] = YoloPersonDetector
+    camera_info: CameraInfo = CameraInfo()
 
 
 class Detection2DModule(Module):
@@ -55,11 +59,14 @@ class Detection2DModule(Module):
     detected_image_1: Out[Image] = None  # type: ignore
     detected_image_2: Out[Image] = None  # type: ignore
 
+    cnt: int = 0
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.config: Config = Config(**kwargs)
         self.detector = self.config.detector()
         self.vlm_detections_subject = Subject()
+        self.previous_detection_count = 0
 
     def process_image_frame(self, image: Image) -> ImageDetections2D:
         return self.detector.process_image(image)
@@ -74,13 +81,81 @@ def sharp_image_stream(self) -> Observable[Image]:
 
     @simple_mcache
     def detection_stream_2d(self) -> Observable[ImageDetections2D]:
-        return backpressure(self.sharp_image_stream().pipe(ops.map(self.process_image_frame)))
+        return backpressure(self.image.observable().pipe(ops.map(self.process_image_frame)))
+
+    def pixel_to_3d(
+        self,
+        pixel: Tuple[int, int],
+        camera_info: CameraInfo,
+        assumed_depth: float = 1.0,
+    ) -> Vector3:
+        """Unproject 2D pixel coordinates to 3D position in camera optical frame.
+
+        Args:
+            camera_info: Camera calibration information
+            assumed_depth: Assumed depth in meters (default 1.0m from camera)
+
+        Returns:
+            Vector3 position in camera optical frame coordinates
+        """
+        # Extract camera intrinsics
+        fx, fy = camera_info.K[0], camera_info.K[4]
+        cx, cy = camera_info.K[2], camera_info.K[5]
+
+        # Unproject pixel to normalized camera coordinates
+        x_norm = (pixel[0] - cx) / fx
+        y_norm = (pixel[1] - cy) / fy
+
+        # Create 3D point at assumed depth in camera optical frame
+        # Camera optical frame: X right, Y down, Z forward
+        return Vector3(x_norm * assumed_depth, y_norm * assumed_depth, assumed_depth)
+
+    def track(self, detections: ImageDetections2D):
+        sensor_frame = self.tf.get("sensor", "camera_optical", detections.image.ts, 5.0)
+
+        if not sensor_frame:
+            return
+
+        if not detections.detections:
+            return
+
+        sensor_frame.child_frame_id = "sensor_frame"
+        transforms = [sensor_frame]
+
+        current_count = len(detections.detections)
+        max_count = max(current_count, self.previous_detection_count)
+
+        # Publish transforms for all detection slots up to max_count
+        for index in range(max_count):
+            if index < current_count:
+                # Active detection - compute real position
+                detection = detections.detections[index]
+                position_3d = self.pixel_to_3d(
+                    detection.center_bbox, self.config.camera_info, assumed_depth=1.0
+                )
+            else:
+                # No detection at this index - publish zero transform
+                position_3d = Vector3(0.0, 0.0, 0.0)
+
+            transforms.append(
+                Transform(
+                    frame_id=sensor_frame.child_frame_id,
+                    child_frame_id=f"det_{index}",
+                    ts=detections.image.ts,
+                    translation=position_3d,
+                )
+            )
+
+        self.previous_detection_count = current_count
+        self.tf.publish(*transforms)
 
     @rpc
     def start(self):
-        self.detection_stream_2d().subscribe(
-            lambda det: self.detections.publish(det.to_ros_detection2d_array())
-        )
+        self.detection_stream_2d().subscribe(self.track)
+
+        # self.detection_stream_2d().subscribe(
+        #    lambda det: self.detections.publish(det.to_ros_detection2d_array())
+        # )
 
         self.detection_stream_2d().subscribe(
             lambda det: self.annotations.publish(det.to_foxglove_annotations())
diff --git a/dimos/perception/detection/module3D.py b/dimos/perception/detection/module3D.py
index a09cdb0e74..68e98afe3f 100644
--- a/dimos/perception/detection/module3D.py
+++ b/dimos/perception/detection/module3D.py
@@ -15,7 +15,9 @@
 
 from typing import Optional
 
+from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations
 from dimos_lcm.sensor_msgs import CameraInfo
+from lcm_msgs.foxglove_msgs import SceneUpdate
 from reactivex import operators as ops
 from reactivex.observable import Observable
 
@@ -23,6 +25,8 @@
 from dimos.core import In, Out, rpc
 from dimos.msgs.geometry_msgs import Transform
 from dimos.msgs.sensor_msgs import Image, PointCloud2
+from dimos.msgs.vision_msgs import Detection2DArray
+from dimos.perception.detection.module2D import Config as Module2DConfig
 from dimos.perception.detection.module2D import Detection2DModule
 from dimos.perception.detection.type import (
     ImageDetections2D,
@@ -33,12 +37,17 @@
 from dimos.utils.reactive import backpressure
 
 
-class Detection3DModule(Detection2DModule):
-    camera_info: CameraInfo
+class Config(Module2DConfig): ...
+
 
+class Detection3DModule(Detection2DModule):
     image: In[Image] = None  # type: ignore
     pointcloud: In[PointCloud2] = None  # type: ignore
 
+    detections: Out[Detection2DArray] = None  # type: ignore
+    annotations: Out[ImageAnnotations] = None  # type: ignore
+    scene_update: Out[SceneUpdate] = None  # type: ignore
+
     # just for visualization,
     # emits latest pointclouds of detected objects in a frame
     detected_pointcloud_0: Out[PointCloud2] = None  # type: ignore
@@ -52,10 +61,6 @@ class Detection3DModule(Detection2DModule):
 
     detection_3d_stream: Optional[Observable[ImageDetections3DPC]] = None
 
-    def __init__(self, camera_info: CameraInfo, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.camera_info = camera_info
-
     def process_frame(
         self,
         detections: ImageDetections2D,
@@ -70,7 +75,7 @@ def process_frame(
             detection3d = Detection3DPC.from_2d(
                 detection,
                 world_pointcloud=pointcloud,
-                camera_info=self.camera_info,
+                camera_info=self.config.camera_info,
                 world_to_optical_transform=transform,
             )
             if detection3d is not None:
diff --git a/dimos/perception/detection/reid/mobileclip.py b/dimos/perception/detection/reid/mobileclip.py
new file mode 100644
index 0000000000..0ed800e4ee
--- /dev/null
+++ b/dimos/perception/detection/reid/mobileclip.py
@@ -0,0 +1,48 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import open_clip
+import torch
+import torch.nn.functional as F
+from PIL import Image
+
+
+def test_embed():
+    # 1) Pick a MobileCLIP variant that OpenCLIP exposes directly
+    # Good starts: 'MobileCLIP-S2' or 'MobileCLIP-B' with pretrained='datacompdr'
+    model_name = "MobileCLIP-S2"
+    pretrained = "datacompdr"  # OpenCLIP key
+    device = "cuda"
+
+    model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
+    tokenizer = open_clip.get_tokenizer(model_name)
+    model = model.eval().to(device)
+
+    # 2) Encode an image (or crops) → unit-norm embedding
+    def embed_images(imgs_rgb: list[Image.Image]) -> np.ndarray:
+        with torch.inference_mode(), torch.cuda.amp.autocast(True):
+            batch = torch.stack([preprocess(im.convert("RGB")) for im in imgs_rgb]).to(device)
+            feats = model.encode_image(batch)
+            feats = F.normalize(feats, dim=-1)
+        return feats.detach().cpu().numpy()
+
+    # 3) Cosine distance for re-ID
+    def cosine_distance(u, v):  # u,v are L2-normalized
+        return 1.0 - float((u @ v))
+
+    # Example
+    im = Image.open("person_crop.jpg")
+    emb = embed_images([im])[0]
+    print(emb.shape)  # e.g. (512,) depending on backbone
diff --git a/dimos/perception/detection/reid/test_mobileclip.py b/dimos/perception/detection/reid/test_mobileclip.py
new file mode 100644
index 0000000000..755ea5a4ee
--- /dev/null
+++ b/dimos/perception/detection/reid/test_mobileclip.py
@@ -0,0 +1,136 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import open_clip
+import pytest
+import torch
+import torch.nn.functional as F
+from PIL import Image as PILImage
+
+from dimos.msgs.sensor_msgs import Image
+from dimos.utils.data import get_data
+
+
+@pytest.fixture(scope="session")
+def mobileclip_model():
+    """Load MobileCLIP model once for all tests."""
+    model_name = "MobileCLIP2-S0"
+    model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    model, _, preprocess = open_clip.create_model_and_transforms(
+        model_name, pretrained=str(model_path)
+    )
+    tokenizer = open_clip.get_tokenizer(model_name)
+    model = model.eval().to(device)
+
+    return {
+        "model": model,
+        "preprocess": preprocess,
+        "tokenizer": tokenizer,
+        "device": device,
+    }
+
+
+@pytest.fixture(scope="session")
+def test_image():
+    """Load test image."""
+    return Image.from_file(get_data("cafe.jpg")).to_rgb()
+
+
+def embed_images(model_dict, pil_images):
+    """Embed PIL images using MobileCLIP."""
+    model = model_dict["model"]
+    preprocess = model_dict["preprocess"]
+    device = model_dict["device"]
+
+    with torch.inference_mode():
+        batch = torch.stack([preprocess(img) for img in pil_images]).to(device)
+        feats = model.encode_image(batch)
+        feats = F.normalize(feats, dim=-1)
+    return feats.detach().cpu().numpy()
+
+
+@pytest.mark.heavy
+def test_mobileclip_embedding(mobileclip_model, test_image):
+    """Test that MobileCLIP can embed the test image."""
+    # Convert to PIL
+    pil_image = PILImage.fromarray(test_image.to_opencv())
+
+    # Embed
+    embedding = embed_images(mobileclip_model, [pil_image])[0]
+
+    print(f"\nEmbedding shape: {embedding.shape}")
+    print(f"Embedding dtype: {embedding.dtype}")
+    print(f"Embedding norm: {np.linalg.norm(embedding):.4f}")
+    print(f"Embedding min/max: [{embedding.min():.4f}, {embedding.max():.4f}]")
+
+    # Validate embedding
+    assert embedding.shape[0] > 0, "Embedding should have features"
+    assert embedding.dtype == np.float32 or embedding.dtype == np.float64
+    assert np.isfinite(embedding).all(), "Embedding should contain finite values"
+
+    # Check L2 normalization (should be ~1.0)
+    norm = np.linalg.norm(embedding)
+    assert abs(norm - 1.0) < 0.01, f"Embedding should be L2 normalized, got norm={norm}"
+
+
+@pytest.mark.heavy
+def test_mobileclip_text_similarity(mobileclip_model, test_image):
+    """Test text-image similarity with MobileCLIP."""
+    model = mobileclip_model["model"]
+    tokenizer = mobileclip_model["tokenizer"]
+    device = mobileclip_model["device"]
+
+    # Get image embedding
+    pil_image = PILImage.fromarray(test_image.to_opencv())
+    img_embedding = embed_images(mobileclip_model, [pil_image])[0]
+
+    # Encode text queries
+    queries = ["a cafe", "a person", "a car", "a dog", "potato", "food", "dinner", "rock"]
+
+    with torch.inference_mode():
+        text_tokens = tokenizer(queries).to(device)
+        text_features = model.encode_text(text_tokens)
+        text_features = F.normalize(text_features, dim=-1)
+        text_embeddings = text_features.detach().cpu().numpy()
+
+    # Compute similarities (cosine similarity = 1 - cosine distance)
+    similarities = {}
+    for query, text_emb in zip(queries, text_embeddings):
+        similarity = float(img_embedding @ text_emb)
+        similarities[query] = similarity
+        print(f"\n'{query}': {similarity:.4f}")
+
+    # Cafe image should match "a cafe" better than "a dog"
+    assert similarities["a cafe"] > similarities["a dog"], "Should recognize cafe scene"
+    assert similarities["a person"] > similarities["a car"], "Should detect people in cafe"
+
+
+@pytest.mark.heavy
+def test_mobileclip_cosine_distance(mobileclip_model, test_image):
+    """Test cosine distance metric for re-identification."""
+    pil_image = PILImage.fromarray(test_image.to_opencv())
+
+    # Embed same image twice
+    emb1 = embed_images(mobileclip_model, [pil_image])[0]
+    emb2 = embed_images(mobileclip_model, [pil_image])[0]
+
+    # Cosine distance between same image should be ~0
+    cosine_dist = 1.0 - float(emb1 @ emb2)
+
+    print(f"\nCosine distance (same image): {cosine_dist:.6f}")
+
+    assert cosine_dist < 0.01, f"Same image should have distance ~0, got {cosine_dist}"
diff --git a/dimos/perception/detection/type/detection2d/bbox.py b/dimos/perception/detection/type/detection2d/bbox.py
index 859ca21dee..6eaeb919b2 100644
--- a/dimos/perception/detection/type/detection2d/bbox.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -139,6 +139,12 @@ def __str__(self):
             console.print(*parts, end="")
         return capture.get().strip()
 
+    @property
+    def center_bbox(self) -> Tuple[float, float]:
+        """Get center point of bounding box."""
+        x1, y1, x2, y2 = self.bbox
+        return ((x1 + x2) / 2, (y1 + y2) / 2)
+
     def bbox_2d_volume(self) -> float:
         x1, y1, x2, y2 = self.bbox
         width = max(0.0, x2 - x1)
@@ -291,7 +297,7 @@ def to_points_annotation(self) -> List[PointsAnnotation]:
             PointsAnnotation(
                 timestamp=to_ros_stamp(self.ts),
                 outline_color=outline_color,
-                fill_color=Color.from_string(self.name, alpha=0.15),
+                fill_color=Color.from_string(self.name, alpha=0.2),
                 thickness=thickness,
                 points_length=4,
                 points=[
@@ -348,8 +354,6 @@ def from_ros_detection2d(cls, ros_det: ROSDetection2D, **kwargs) -> "Detection2D
         # Extract timestamp
         ts = to_timestamp(ros_det.header.stamp)
 
-        # Name is not stored in ROS Detection2D, so we'll use a placeholder
-        # Remove 'name' from kwargs if present to avoid duplicate
         name = kwargs.pop("name", f"class_{class_id}")
 
         return cls(
@@ -413,23 +417,7 @@ def from_ultralytics_result(
                 else:
                     # Regular bbox detection
                     detection = Detection2DBBox.from_ultralytics_result(result, i, image)
-                detections.append(detection)
+                if detection.is_valid():
+                    detections.append(detection)
 
         return cls(image=image, detections=detections)
-
-    @classmethod
-    def from_pose_detector(
-        cls, image: Image, people: Sequence["Detection2DPerson"], **kwargs
-    ) -> "ImageDetections2D":
-        """Create ImageDetections2D from a list of Detection2DPerson detections.
-        Args:
-            image: Source image
-            people: Sequence of Detection2DPerson objects with pose keypoints
-        Returns:
-            ImageDetections2D containing the pose detections
-        """
-        detections: List[Detection2D] = list(people)
-        return cls(
-            image=image,
-            detections=detections,
-        )
diff --git a/dimos/perception/detection/type/detection2d/person.py b/dimos/perception/detection/type/detection2d/person.py
index d339dff39d..4390437ede 100644
--- a/dimos/perception/detection/type/detection2d/person.py
+++ b/dimos/perception/detection/type/detection2d/person.py
@@ -25,6 +25,7 @@
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.type.detection2d.bbox import Bbox, Detection2DBBox
 from dimos.types.timestamped import to_ros_stamp
+from dimos.utils.decorators.decorators import simple_mcache
 
 if TYPE_CHECKING:
     from ultralytics.engine.results import Results
@@ -193,6 +194,11 @@ def get_visible_keypoints(self, threshold: float = 0.5) -> List[Tuple[str, np.nd
                 visible.append((name, self.keypoints[i], score))
         return visible
 
+    @simple_mcache
+    def is_valid(self) -> bool:
+        valid_keypoints = sum(1 for score in self.keypoint_scores if score > 0.8)
+        return valid_keypoints >= 5
+
     @property
     def width(self) -> float:
         """Get width of bounding box."""
diff --git a/dimos/robot/unitree_webrtc/modular/connection_module.py b/dimos/robot/unitree_webrtc/modular/connection_module.py
index 6e13ed938e..57f508b552 100644
--- a/dimos/robot/unitree_webrtc/modular/connection_module.py
+++ b/dimos/robot/unitree_webrtc/modular/connection_module.py
@@ -30,7 +30,8 @@
 from reactivex.observable import Observable
 
 from dimos.agents2 import Agent, Output, Reducer, Stream, skill
-from dimos.core import DimosCluster, In, LCMTransport, Module, ModuleConfig, Out, rpc
+from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE
+from dimos.core import DimosCluster, In, LCMTransport, Module, ModuleConfig, Out, pSHMTransport, rpc
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Twist, Vector3
 from dimos.msgs.sensor_msgs.Image import Image, sharpness_window
@@ -175,7 +176,7 @@ def start(self):
             case "webrtc":
                 self.connection = UnitreeWebRTCConnection(**self.connection_config)
             case "fake":
-                self.connection = FakeRTC(**self.connection_config)
+                self.connection = FakeRTC(**self.connection_config, seek=12.0)
             case "mujoco":
                 from dimos.robot.unitree_webrtc.mujoco_connection import MujocoConnection
 
@@ -223,10 +224,19 @@ def _odom_to_tf(self, odom: PoseStamped) -> List[Transform]:
             ts=odom.ts,
         )
 
+        sensor = Transform(
+            translation=Vector3(0.0, 0.0, 0.0),
+            rotation=Quaternion(0.0, 0.0, 0.0, 1.0),
+            frame_id="world",
+            child_frame_id="sensor",
+            ts=odom.ts,
+        )
+
         return [
             Transform.from_pose("base_link", odom),
             camera_link,
             camera_optical,
+            sensor,
         ]
 
     def _publish_tf(self, msg):
@@ -302,9 +312,19 @@ def deploy_connection(dimos: DimosCluster, **kwargs):
         **kwargs,
     )
 
-    connection.lidar.transport = LCMTransport("/lidar", LidarMessage)
     connection.odom.transport = LCMTransport("/odom", PoseStamped)
+
+    #    connection.video.transport = pSHMTransport(
+    #        "/image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE
+    #    )
+
+    #    connection.lidar.transport = pSHMTransport(
+    #        "/lidar", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE
+    #    )
+
     connection.video.transport = LCMTransport("/image", Image)
+    connection.lidar.transport = LCMTransport("/lidar", LidarMessage)
+
     connection.movecmd.transport = LCMTransport("/cmd_vel", Vector3)
     connection.camera_info.transport = LCMTransport("/camera_info", CameraInfo)
 
diff --git a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
index e892ad35dc..95ace0c423 100644
--- a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
+++ b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
@@ -15,7 +15,7 @@
 import logging
 import time
 
-from lcm_msgs.foxglove_msgs import SceneUpdate
+from dimos_lcm.foxglove_msgs import SceneUpdate
 
 from dimos.agents2.spec import Model, Provider
 from dimos.core import LCMTransport, start
@@ -24,8 +24,10 @@
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection.moduleDB import ObjectDBModule
+from dimos.perception.detection.module2D import Detection2DModule
+from dimos.perception.detection.module3D import Detection3DModule
 from dimos.protocol.pubsub import lcm
+from dimos.robot.foxglove_bridge import FoxgloveBridge
 from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation
 from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
 from dimos.utils.logging_config import setup_logger
@@ -44,30 +46,37 @@ def goto(pose):
         return True
 
     module3D = dimos.deploy(
-        ObjectDBModule,
-        goto=goto,
+        Detection2DModule,
+        # goto=goto,
         camera_info=ConnectionModule._camera_info(),
     )
 
     module3D.image.connect(connection.video)
     # module3D.pointcloud.connect(mapper.global_map)
-    module3D.pointcloud.connect(connection.lidar)
+    # module3D.pointcloud.connect(connection.lidar)
 
     module3D.annotations.transport = LCMTransport("/annotations", ImageAnnotations)
     module3D.detections.transport = LCMTransport("/detections", Detection2DArray)
 
-    module3D.detected_pointcloud_0.transport = LCMTransport("/detected/pointcloud/0", PointCloud2)
-    module3D.detected_pointcloud_1.transport = LCMTransport("/detected/pointcloud/1", PointCloud2)
-    module3D.detected_pointcloud_2.transport = LCMTransport("/detected/pointcloud/2", PointCloud2)
+    # module3D.detected_pointcloud_0.transport = LCMTransport("/detected/pointcloud/0", PointCloud2)
+    # module3D.detected_pointcloud_1.transport = LCMTransport("/detected/pointcloud/1", PointCloud2)
+    # module3D.detected_pointcloud_2.transport = LCMTransport("/detected/pointcloud/2", PointCloud2)
 
     module3D.detected_image_0.transport = LCMTransport("/detected/image/0", Image)
     module3D.detected_image_1.transport = LCMTransport("/detected/image/1", Image)
     module3D.detected_image_2.transport = LCMTransport("/detected/image/2", Image)
-
-    module3D.scene_update.transport = LCMTransport("/scene_update", SceneUpdate)
+    # module3D.scene_update.transport = LCMTransport("/scene_update", SceneUpdate)
 
     module3D.start()
     connection.start()
+    bridge = FoxgloveBridge(
+        #        shm_channels=[
+        #            "/image#sensor_msgs.Image",
+        #            "/lidar#sensor_msgs.PointCloud2",
+        #        ]
+    )
+    # bridge = FoxgloveBridge()
+    bridge.start()
 
     from dimos.agents2 import Agent, Output, Reducer, Stream, skill
     from dimos.agents2.cli.human import HumanInput
@@ -84,10 +93,10 @@ def goto(pose):
     agent.register_skills(module3D)
 
     # agent.run_implicit_skill("video_stream_tool")
-    agent.run_implicit_skill("human")
+    # agent.run_implicit_skill("human")
 
-    agent.start()
-    agent.loop_thread()
+    # agent.start()
+    # agent.loop_thread()
 
     try:
         while True:

From 57d5296e2c792390123ec8f83a15d6d8fee5ded3 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Sun, 12 Oct 2025 23:55:16 -0700
Subject: [PATCH 25/47] detection reconstruction in another module

---
 dimos/msgs/vision_msgs/Detection2DArray.py    |  7 +++-
 dimos/perception/detection/conftest.py        |  2 +-
 dimos/perception/detection/detectors/yolo.py  |  7 +++-
 dimos/perception/detection/module2D.py        |  6 +--
 .../detection/type/detection2d/bbox.py        | 13 +++++++
 .../detection/type/imageDetections.py         |  3 +-
 .../detection/type/test_detection2d.py        | 38 +++++++++++++++++++
 7 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/dimos/msgs/vision_msgs/Detection2DArray.py b/dimos/msgs/vision_msgs/Detection2DArray.py
index 133893b9f0..79c84f7609 100644
--- a/dimos/msgs/vision_msgs/Detection2DArray.py
+++ b/dimos/msgs/vision_msgs/Detection2DArray.py
@@ -11,12 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from dimos_lcm.vision_msgs.Detection2DArray import Detection2DArray as LCMDetection2DArray
 
+from dimos.types.timestamped import to_timestamp
+
 
 class Detection2DArray(LCMDetection2DArray):
     msg_name = "vision_msgs.Detection2DArray"
 
     # for _get_field_type() to work when decoding in _decode_one()
     __annotations__ = LCMDetection2DArray.__annotations__
+
+    @property
+    def ts(self) -> float:
+        return to_timestamp(self.header.stamp)
diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index de0e0d21b6..9016713cff 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -196,7 +196,7 @@ def detection3dpc(get_moment_3dpc) -> Detection3DPC:
 def get_moment_2d(get_moment) -> Generator[Callable[[], Moment2D], None, None]:
     from dimos.perception.detection.detectors import Yolo2DDetector
 
-    module = Detection2DModule(detector=Yolo2DDetector)
+    module = Detection2DModule(detector=lambda: Yolo2DDetector(device="cpu"))
 
     @functools.lru_cache(maxsize=1)
     def moment_provider(**kwargs) -> Moment2D:
diff --git a/dimos/perception/detection/detectors/yolo.py b/dimos/perception/detection/detectors/yolo.py
index af457540cc..459da20579 100644
--- a/dimos/perception/detection/detectors/yolo.py
+++ b/dimos/perception/detection/detectors/yolo.py
@@ -29,7 +29,7 @@
 
 
 class Yolo2DDetector(Detector):
-    def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device="cpu"):
+    def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device: str = None):
         """
         Initialize the YOLO detector.
 
@@ -38,11 +38,14 @@ def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device="
             model_name (str): Name of the YOLO model weights file
             device (str): Device to run inference on ('cuda' or 'cpu')
         """
-        self.device = device
         self.model = YOLO(get_data(model_path) / model_name, task="detect")
 
         module_dir = os.path.dirname(__file__)
         self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml")
+
+        if device:
+            self.device = device
+            return
         if is_cuda_available():
             if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
                 onnxruntime.preload_dlls(cuda=True, cudnn=True)
diff --git a/dimos/perception/detection/module2D.py b/dimos/perception/detection/module2D.py
index ec87107fce..86dcfd2ab3 100644
--- a/dimos/perception/detection/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -153,9 +153,9 @@ def track(self, detections: ImageDetections2D):
     def start(self):
         self.detection_stream_2d().subscribe(self.track)
 
-        # self.detection_stream_2d().subscribe(
-        #    lambda det: self.detections.publish(det.to_ros_detection2d_array())
-        # )
+        self.detection_stream_2d().subscribe(
+            lambda det: self.detections.publish(det.to_ros_detection2d_array())
+        )
 
         self.detection_stream_2d().subscribe(
             lambda det: self.annotations.publish(det.to_foxglove_annotations())
diff --git a/dimos/perception/detection/type/detection2d/bbox.py b/dimos/perception/detection/type/detection2d/bbox.py
index 6eaeb919b2..554d99cf3c 100644
--- a/dimos/perception/detection/type/detection2d/bbox.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -383,6 +383,19 @@ def to_ros_detection2d(self) -> ROSDetection2D:
 
 
 class ImageDetections2D(ImageDetections[Detection2D]):
+    @classmethod
+    def from_ros_detection2d_array(
+        cls, image: Image, ros_detections: Sequence[ROSDetection2D], **kwargs
+    ) -> "ImageDetections2D":
+        """Convert from ROS Detection2DArray message to ImageDetections2D object."""
+        detections: List[Detection2D] = []
+        for ros_det in ros_detections.detections:
+            detection = Detection2DBBox.from_ros_detection2d(ros_det, image=image, **kwargs)
+            if detection.is_valid():
+                detections.append(detection)
+
+        return cls(image=image, detections=detections)
+
     @classmethod
     def from_ultralytics_result(
         cls, image: Image, results: List[Results], **kwargs
diff --git a/dimos/perception/detection/type/imageDetections.py b/dimos/perception/detection/type/imageDetections.py
index 4431b028ff..994c939e4d 100644
--- a/dimos/perception/detection/type/imageDetections.py
+++ b/dimos/perception/detection/type/imageDetections.py
@@ -16,10 +16,11 @@
 
 from typing import TYPE_CHECKING, Generic, List, Optional, TypeVar
 
+from dimos_lcm.vision_msgs import Detection2DArray
+
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.std_msgs import Header
-from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection.type.utils import TableStr
 
 if TYPE_CHECKING:
diff --git a/dimos/perception/detection/type/test_detection2d.py b/dimos/perception/detection/type/test_detection2d.py
index 3bf37c0fb6..db1e88a403 100644
--- a/dimos/perception/detection/type/test_detection2d.py
+++ b/dimos/perception/detection/type/test_detection2d.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 import pytest
 
+from dimos.perception.detection.type import ImageDetections2D
+
 
 def test_detection2d(detection2d):
     # def test_detection_basic_properties(detection2d):
@@ -85,3 +87,39 @@ def test_detection2d(detection2d):
     assert ros_bbox.center.position.y == pytest.approx(center_y, abs=0.001)
     assert ros_bbox.size_x == pytest.approx(width, abs=0.001)
     assert ros_bbox.size_y == pytest.approx(height, abs=0.001)
+
+
+def test_from_ros_detection2d_array(get_moment_2d):
+    moment = get_moment_2d()
+
+    detections2d = moment["detections2d"]
+
+    test_image = detections2d.image
+
+    # Convert to ROS detection array
+    ros_array = detections2d.to_ros_detection2d_array()
+
+    # Convert back to ImageDetections2D
+    recovered = ImageDetections2D.from_ros_detection2d_array(test_image, ros_array)
+
+    # Verify we got the same number of detections
+    assert len(recovered.detections) == len(detections2d.detections)
+
+    # Verify the detection matches
+    original_det = detections2d.detections[0]
+    recovered_det = recovered.detections[0]
+
+    # Check bbox is approximately the same (allow 1 pixel tolerance due to float conversion)
+    for orig_val, rec_val in zip(original_det.bbox, recovered_det.bbox):
+        assert orig_val == pytest.approx(rec_val, abs=1.0)
+
+    # Check other properties
+    assert recovered_det.track_id == original_det.track_id
+    assert recovered_det.class_id == original_det.class_id
+    assert recovered_det.confidence == pytest.approx(original_det.confidence, abs=0.01)
+
+    print(f"\nSuccessfully round-tripped detection through ROS format:")
+    print(f"  Original bbox: {original_det.bbox}")
+    print(f"  Recovered bbox: {recovered_det.bbox}")
+    print(f"  Track ID: {recovered_det.track_id}")
+    print(f"  Confidence: {recovered_det.confidence:.3f}")

From fff177f4d6cfeddff4af0cf75c0e4557552ffb9e Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Mon, 13 Oct 2025 11:24:33 -0700
Subject: [PATCH 26/47] reid module, mobileclip

---
 dimos/perception/detection/reid/__init__.py   |   1 +
 dimos/perception/detection/reid/base.py       | 132 ++++++++
 dimos/perception/detection/reid/mobileclip.py | 129 ++++++--
 dimos/perception/detection/reid/reidModule.py |  49 +++
 .../detection/reid/test_mobileclip.py         | 300 +++++++++++++-----
 .../detection/type/detection2d/bbox.py        |   3 +-
 6 files changed, 502 insertions(+), 112 deletions(-)
 create mode 100644 dimos/perception/detection/reid/__init__.py
 create mode 100644 dimos/perception/detection/reid/base.py
 create mode 100644 dimos/perception/detection/reid/reidModule.py

diff --git a/dimos/perception/detection/reid/__init__.py b/dimos/perception/detection/reid/__init__.py
new file mode 100644
index 0000000000..6ac0295caf
--- /dev/null
+++ b/dimos/perception/detection/reid/__init__.py
@@ -0,0 +1 @@
+from dimos.perception.detection.reid.reidModule import ReidModule as ReidModule
diff --git a/dimos/perception/detection/reid/base.py b/dimos/perception/detection/reid/base.py
new file mode 100644
index 0000000000..4ca17f35d6
--- /dev/null
+++ b/dimos/perception/detection/reid/base.py
@@ -0,0 +1,132 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+import numpy as np
+import torch
+
+from dimos.msgs.sensor_msgs import Image
+from dimos.types.timestamped import Timestamped
+
+
+class Embedding(Timestamped):
+    """Base class for embeddings with vector data.
+
+    Supports both torch.Tensor (for GPU-accelerated comparisons) and np.ndarray.
+    Embeddings are kept as torch.Tensor on device by default for efficiency.
+    """
+
+    vector: torch.Tensor | np.ndarray
+
+    def __matmul__(self, other: "Embedding") -> float:
+        """Compute cosine similarity via @ operator."""
+        if isinstance(self.vector, torch.Tensor):
+            other_tensor = other.to_torch(self.vector.device)
+            result = self.vector @ other_tensor
+            return result.item()
+        return float(self.vector @ other.to_numpy())
+
+    def to_numpy(self) -> np.ndarray:
+        """Convert to numpy array (moves to CPU if needed)."""
+        if isinstance(self.vector, torch.Tensor):
+            return self.vector.detach().cpu().numpy()
+        return self.vector
+
+    def to_torch(self, device: str | torch.device | None = None) -> torch.Tensor:
+        """Convert to torch tensor on specified device."""
+        if isinstance(self.vector, np.ndarray):
+            tensor = torch.from_numpy(self.vector)
+            return tensor.to(device) if device else tensor
+        # Already a tensor
+        if device is not None and self.vector.device != torch.device(device):
+            return self.vector.to(device)
+        return self.vector
+
+
+E = TypeVar("E", bound="Embedding")
+
+
+class EmbeddingModel(ABC, Generic[E]):
+    """Abstract base class for embedding models supporting vision and language."""
+
+    device: str
+    normalize: bool = True
+
+    @abstractmethod
+    def embed(self, *images: Image) -> E | list[E]:
+        """
+        Embed one or more images.
+        Returns single Embedding if one image, list if multiple.
+        """
+        pass
+
+    @abstractmethod
+    def embed_text(self, *texts: str) -> E | list[E]:
+        """
+        Embed one or more text strings.
+        Returns single Embedding if one text, list if multiple.
+        """
+        pass
+
+    def compare_one_to_many(self, query: E, candidates: list[E]) -> torch.Tensor:
+        """
+        Efficiently compare one query against many candidates on GPU.
+
+        Args:
+            query: Query embedding
+            candidates: List of candidate embeddings
+
+        Returns:
+            torch.Tensor of similarities (N,)
+        """
+        query_tensor = query.to_torch(self.device)
+        candidate_tensors = torch.stack([c.to_torch(self.device) for c in candidates])
+        return query_tensor @ candidate_tensors.T
+
+    def compare_many_to_many(self, queries: list[E], candidates: list[E]) -> torch.Tensor:
+        """
+        Efficiently compare all queries against all candidates on GPU.
+
+        Args:
+            queries: List of query embeddings
+            candidates: List of candidate embeddings
+
+        Returns:
+            torch.Tensor of similarities (M, N) where M=len(queries), N=len(candidates)
+        """
+        query_tensors = torch.stack([q.to_torch(self.device) for q in queries])
+        candidate_tensors = torch.stack([c.to_torch(self.device) for c in candidates])
+        return query_tensors @ candidate_tensors.T
+
+    def query(self, query_emb: E, candidates: list[E], top_k: int = 5) -> list[tuple[int, float]]:
+        """
+        Find top-k most similar candidates to query (GPU accelerated).
+
+        Args:
+            query_emb: Query embedding
+            candidates: List of candidate embeddings
+            top_k: Number of top results to return
+
+        Returns:
+            List of (index, similarity) tuples sorted by similarity (descending)
+        """
+        similarities = self.compare_one_to_many(query_emb, candidates)
+        top_values, top_indices = similarities.topk(k=min(top_k, len(candidates)))
+        return [(idx.item(), val.item()) for idx, val in zip(top_indices, top_values)]
+
+    def warmup(self) -> None:
+        """Optional warmup method to pre-load model."""
+        pass
diff --git a/dimos/perception/detection/reid/mobileclip.py b/dimos/perception/detection/reid/mobileclip.py
index 0ed800e4ee..387e5b1c94 100644
--- a/dimos/perception/detection/reid/mobileclip.py
+++ b/dimos/perception/detection/reid/mobileclip.py
@@ -12,37 +12,106 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from pathlib import Path
+
 import numpy as np
 import open_clip
 import torch
 import torch.nn.functional as F
-from PIL import Image
-
-
-def test_embed():
-    # 1) Pick a MobileCLIP variant that OpenCLIP exposes directly
-    # Good starts: 'MobileCLIP-S2' or 'MobileCLIP-B' with pretrained='datacompdr'
-    model_name = "MobileCLIP-S2"
-    pretrained = "datacompdr"  # OpenCLIP key
-    device = "cuda"
-
-    model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
-    tokenizer = open_clip.get_tokenizer(model_name)
-    model = model.eval().to(device)
-
-    # 2) Encode an image (or crops) → unit-norm embedding
-    def embed_images(imgs_rgb: list[Image.Image]) -> np.ndarray:
-        with torch.inference_mode(), torch.cuda.amp.autocast(True):
-            batch = torch.stack([preprocess(im.convert("RGB")) for im in imgs_rgb]).to(device)
-            feats = model.encode_image(batch)
-            feats = F.normalize(feats, dim=-1)
-        return feats.detach().cpu().numpy()
-
-    # 3) Cosine distance for re-ID
-    def cosine_distance(u, v):  # u,v are L2-normalized
-        return 1.0 - float((u @ v))
-
-    # Example
-    im = Image.open("person_crop.jpg")
-    emb = embed_images([im])[0]
-    print(emb.shape)  # e.g. (512,) depending on backbone
+from PIL import Image as PILImage
+
+from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.reid.base import Embedding, EmbeddingModel
+
+
+class MobileCLIPEmbedding(Embedding):
+    """Embedding produced by MobileCLIP model.
+
+    Keeps embeddings as torch.Tensor on device for efficient GPU comparisons.
+    """
+
+    def __init__(self, vector: torch.Tensor | np.ndarray, timestamp: float = 0.0):
+        self.vector = vector
+        # Set timestamp from parent Timestamped class
+        if timestamp > 0:
+            self.timestamp = timestamp
+
+
+class MobileCLIPModel(EmbeddingModel[MobileCLIPEmbedding]):
+    """MobileCLIP embedding model for vision-language re-identification."""
+
+    def __init__(
+        self,
+        model_name: str = "MobileCLIP2-S0",
+        model_path: Path | str | None = None,
+        device: str | None = None,
+        normalize: bool = True,
+    ):
+        """
+        Initialize MobileCLIP model.
+
+        Args:
+            model_name: Name of the model architecture
+            model_path: Path to pretrained weights
+            device: Device to run on (cuda/cpu), auto-detects if None
+            normalize: Whether to L2 normalize embeddings
+        """
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.normalize = normalize
+
+        # Load model
+        pretrained = str(model_path) if model_path else None
+        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
+            model_name, pretrained=pretrained
+        )
+        self.tokenizer = open_clip.get_tokenizer(model_name)
+        self.model = self.model.eval().to(self.device)
+
+    def embed(self, *images: Image) -> MobileCLIPEmbedding | list[MobileCLIPEmbedding]:
+        """Embed one or more images.
+
+        Returns embeddings as torch.Tensor on device for efficient GPU comparisons.
+        """
+        # Convert to PIL images
+        pil_images = [PILImage.fromarray(img.to_opencv()) for img in images]
+
+        # Preprocess and batch
+        with torch.inference_mode():
+            batch = torch.stack([self.preprocess(img) for img in pil_images]).to(self.device)
+            feats = self.model.encode_image(batch)
+            if self.normalize:
+                feats = F.normalize(feats, dim=-1)
+
+        # Create embeddings (keep as torch.Tensor on device)
+        embeddings = []
+        for i, feat in enumerate(feats):
+            timestamp = images[i].ts
+            embeddings.append(MobileCLIPEmbedding(vector=feat, timestamp=timestamp))
+
+        return embeddings[0] if len(images) == 1 else embeddings
+
+    def embed_text(self, *texts: str) -> MobileCLIPEmbedding | list[MobileCLIPEmbedding]:
+        """Embed one or more text strings.
+
+        Returns embeddings as torch.Tensor on device for efficient GPU comparisons.
+        """
+        with torch.inference_mode():
+            text_tokens = self.tokenizer(list(texts)).to(self.device)
+            feats = self.model.encode_text(text_tokens)
+            if self.normalize:
+                feats = F.normalize(feats, dim=-1)
+
+        # Create embeddings (keep as torch.Tensor on device)
+        embeddings = []
+        for feat in feats:
+            embeddings.append(MobileCLIPEmbedding(vector=feat))
+
+        return embeddings[0] if len(texts) == 1 else embeddings
+
+    def warmup(self) -> None:
+        """Warmup the model with a dummy forward pass."""
+        dummy_image = torch.randn(1, 3, 224, 224).to(self.device)
+        dummy_text = self.tokenizer(["warmup"]).to(self.device)
+        with torch.inference_mode():
+            self.model.encode_image(dummy_image)
+            self.model.encode_text(dummy_text)
diff --git a/dimos/perception/detection/reid/reidModule.py b/dimos/perception/detection/reid/reidModule.py
new file mode 100644
index 0000000000..7a01a0fd81
--- /dev/null
+++ b/dimos/perception/detection/reid/reidModule.py
@@ -0,0 +1,49 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional
+
+from reactivex import operators as ops
+from reactivex.observable import Observable
+
+from dimos.core import In, Module, ModuleConfig, rpc
+from dimos.msgs.sensor_msgs import Image
+from dimos.msgs.vision_msgs import Detection2DArray
+from dimos.perception.detection.reid.base import EmbeddingModel
+from dimos.perception.detection.type import ImageDetections2D
+from dimos.types.timestamped import align_timestamped
+from dimos.utils.reactive import backpressure
+
+
+class Config(ModuleConfig):
+    embedding_model: Optional[Callable[..., "EmbeddingModel"]] = None
+
+
+class ReidModule(Module):
+    detections: In[Detection2DArray] = None  # type: ignore
+    image: In[Image] = None  # type: ignore
+
+    def detections_stream(self) -> Observable[ImageDetections2D]:
+        return backpressure(
+            align_timestamped(
+                self.image.pure_observable(),
+                self.detections.pure_observable(),
+                match_tolerance=0.0,
+                buffer_size=2.0,
+            ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair)))  # type: ignore[misc]
+        )
+
+    @rpc
+    def start(self):
+        self.detections_stream().subscribe(print)
diff --git a/dimos/perception/detection/reid/test_mobileclip.py b/dimos/perception/detection/reid/test_mobileclip.py
index 755ea5a4ee..11282fbd79 100644
--- a/dimos/perception/detection/reid/test_mobileclip.py
+++ b/dimos/perception/detection/reid/test_mobileclip.py
@@ -13,35 +13,20 @@
 # limitations under the License.
 
 import numpy as np
-import open_clip
 import pytest
-import torch
-import torch.nn.functional as F
-from PIL import Image as PILImage
 
 from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
 from dimos.utils.data import get_data
 
 
 @pytest.fixture(scope="session")
 def mobileclip_model():
     """Load MobileCLIP model once for all tests."""
-    model_name = "MobileCLIP2-S0"
     model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    model, _, preprocess = open_clip.create_model_and_transforms(
-        model_name, pretrained=str(model_path)
-    )
-    tokenizer = open_clip.get_tokenizer(model_name)
-    model = model.eval().to(device)
-
-    return {
-        "model": model,
-        "preprocess": preprocess,
-        "tokenizer": tokenizer,
-        "device": device,
-    }
+    model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
+    model.warmup()
+    return model
 
 
 @pytest.fixture(scope="session")
@@ -50,67 +35,101 @@ def test_image():
     return Image.from_file(get_data("cafe.jpg")).to_rgb()
 
 
-def embed_images(model_dict, pil_images):
-    """Embed PIL images using MobileCLIP."""
-    model = model_dict["model"]
-    preprocess = model_dict["preprocess"]
-    device = model_dict["device"]
+@pytest.mark.heavy
+def test_single_image_embedding(mobileclip_model, test_image):
+    """Test embedding a single image."""
+    embedding = mobileclip_model.embed(test_image)
 
-    with torch.inference_mode():
-        batch = torch.stack([preprocess(img) for img in pil_images]).to(device)
-        feats = model.encode_image(batch)
-        feats = F.normalize(feats, dim=-1)
-    return feats.detach().cpu().numpy()
+    # Embedding should be torch.Tensor on device
+    import torch
 
+    assert isinstance(embedding.vector, torch.Tensor), "Embedding should be torch.Tensor"
+    assert embedding.vector.device.type in ["cuda", "cpu"], "Should be on valid device"
 
-@pytest.mark.heavy
-def test_mobileclip_embedding(mobileclip_model, test_image):
-    """Test that MobileCLIP can embed the test image."""
-    # Convert to PIL
-    pil_image = PILImage.fromarray(test_image.to_opencv())
-
-    # Embed
-    embedding = embed_images(mobileclip_model, [pil_image])[0]
-
-    print(f"\nEmbedding shape: {embedding.shape}")
-    print(f"Embedding dtype: {embedding.dtype}")
-    print(f"Embedding norm: {np.linalg.norm(embedding):.4f}")
-    print(f"Embedding min/max: [{embedding.min():.4f}, {embedding.max():.4f}]")
-
-    # Validate embedding
-    assert embedding.shape[0] > 0, "Embedding should have features"
-    assert embedding.dtype == np.float32 or embedding.dtype == np.float64
-    assert np.isfinite(embedding).all(), "Embedding should contain finite values"
-
-    # Check L2 normalization (should be ~1.0)
-    norm = np.linalg.norm(embedding)
+    # Test conversion to numpy
+    vector_np = embedding.to_numpy()
+    print(f"\nEmbedding shape: {vector_np.shape}")
+    print(f"Embedding dtype: {vector_np.dtype}")
+    print(f"Embedding norm: {np.linalg.norm(vector_np):.4f}")
+
+    assert vector_np.shape[0] > 0, "Embedding should have features"
+    assert np.isfinite(vector_np).all(), "Embedding should contain finite values"
+
+    # Check L2 normalization
+    norm = np.linalg.norm(vector_np)
     assert abs(norm - 1.0) < 0.01, f"Embedding should be L2 normalized, got norm={norm}"
 
 
 @pytest.mark.heavy
-def test_mobileclip_text_similarity(mobileclip_model, test_image):
-    """Test text-image similarity with MobileCLIP."""
-    model = mobileclip_model["model"]
-    tokenizer = mobileclip_model["tokenizer"]
-    device = mobileclip_model["device"]
-
-    # Get image embedding
-    pil_image = PILImage.fromarray(test_image.to_opencv())
-    img_embedding = embed_images(mobileclip_model, [pil_image])[0]
-
-    # Encode text queries
-    queries = ["a cafe", "a person", "a car", "a dog", "potato", "food", "dinner", "rock"]
-
-    with torch.inference_mode():
-        text_tokens = tokenizer(queries).to(device)
-        text_features = model.encode_text(text_tokens)
-        text_features = F.normalize(text_features, dim=-1)
-        text_embeddings = text_features.detach().cpu().numpy()
-
-    # Compute similarities (cosine similarity = 1 - cosine distance)
+def test_batch_image_embedding(mobileclip_model, test_image):
+    """Test embedding multiple images at once."""
+    embeddings = mobileclip_model.embed(test_image, test_image, test_image)
+
+    assert isinstance(embeddings, list), "Batch embedding should return list"
+    assert len(embeddings) == 3, "Should return 3 embeddings"
+
+    # Check all embeddings are similar (same image)
+    sim_01 = embeddings[0] @ embeddings[1]
+    sim_02 = embeddings[0] @ embeddings[2]
+
+    print(f"\nSimilarity between same images: {sim_01:.6f}, {sim_02:.6f}")
+
+    assert sim_01 > 0.99, f"Same image embeddings should be very similar, got {sim_01}"
+    assert sim_02 > 0.99, f"Same image embeddings should be very similar, got {sim_02}"
+
+
+@pytest.mark.heavy
+def test_single_text_embedding(mobileclip_model):
+    """Test embedding a single text string."""
+    import torch
+
+    embedding = mobileclip_model.embed_text("a cafe")
+
+    # Should be torch.Tensor
+    assert isinstance(embedding.vector, torch.Tensor), "Text embedding should be torch.Tensor"
+
+    vector_np = embedding.to_numpy()
+    print(f"\nText embedding shape: {vector_np.shape}")
+    print(f"Text embedding norm: {np.linalg.norm(vector_np):.4f}")
+
+    assert vector_np.shape[0] > 0, "Text embedding should have features"
+    assert np.isfinite(vector_np).all(), "Text embedding should contain finite values"
+
+    # Check L2 normalization
+    norm = np.linalg.norm(vector_np)
+    assert abs(norm - 1.0) < 0.01, f"Text embedding should be L2 normalized, got norm={norm}"
+
+
+@pytest.mark.heavy
+def test_batch_text_embedding(mobileclip_model):
+    """Test embedding multiple text strings at once."""
+    import torch
+
+    embeddings = mobileclip_model.embed_text("a cafe", "a person", "a dog")
+
+    assert isinstance(embeddings, list), "Batch text embedding should return list"
+    assert len(embeddings) == 3, "Should return 3 text embeddings"
+
+    # All should be torch.Tensor and normalized
+    for i, emb in enumerate(embeddings):
+        assert isinstance(emb.vector, torch.Tensor), f"Embedding {i} should be torch.Tensor"
+        norm = np.linalg.norm(emb.to_numpy())
+        assert abs(norm - 1.0) < 0.01, f"Text embedding {i} should be L2 normalized"
+
+
+@pytest.mark.heavy
+def test_text_image_similarity(mobileclip_model, test_image):
+    """Test cross-modal text-image similarity using @ operator."""
+    img_embedding = mobileclip_model.embed(test_image)
+
+    # Embed text queries
+    queries = ["a cafe", "a person", "a car", "a dog", "potato", "food"]
+    text_embeddings = mobileclip_model.embed_text(*queries)
+
+    # Compute similarities using @ operator
     similarities = {}
     for query, text_emb in zip(queries, text_embeddings):
-        similarity = float(img_embedding @ text_emb)
+        similarity = img_embedding @ text_emb
         similarities[query] = similarity
         print(f"\n'{query}': {similarity:.4f}")
 
@@ -120,17 +139,136 @@ def test_mobileclip_text_similarity(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_mobileclip_cosine_distance(mobileclip_model, test_image):
-    """Test cosine distance metric for re-identification."""
-    pil_image = PILImage.fromarray(test_image.to_opencv())
+def test_cosine_distance(mobileclip_model, test_image):
+    """Test cosine distance computation (1 - similarity)."""
+    emb1 = mobileclip_model.embed(test_image)
+    emb2 = mobileclip_model.embed(test_image)
+
+    # Similarity using @ operator
+    similarity = emb1 @ emb2
+
+    # Distance is 1 - similarity
+    distance = 1.0 - similarity
+
+    print(f"\nSimilarity (same image): {similarity:.6f}")
+    print(f"Distance (same image): {distance:.6f}")
+
+    assert similarity > 0.99, f"Same image should have high similarity, got {similarity}"
+    assert distance < 0.01, f"Same image should have low distance, got {distance}"
+
+
+@pytest.mark.heavy
+def test_query_functionality(mobileclip_model, test_image):
+    """Test query method for top-k retrieval."""
+    # Create a query and some candidates
+    query_text = mobileclip_model.embed_text("a cafe")
+
+    # Create candidate embeddings
+    candidate_texts = ["a cafe", "a restaurant", "a person", "a dog", "a car"]
+    candidates = mobileclip_model.embed_text(*candidate_texts)
+
+    # Query for top-3
+    results = mobileclip_model.query(query_text, candidates, top_k=3)
+
+    print("\nTop-3 results:")
+    for idx, sim in results:
+        print(f"  {candidate_texts[idx]}: {sim:.4f}")
+
+    assert len(results) == 3, "Should return top-3 results"
+    assert results[0][0] == 0, "Top match should be 'a cafe' itself"
+    assert results[0][1] > results[1][1], "Results should be sorted by similarity"
+    assert results[1][1] > results[2][1], "Results should be sorted by similarity"
+
+
+@pytest.mark.heavy
+def test_embedding_operator(mobileclip_model, test_image):
+    """Test that @ operator works on embeddings."""
+    emb1 = mobileclip_model.embed(test_image)
+    emb2 = mobileclip_model.embed(test_image)
+
+    # Use @ operator
+    similarity = emb1 @ emb2
+
+    assert isinstance(similarity, float), "@ operator should return float"
+    assert 0.0 <= similarity <= 1.0, "Cosine similarity should be in [0, 1]"
+    assert similarity > 0.99, "Same image should have similarity near 1.0"
+
+
+@pytest.mark.heavy
+def test_warmup(mobileclip_model):
+    """Test that warmup runs without error."""
+    # Warmup is already called in fixture, but test it explicitly
+    mobileclip_model.warmup()
+    # Just verify no exceptions raised
+    assert True
+
+
+@pytest.mark.heavy
+def test_compare_one_to_many(mobileclip_model, test_image):
+    """Test GPU-accelerated one-to-many comparison."""
+    import torch
+
+    # Create query and gallery
+    query_emb = mobileclip_model.embed(test_image)
+    gallery_embs = mobileclip_model.embed(test_image, test_image, test_image)
+
+    # Compare on GPU
+    similarities = mobileclip_model.compare_one_to_many(query_emb, gallery_embs)
+
+    print(f"\nOne-to-many similarities: {similarities}")
+
+    # Should return torch.Tensor
+    assert isinstance(similarities, torch.Tensor), "Should return torch.Tensor"
+    assert similarities.shape == (3,), "Should have 3 similarities"
+    assert similarities.device.type in ["cuda", "cpu"], "Should be on device"
+
+    # All should be ~1.0 (same image)
+    similarities_np = similarities.cpu().numpy()
+    assert np.all(similarities_np > 0.99), "Same images should have similarity ~1.0"
+
+
+@pytest.mark.heavy
+def test_compare_many_to_many(mobileclip_model):
+    """Test GPU-accelerated many-to-many comparison."""
+    import torch
+
+    # Create queries and candidates
+    queries = mobileclip_model.embed_text("a cafe", "a person")
+    candidates = mobileclip_model.embed_text("a cafe", "a restaurant", "a dog")
+
+    # Compare on GPU
+    similarities = mobileclip_model.compare_many_to_many(queries, candidates)
+
+    print(f"\nMany-to-many similarities:\n{similarities}")
+
+    # Should return torch.Tensor
+    assert isinstance(similarities, torch.Tensor), "Should return torch.Tensor"
+    assert similarities.shape == (2, 3), "Should be (2, 3) similarity matrix"
+    assert similarities.device.type in ["cuda", "cpu"], "Should be on device"
+
+    # First query should match first candidate best
+    similarities_np = similarities.cpu().numpy()
+    assert similarities_np[0, 0] > similarities_np[0, 2], "Cafe should match cafe better than dog"
+
+
+@pytest.mark.heavy
+def test_gpu_query_performance(mobileclip_model, test_image):
+    """Test that query method uses GPU acceleration."""
+    # Create a larger gallery
+    gallery_size = 20
+    gallery_images = [test_image] * gallery_size
+    gallery_embs = mobileclip_model.embed(*gallery_images)
 
-    # Embed same image twice
-    emb1 = embed_images(mobileclip_model, [pil_image])[0]
-    emb2 = embed_images(mobileclip_model, [pil_image])[0]
+    query_emb = mobileclip_model.embed(test_image)
 
-    # Cosine distance between same image should be ~0
-    cosine_dist = 1.0 - float(emb1 @ emb2)
+    # Query should use GPU-accelerated comparison
+    results = mobileclip_model.query(query_emb, gallery_embs, top_k=5)
 
-    print(f"\nCosine distance (same image): {cosine_dist:.6f}")
+    print(f"\nTop-5 results from gallery of {gallery_size}")
+    for idx, sim in results:
+        print(f"  Index {idx}: {sim:.4f}")
 
-    assert cosine_dist < 0.01, f"Same image should have distance ~0, got {cosine_dist}"
+    assert len(results) == 5, "Should return top-5 results"
+    # All should be high similarity (same image, allow some variation for image preprocessing)
+    for idx, sim in results:
+        assert sim > 0.90, f"Same images should have high similarity, got {sim}"
diff --git a/dimos/perception/detection/type/detection2d/bbox.py b/dimos/perception/detection/type/detection2d/bbox.py
index 554d99cf3c..de8ddf05df 100644
--- a/dimos/perception/detection/type/detection2d/bbox.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -28,6 +28,7 @@
 from dimos_lcm.foxglove_msgs.Point2 import Point2
 from dimos_lcm.vision_msgs import (
     BoundingBox2D,
+    Detection2DArray,
     ObjectHypothesis,
     ObjectHypothesisWithPose,
     Point2D,
@@ -385,7 +386,7 @@ def to_ros_detection2d(self) -> ROSDetection2D:
 class ImageDetections2D(ImageDetections[Detection2D]):
     @classmethod
     def from_ros_detection2d_array(
-        cls, image: Image, ros_detections: Sequence[ROSDetection2D], **kwargs
+        cls, image: Image, ros_detections: Detection2DArray, **kwargs
     ) -> "ImageDetections2D":
         """Convert from ROS Detection2DArray message to ImageDetections2D object."""
         detections: List[Detection2D] = []

From 3771e339ee0e264a1e03a0040acd2efcf172fb6f Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Mon, 13 Oct 2025 18:56:35 -0700
Subject: [PATCH 27/47] quick person tracker

---
 dimos/perception/detection/conftest.py        |   2 +-
 .../detection/detectors/person/yolo.py        |   1 +
 dimos/perception/detection/person_tracker.py  | 134 +++++++++
 dimos/perception/detection/reid/mobileclip.py |   2 +-
 dimos/perception/detection/reid/reidModule.py |  88 +++++-
 .../detection/reid/test_trackAssociator.py    | 268 ++++++++++++++++++
 .../detection/reid/trackAssociator.py         | 175 ++++++++++++
 .../detection/type/detection2d/__init__.py    |   3 +-
 .../detection/type/detection2d/bbox.py        |  83 ++----
 .../type/detection2d/imageDetections2D.py     |  79 ++++++
 .../detection/type/detection2d/person.py      |  19 ++
 .../test_bbox.py}                             |  38 ---
 .../detection2d/test_imageDetections2D.py     |  52 ++++
 .../detection/type/detection2d/test_person.py |  71 +++++
 .../detection/type/detection3d/__init__.py    |   6 +-
 .../type/detection3d/imageDetections3DPC.py   |  45 +++
 .../detection/type/detection3d/pointcloud.py  |  95 +------
 .../type/detection3d/pointcloud_filters.py    |  82 ++++++
 .../detection3d/test_imageDetections3DPC.py   |  36 +++
 .../test_pointcloud.py}                       |   0
 .../modular/connection_module.py              |  12 +-
 .../unitree_webrtc/modular/ivan_unitree.py    |  67 +++--
 22 files changed, 1136 insertions(+), 222 deletions(-)
 create mode 100644 dimos/perception/detection/person_tracker.py
 create mode 100644 dimos/perception/detection/reid/test_trackAssociator.py
 create mode 100644 dimos/perception/detection/reid/trackAssociator.py
 create mode 100644 dimos/perception/detection/type/detection2d/imageDetections2D.py
 rename dimos/perception/detection/type/{test_detection2d.py => detection2d/test_bbox.py} (69%)
 create mode 100644 dimos/perception/detection/type/detection2d/test_imageDetections2D.py
 create mode 100644 dimos/perception/detection/type/detection2d/test_person.py
 create mode 100644 dimos/perception/detection/type/detection3d/imageDetections3DPC.py
 create mode 100644 dimos/perception/detection/type/detection3d/pointcloud_filters.py
 create mode 100644 dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
 rename dimos/perception/detection/type/{test_detection3dpc.py => detection3d/test_pointcloud.py} (100%)

diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index 9016713cff..e6e69ce0af 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -252,7 +252,7 @@ def object_db_module(get_moment):
     """Create and populate an ObjectDBModule with detections from multiple frames."""
     from dimos.perception.detection.detectors import Yolo2DDetector
 
-    module2d = Detection2DModule(detector=Yolo2DDetector)
+    module2d = Detection2DModule(detector=lambda: Yolo2DDetector(device="cpu"))
     module3d = Detection3DModule(camera_info=ConnectionModule._camera_info())
     moduleDB = ObjectDBModule(
         camera_info=ConnectionModule._camera_info(),
diff --git a/dimos/perception/detection/detectors/person/yolo.py b/dimos/perception/detection/detectors/person/yolo.py
index 72c1d92348..4c0799dafe 100644
--- a/dimos/perception/detection/detectors/person/yolo.py
+++ b/dimos/perception/detection/detectors/person/yolo.py
@@ -68,5 +68,6 @@ def process_image(self, image: Image) -> ImageDetections2D:
             conf=0.5,
             tracker=self.tracker,
             persist=True,
+            device=self.device,
         )
         return ImageDetections2D.from_ultralytics_result(image, results)
diff --git a/dimos/perception/detection/person_tracker.py b/dimos/perception/detection/person_tracker.py
new file mode 100644
index 0000000000..265b3a4c9b
--- /dev/null
+++ b/dimos/perception/detection/person_tracker.py
@@ -0,0 +1,134 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Generic, Optional, Tuple, TypeVar
+
+import numpy as np
+import torch
+from dimos_lcm.foxglove_msgs.ImageAnnotations import (
+    ImageAnnotations,
+    TextAnnotation,
+)
+from dimos_lcm.foxglove_msgs.Point2 import Point2
+from reactivex import operators as ops
+from reactivex.observable import Observable
+
+from dimos.agents2 import skill
+from dimos.core import In, Module, ModuleConfig, Out, rpc
+from dimos.msgs.foxglove_msgs.Color import Color
+from dimos.msgs.geometry_msgs import PoseStamped, Vector3
+from dimos.msgs.sensor_msgs import CameraInfo, Image
+from dimos.msgs.vision_msgs import Detection2DArray
+from dimos.perception.detection.reid.base import EmbeddingModel
+from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
+from dimos.perception.detection.reid.trackAssociator import TrackAssociator
+from dimos.perception.detection.type import ImageDetections2D
+from dimos.types.timestamped import Timestamped, align_timestamped, to_ros_stamp
+from dimos.utils.reactive import backpressure
+
+
+class PersonTracker(Module):
+    detections: In[Detection2DArray] = None  # type: ignore
+    image: In[Image] = None  # type: ignore
+    target: Out[PoseStamped] = None  # type: ignore
+
+    camera_info: CameraInfo
+
+    def __init__(self, cameraInfo: CameraInfo, **kwargs):
+        super().__init__(**kwargs)
+        self.camera_info = cameraInfo
+
+    def center_to_3d(
+        self,
+        pixel: Tuple[int, int],
+        camera_info: CameraInfo,
+        assumed_depth: float = 1.0,
+    ) -> Vector3:
+        """Unproject 2D pixel coordinates to 3D position in camera_link frame.
+
+        Args:
+            camera_info: Camera calibration information
+            assumed_depth: Assumed depth in meters (default 1.0m from camera)
+
+        Returns:
+            Vector3 position in camera_link frame coordinates (Z up, X forward)
+        """
+        # Extract camera intrinsics
+        fx, fy = camera_info.K[0], camera_info.K[4]
+        cx, cy = camera_info.K[2], camera_info.K[5]
+
+        # Unproject pixel to normalized camera coordinates
+        x_norm = (pixel[0] - cx) / fx
+        y_norm = (pixel[1] - cy) / fy
+
+        # Create 3D point at assumed depth in camera optical frame
+        # Camera optical frame: X right, Y down, Z forward
+        x_optical = x_norm * assumed_depth
+        y_optical = y_norm * assumed_depth
+        z_optical = assumed_depth
+
+        # Transform from camera optical frame to camera_link frame
+        # Optical: X right, Y down, Z forward
+        # Link: X forward, Y left, Z up
+        # Transformation: x_link = z_optical, y_link = -x_optical, z_link = -y_optical
+        return Vector3(z_optical, -x_optical, -y_optical)
+
+    def detections_stream(self) -> Observable[ImageDetections2D]:
+        return backpressure(
+            align_timestamped(
+                self.image.pure_observable(),
+                self.detections.pure_observable().pipe(
+                    ops.filter(lambda d: d.detections_length > 0)  # type: ignore[attr-defined]
+                ),
+                match_tolerance=0.0,
+                buffer_size=2.0,
+            ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair)))
+        )
+
+    @rpc
+    def start(self):
+        self.detections_stream().subscribe(self.track)
+
+    def track(self, detections2D: ImageDetections2D):
+        if len(detections2D) == 0:
+            return
+
+        target = max(detections2D.detections, key=lambda det: det.bbox_2d_volume())
+
+        vector = self.center_to_3d(target.center_bbox, self.camera_info, 1.0)
+
+        pose_in_camera = PoseStamped(
+            ts=detections2D.ts,
+            position=vector,
+            frame_id="camera_link",
+        )
+
+        print("Pose in camera frame:", pose_in_camera)
+
+        tf_world_to_camera = self.tf.get("world", "camera_link", detections2D.ts, 2)
+        if not tf_world_to_camera:
+            print("no tf")
+            return
+
+        # Transform the pose from camera frame to world frame
+        # Convert pose to transform, compose with world-to-camera, then convert back
+        from dimos.msgs.geometry_msgs import Transform
+
+        tf_camera_to_target = Transform.from_pose("target", pose_in_camera)
+        tf_world_to_target = tf_world_to_camera + tf_camera_to_target
+        pose_in_world = tf_world_to_target.to_pose(ts=detections2D.ts)
+
+        print("Target at", pose_in_world)
+        self.target.publish(pose_in_world)
diff --git a/dimos/perception/detection/reid/mobileclip.py b/dimos/perception/detection/reid/mobileclip.py
index 387e5b1c94..7cb16fcdab 100644
--- a/dimos/perception/detection/reid/mobileclip.py
+++ b/dimos/perception/detection/reid/mobileclip.py
@@ -42,7 +42,7 @@ class MobileCLIPModel(EmbeddingModel[MobileCLIPEmbedding]):
 
     def __init__(
         self,
-        model_name: str = "MobileCLIP2-S0",
+        model_name: str = "MobileCLIP2-S4",
         model_path: Path | str | None = None,
         device: str | None = None,
         normalize: bool = True,
diff --git a/dimos/perception/detection/reid/reidModule.py b/dimos/perception/detection/reid/reidModule.py
index 7a01a0fd81..2335fdde35 100644
--- a/dimos/perception/detection/reid/reidModule.py
+++ b/dimos/perception/detection/reid/reidModule.py
@@ -14,31 +14,59 @@
 
 from typing import Callable, Optional
 
+from dimos_lcm.foxglove_msgs.ImageAnnotations import (
+    ImageAnnotations,
+    TextAnnotation,
+)
+from dimos_lcm.foxglove_msgs.Point2 import Point2
 from reactivex import operators as ops
 from reactivex.observable import Observable
 
-from dimos.core import In, Module, ModuleConfig, rpc
+from dimos.core import In, Module, ModuleConfig, Out, rpc
+from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection.reid.base import EmbeddingModel
+from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
+from dimos.perception.detection.reid.trackAssociator import TrackAssociator
 from dimos.perception.detection.type import ImageDetections2D
-from dimos.types.timestamped import align_timestamped
+from dimos.types.timestamped import align_timestamped, to_ros_stamp
 from dimos.utils.reactive import backpressure
 
 
 class Config(ModuleConfig):
     embedding_model: Optional[Callable[..., "EmbeddingModel"]] = None
+    similarity_threshold: float = 0.99
 
 
 class ReidModule(Module):
+    default_config = Config
+
     detections: In[Detection2DArray] = None  # type: ignore
     image: In[Image] = None  # type: ignore
+    annotations: Out[ImageAnnotations] = None  # type: ignore
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.config = Config(**kwargs)
+        self.embedding_model = (
+            self.config.embedding_model() if self.config.embedding_model else MobileCLIPModel()
+        )
+        self.associator = (
+            TrackAssociator(
+                model=self.embedding_model, similarity_threshold=self.config.similarity_threshold
+            )
+            if self.embedding_model
+            else None
+        )
 
     def detections_stream(self) -> Observable[ImageDetections2D]:
         return backpressure(
             align_timestamped(
                 self.image.pure_observable(),
-                self.detections.pure_observable(),
+                self.detections.pure_observable().pipe(
+                    ops.filter(lambda d: d.detections_length > 0)  # type: ignore[attr-defined]
+                ),
                 match_tolerance=0.0,
                 buffer_size=2.0,
             ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair)))  # type: ignore[misc]
@@ -46,4 +74,56 @@ def detections_stream(self) -> Observable[ImageDetections2D]:
 
     @rpc
     def start(self):
-        self.detections_stream().subscribe(print)
+        self.detections_stream().subscribe(self.ingress)
+
+    def ingress(self, imageDetections: ImageDetections2D):
+        if not self.associator or not self.embedding_model:
+            print("No embedding model or associator configured")
+            return
+
+        track_ids = []
+
+        # Update embeddings for all detections
+        for detection in imageDetections:
+            embedding = self.embedding_model.embed(detection.cropped_image(padding=0))
+            # embed() with single image returns single Embedding
+            assert not isinstance(embedding, list), "Expected single embedding"
+            self.associator.update_embedding(detection.track_id, embedding)
+            track_ids.append(detection.track_id)
+
+        # Record negative constraints (co-occurrence = different objects)
+        self.associator.add_negative_constraints(track_ids)
+
+        # Associate and create annotations
+        text_annotations = []
+        for detection in imageDetections:
+            long_term_id = self.associator.associate(detection.track_id)
+            print(
+                f"track_id={detection.track_id} -> long_term_id={long_term_id} "
+                f"({detection.name}, conf={detection.confidence:.2f})"
+            )
+
+            # Create text annotation for long_term_id above the detection
+            x1, y1, _, _ = detection.bbox
+            font_size = imageDetections.image.width / 60
+
+            text_annotations.append(
+                TextAnnotation(
+                    timestamp=to_ros_stamp(detection.ts),
+                    position=Point2(x=x1, y=y1 - font_size * 1.5),
+                    text=f"PERSON: {long_term_id}",
+                    font_size=font_size,
+                    text_color=Color(r=0.0, g=1.0, b=1.0, a=1.0),  # Cyan
+                    background_color=Color(r=0.0, g=0.0, b=0.0, a=0.8),
+                )
+            )
+
+        # Publish annotations
+        if text_annotations:
+            annotations = ImageAnnotations(
+                texts=text_annotations,
+                texts_length=len(text_annotations),
+                points=[],
+                points_length=0,
+            )
+            self.annotations.publish(annotations)
diff --git a/dimos/perception/detection/reid/test_trackAssociator.py b/dimos/perception/detection/reid/test_trackAssociator.py
new file mode 100644
index 0000000000..76f868bd7b
--- /dev/null
+++ b/dimos/perception/detection/reid/test_trackAssociator.py
@@ -0,0 +1,268 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
+from dimos.perception.detection.reid.trackAssociator import TrackAssociator
+from dimos.utils.data import get_data
+
+
+@pytest.fixture(scope="session")
+def mobileclip_model():
+    """Load MobileCLIP model once for all tests."""
+    model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
+    model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
+    model.warmup()
+    return model
+
+
+@pytest.fixture
+def track_associator(mobileclip_model):
+    """Create fresh TrackAssociator for each test."""
+    return TrackAssociator(model=mobileclip_model, similarity_threshold=0.75)
+
+
+@pytest.fixture(scope="session")
+def test_image():
+    """Load test image."""
+    return Image.from_file(get_data("cafe.jpg")).to_rgb()
+
+
+@pytest.mark.heavy
+def test_update_embedding_single(track_associator, mobileclip_model, test_image):
+    """Test updating embedding for a single track."""
+    embedding = mobileclip_model.embed(test_image)
+
+    # First update
+    track_associator.update_embedding(track_id=1, new_embedding=embedding)
+
+    assert 1 in track_associator.track_embeddings
+    assert track_associator.embedding_counts[1] == 1
+
+    # Verify embedding is on device and normalized
+    emb_vec = track_associator.track_embeddings[1]
+    assert isinstance(emb_vec, torch.Tensor)
+    assert emb_vec.device.type in ["cuda", "cpu"]
+    norm = torch.norm(emb_vec).item()
+    assert abs(norm - 1.0) < 0.01, "Embedding should be normalized"
+
+
+@pytest.mark.heavy
+def test_update_embedding_running_average(track_associator, mobileclip_model, test_image):
+    """Test running average of embeddings."""
+    embedding1 = mobileclip_model.embed(test_image)
+    embedding2 = mobileclip_model.embed(test_image)
+
+    # Add first embedding
+    track_associator.update_embedding(track_id=1, new_embedding=embedding1)
+    first_vec = track_associator.track_embeddings[1].clone()
+
+    # Add second embedding (same image, should be very similar)
+    track_associator.update_embedding(track_id=1, new_embedding=embedding2)
+    avg_vec = track_associator.track_embeddings[1]
+
+    assert track_associator.embedding_counts[1] == 2
+
+    # Average should still be normalized
+    norm = torch.norm(avg_vec).item()
+    assert abs(norm - 1.0) < 0.01, "Average embedding should be normalized"
+
+    # Average should be similar to both originals (same image)
+    similarity1 = (first_vec @ avg_vec).item()
+    assert similarity1 > 0.99, "Average should be very similar to original"
+
+
+@pytest.mark.heavy
+def test_negative_constraints(track_associator):
+    """Test negative constraint recording."""
+    # Simulate frame with 3 tracks
+    track_ids = [1, 2, 3]
+    track_associator.add_negative_constraints(track_ids)
+
+    # Check that all pairs are recorded
+    assert 2 in track_associator.negative_pairs[1]
+    assert 3 in track_associator.negative_pairs[1]
+    assert 1 in track_associator.negative_pairs[2]
+    assert 3 in track_associator.negative_pairs[2]
+    assert 1 in track_associator.negative_pairs[3]
+    assert 2 in track_associator.negative_pairs[3]
+
+
+@pytest.mark.heavy
+def test_associate_new_track(track_associator, mobileclip_model, test_image):
+    """Test associating a new track creates new long_term_id."""
+    embedding = mobileclip_model.embed(test_image)
+    track_associator.update_embedding(track_id=1, new_embedding=embedding)
+
+    # First association should create new long_term_id
+    long_term_id = track_associator.associate(track_id=1)
+
+    assert long_term_id == 0, "First track should get long_term_id=0"
+    assert track_associator.track_to_long_term[1] == 0
+    assert track_associator.long_term_counter == 1
+
+
+@pytest.mark.heavy
+def test_associate_similar_tracks(track_associator, mobileclip_model, test_image):
+    """Test associating similar tracks to same long_term_id."""
+    # Create embeddings from same image (should be very similar)
+    embedding1 = mobileclip_model.embed(test_image)
+    embedding2 = mobileclip_model.embed(test_image)
+
+    # Add first track
+    track_associator.update_embedding(track_id=1, new_embedding=embedding1)
+    long_term_id_1 = track_associator.associate(track_id=1)
+
+    # Add second track with similar embedding
+    track_associator.update_embedding(track_id=2, new_embedding=embedding2)
+    long_term_id_2 = track_associator.associate(track_id=2)
+
+    # Should get same long_term_id (similarity > 0.75)
+    assert long_term_id_1 == long_term_id_2, "Similar tracks should get same long_term_id"
+    assert track_associator.long_term_counter == 1, "Only one long_term_id should be created"
+
+
+@pytest.mark.heavy
+def test_associate_with_negative_constraint(track_associator, mobileclip_model, test_image):
+    """Test that negative constraints prevent association."""
+    # Create similar embeddings
+    embedding1 = mobileclip_model.embed(test_image)
+    embedding2 = mobileclip_model.embed(test_image)
+
+    # Add first track
+    track_associator.update_embedding(track_id=1, new_embedding=embedding1)
+    long_term_id_1 = track_associator.associate(track_id=1)
+
+    # Add negative constraint (tracks co-occurred)
+    track_associator.add_negative_constraints([1, 2])
+
+    # Add second track with similar embedding
+    track_associator.update_embedding(track_id=2, new_embedding=embedding2)
+    long_term_id_2 = track_associator.associate(track_id=2)
+
+    # Should get different long_term_ids despite high similarity
+    assert long_term_id_1 != long_term_id_2, (
+        "Co-occurring tracks should get different long_term_ids"
+    )
+    assert track_associator.long_term_counter == 2, "Two long_term_ids should be created"
+
+
+@pytest.mark.heavy
+def test_associate_different_objects(track_associator, mobileclip_model, test_image):
+    """Test that dissimilar embeddings get different long_term_ids."""
+    # Create embeddings for image and text (very different)
+    image_emb = mobileclip_model.embed(test_image)
+    text_emb = mobileclip_model.embed_text("a dog")
+
+    # Add first track (image)
+    track_associator.update_embedding(track_id=1, new_embedding=image_emb)
+    long_term_id_1 = track_associator.associate(track_id=1)
+
+    # Add second track (text - very different embedding)
+    track_associator.update_embedding(track_id=2, new_embedding=text_emb)
+    long_term_id_2 = track_associator.associate(track_id=2)
+
+    # Should get different long_term_ids (similarity < 0.75)
+    assert long_term_id_1 != long_term_id_2, "Different objects should get different long_term_ids"
+    assert track_associator.long_term_counter == 2
+
+
+@pytest.mark.heavy
+def test_associate_returns_cached(track_associator, mobileclip_model, test_image):
+    """Test that repeated calls return same long_term_id."""
+    embedding = mobileclip_model.embed(test_image)
+    track_associator.update_embedding(track_id=1, new_embedding=embedding)
+
+    # First call
+    long_term_id_1 = track_associator.associate(track_id=1)
+
+    # Second call should return cached result
+    long_term_id_2 = track_associator.associate(track_id=1)
+
+    assert long_term_id_1 == long_term_id_2
+    assert track_associator.long_term_counter == 1, "Should not create new ID"
+
+
+@pytest.mark.heavy
+def test_associate_not_ready(track_associator):
+    """Test that associate returns -1 for track without embedding."""
+    long_term_id = track_associator.associate(track_id=999)
+    assert long_term_id == -1, "Should return -1 for track without embedding"
+
+
+@pytest.mark.heavy
+def test_gpu_performance(track_associator, mobileclip_model, test_image):
+    """Test that embeddings stay on GPU for performance."""
+    embedding = mobileclip_model.embed(test_image)
+    track_associator.update_embedding(track_id=1, new_embedding=embedding)
+
+    # Embedding should stay on device
+    emb_vec = track_associator.track_embeddings[1]
+    assert isinstance(emb_vec, torch.Tensor)
+    # Device comparison (handle "cuda" vs "cuda:0")
+    assert emb_vec.device.type == torch.device(track_associator.device).type
+
+    # Running average should happen on GPU
+    embedding2 = mobileclip_model.embed(test_image)
+    track_associator.update_embedding(track_id=1, new_embedding=embedding2)
+
+    avg_vec = track_associator.track_embeddings[1]
+    assert avg_vec.device.type == torch.device(track_associator.device).type
+
+
+@pytest.mark.heavy
+def test_similarity_threshold_configurable(mobileclip_model):
+    """Test that similarity threshold is configurable."""
+    associator_strict = TrackAssociator(model=mobileclip_model, similarity_threshold=0.95)
+    associator_loose = TrackAssociator(model=mobileclip_model, similarity_threshold=0.50)
+
+    assert associator_strict.similarity_threshold == 0.95
+    assert associator_loose.similarity_threshold == 0.50
+
+
+@pytest.mark.heavy
+def test_multi_track_scenario(track_associator, mobileclip_model, test_image):
+    """Test realistic scenario with multiple tracks across frames."""
+    # Frame 1: Track 1 appears
+    emb1 = mobileclip_model.embed(test_image)
+    track_associator.update_embedding(1, emb1)
+    track_associator.add_negative_constraints([1])
+    lt1 = track_associator.associate(1)
+
+    # Frame 2: Track 1 and Track 2 appear (different objects)
+    text_emb = mobileclip_model.embed_text("a dog")
+    track_associator.update_embedding(1, emb1)  # Update average
+    track_associator.update_embedding(2, text_emb)
+    track_associator.add_negative_constraints([1, 2])  # Co-occur = different
+    lt2 = track_associator.associate(2)
+
+    # Track 2 should get different ID despite any similarity
+    assert lt1 != lt2
+
+    # Frame 3: Track 1 disappears, Track 3 appears (same as Track 1)
+    emb3 = mobileclip_model.embed(test_image)
+    track_associator.update_embedding(3, emb3)
+    track_associator.add_negative_constraints([2, 3])
+    lt3 = track_associator.associate(3)
+
+    # Track 3 should match Track 1 (not co-occurring, similar embedding)
+    assert lt3 == lt1
+
+    print("\nMulti-track scenario results:")
+    print(f"  Track 1 -> long_term_id {lt1}")
+    print(f"  Track 2 -> long_term_id {lt2} (different object, co-occurred)")
+    print(f"  Track 3 -> long_term_id {lt3} (re-identified as Track 1)")
diff --git a/dimos/perception/detection/reid/trackAssociator.py b/dimos/perception/detection/reid/trackAssociator.py
new file mode 100644
index 0000000000..44b93392e7
--- /dev/null
+++ b/dimos/perception/detection/reid/trackAssociator.py
@@ -0,0 +1,175 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Set
+
+import torch
+import torch.nn.functional as F
+
+from dimos.perception.detection.reid.base import Embedding, EmbeddingModel
+
+
+class TrackAssociator:
+    """Associates short-term track_ids to long-term unique detection IDs via embedding similarity.
+
+    Maintains:
+    - Running average embeddings per track_id (on GPU)
+    - Negative constraints from co-occurrence (tracks in same frame = different objects)
+    - Mapping from track_id to unique long-term ID
+    """
+
+    def __init__(self, model: EmbeddingModel, similarity_threshold: float = 0.75):
+        """Initialize track associator.
+
+        Args:
+            model: Embedding model for GPU-accelerated comparisons
+            similarity_threshold: Minimum similarity for associating tracks (0-1)
+        """
+        self.model = model
+        self.device = model.device
+        self.similarity_threshold = similarity_threshold
+
+        # Track embeddings (running average, kept on GPU)
+        self.track_embeddings: Dict[int, torch.Tensor] = {}
+        self.embedding_counts: Dict[int, int] = {}
+
+        # Negative constraints (track_ids that co-occurred = different objects)
+        self.negative_pairs: Dict[int, Set[int]] = {}
+
+        # Track ID to long-term unique ID mapping
+        self.track_to_long_term: Dict[int, int] = {}
+        self.long_term_counter: int = 0
+
+        # Similarity history for optional adaptive thresholding
+        self.similarity_history: List[float] = []
+
+    def update_embedding(self, track_id: int, new_embedding: Embedding) -> None:
+        """Update running average embedding for a track_id.
+
+        Args:
+            track_id: Short-term track ID from detector
+            new_embedding: New embedding to incorporate into average
+        """
+        # Convert to torch on device (no-op if already on device)
+        new_vec = new_embedding.to_torch(self.device)
+
+        # Debug: check embedding diversity
+        print(
+            f"Track {track_id}: embedding norm={new_vec.norm().item():.3f}, first 3 values={new_vec[:3].cpu().tolist()}"
+        )
+
+        if track_id in self.track_embeddings:
+            # Running average
+            count = self.embedding_counts[track_id]
+            old_avg = self.track_embeddings[track_id]
+
+            # Compute average on GPU
+            new_avg = (old_avg * count + new_vec) / (count + 1)
+
+            # Re-normalize (important for cosine similarity)
+            new_avg = F.normalize(new_avg, dim=-1)
+
+            self.track_embeddings[track_id] = new_avg
+            self.embedding_counts[track_id] += 1
+        else:
+            # First embedding for this track (normalize for consistency)
+            self.track_embeddings[track_id] = F.normalize(new_vec, dim=-1)
+            self.embedding_counts[track_id] = 1
+
+    def add_negative_constraints(self, track_ids: List[int]) -> None:
+        """Record that these track_ids co-occurred in same frame (different objects).
+
+        Args:
+            track_ids: List of track_ids present in current frame
+        """
+        # All pairs of track_ids in same frame can't be same object
+        for i, tid1 in enumerate(track_ids):
+            for tid2 in track_ids[i + 1 :]:
+                self.negative_pairs.setdefault(tid1, set()).add(tid2)
+                self.negative_pairs.setdefault(tid2, set()).add(tid1)
+
+    def associate(self, track_id: int) -> int:
+        """Associate track_id to long-term unique detection ID.
+
+        Args:
+            track_id: Short-term track ID to associate
+
+        Returns:
+            Long-term unique detection ID, or -1 if not ready yet
+        """
+        # Already has assignment
+        if track_id in self.track_to_long_term:
+            return self.track_to_long_term[track_id]
+
+        # Need embedding to compare
+        if track_id not in self.track_embeddings:
+            return -1  # Not ready yet
+
+        # Build candidate list (only tracks with assigned long_term_ids)
+        query_vec = self.track_embeddings[track_id]
+
+        candidates = []
+        candidate_track_ids = []
+
+        for other_tid, other_vec in self.track_embeddings.items():
+            # Skip self
+            if other_tid == track_id:
+                continue
+            # Skip if negative constraint (co-occurred)
+            if other_tid in self.negative_pairs.get(track_id, set()):
+                continue
+            # Skip if no long_term_id yet
+            if other_tid not in self.track_to_long_term:
+                continue
+
+            candidates.append(other_vec)
+            candidate_track_ids.append(other_tid)
+
+        if candidates:
+            # GPU-accelerated comparison (single matrix multiplication)
+            candidate_stack = torch.stack(candidates)  # [N, D]
+            similarities = query_vec @ candidate_stack.T  # [N]
+
+            # Find best match
+            best_sim, best_idx = similarities.max(dim=0)
+            best_sim_value = best_sim.item()  # Move to CPU only for comparison
+
+            # Debug: show similarity values and check for exact match
+            matched_track_id = candidate_track_ids[best_idx]
+            matched_long_term_id = self.track_to_long_term[matched_track_id]
+
+            # Check if embeddings are actually identical
+            matched_vec = self.track_embeddings[matched_track_id]
+            diff = (query_vec - matched_vec).abs().max().item()
+
+            print(
+                f"Track {track_id}: best similarity = {best_sim_value:.6f} with track {matched_track_id} "
+                f"(long_term_id={matched_long_term_id}, max_diff={diff:.6f}, counts: {self.embedding_counts[track_id]} vs {self.embedding_counts[matched_track_id]})"
+            )
+
+            # Track similarity distribution (for future adaptive thresholding)
+            self.similarity_history.append(best_sim_value)
+
+            if best_sim_value >= self.similarity_threshold:
+                # Associate with existing long_term_id
+                matched_track_id = candidate_track_ids[best_idx]
+                long_term_id = self.track_to_long_term[matched_track_id]
+                self.track_to_long_term[track_id] = long_term_id
+                return long_term_id
+
+        # Create new unique detection ID
+        new_id = self.long_term_counter
+        self.long_term_counter += 1
+        self.track_to_long_term[track_id] = new_id
+        return new_id
diff --git a/dimos/perception/detection/type/detection2d/__init__.py b/dimos/perception/detection/type/detection2d/__init__.py
index 3a5cb27dce..1096abda9c 100644
--- a/dimos/perception/detection/type/detection2d/__init__.py
+++ b/dimos/perception/detection/type/detection2d/__init__.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 from dimos.perception.detection.type.detection2d.base import Detection2D
-from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox, ImageDetections2D
+from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox
+from dimos.perception.detection.type.detection2d.imageDetections2D import ImageDetections2D
 from dimos.perception.detection.type.detection2d.person import Detection2DPerson
 
 __all__ = [
diff --git a/dimos/perception/detection/type/detection2d/bbox.py b/dimos/perception/detection/type/detection2d/bbox.py
index de8ddf05df..223e1bc018 100644
--- a/dimos/perception/detection/type/detection2d/bbox.py
+++ b/dimos/perception/detection/type/detection2d/bbox.py
@@ -28,7 +28,6 @@
 from dimos_lcm.foxglove_msgs.Point2 import Point2
 from dimos_lcm.vision_msgs import (
     BoundingBox2D,
-    Detection2DArray,
     ObjectHypothesis,
     ObjectHypothesisWithPose,
     Point2D,
@@ -46,7 +45,6 @@
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.std_msgs import Header
 from dimos.perception.detection.type.detection2d.base import Detection2D
-from dimos.perception.detection.type.imageDetections import ImageDetections
 from dimos.types.timestamped import to_ros_stamp, to_timestamp
 from dimos.utils.decorators.decorators import simple_mcache
 
@@ -101,6 +99,33 @@ def to_repr_dict(self) -> Dict[str, Any]:
             "bbox": f"[{x1:.0f},{y1:.0f},{x2:.0f},{y2:.0f}]",
         }
 
+    def center_to_3d(
+        self,
+        pixel: Tuple[int, int],
+        camera_info: CameraInfo,
+        assumed_depth: float = 1.0,
+    ) -> PoseStamped:
+        """Unproject 2D pixel coordinates to 3D position in camera optical frame.
+
+        Args:
+            camera_info: Camera calibration information
+            assumed_depth: Assumed depth in meters (default 1.0m from camera)
+
+        Returns:
+            Vector3 position in camera optical frame coordinates
+        """
+        # Extract camera intrinsics
+        fx, fy = camera_info.K[0], camera_info.K[4]
+        cx, cy = camera_info.K[2], camera_info.K[5]
+
+        # Unproject pixel to normalized camera coordinates
+        x_norm = (pixel[0] - cx) / fx
+        y_norm = (pixel[1] - cy) / fy
+
+        # Create 3D point at assumed depth in camera optical frame
+        # Camera optical frame: X right, Y down, Z forward
+        return Vector3(x_norm * assumed_depth, y_norm * assumed_depth, assumed_depth)
+
     # return focused image, only on the bbox
     def cropped_image(self, padding: int = 20) -> Image:
         """Return a cropped version of the image focused on the bounding box.
@@ -381,57 +406,3 @@ def to_ros_detection2d(self) -> ROSDetection2D:
             ],
             id=str(self.track_id),
         )
-
-
-class ImageDetections2D(ImageDetections[Detection2D]):
-    @classmethod
-    def from_ros_detection2d_array(
-        cls, image: Image, ros_detections: Detection2DArray, **kwargs
-    ) -> "ImageDetections2D":
-        """Convert from ROS Detection2DArray message to ImageDetections2D object."""
-        detections: List[Detection2D] = []
-        for ros_det in ros_detections.detections:
-            detection = Detection2DBBox.from_ros_detection2d(ros_det, image=image, **kwargs)
-            if detection.is_valid():
-                detections.append(detection)
-
-        return cls(image=image, detections=detections)
-
-    @classmethod
-    def from_ultralytics_result(
-        cls, image: Image, results: List[Results], **kwargs
-    ) -> "ImageDetections2D":
-        """Create ImageDetections2D from ultralytics Results.
-
-        Dispatches to appropriate Detection2D subclass based on result type:
-        - If keypoints present: creates Detection2DPerson
-        - Otherwise: creates Detection2DBBox
-
-        Args:
-            image: Source image
-            results: List of ultralytics Results objects
-            **kwargs: Additional arguments passed to detection constructors
-
-        Returns:
-            ImageDetections2D containing appropriate detection types
-        """
-        from dimos.perception.detection.type.detection2d.person import Detection2DPerson
-
-        detections: List[Detection2D] = []
-        for result in results:
-            if result.boxes is None:
-                continue
-
-            num_detections = len(result.boxes.xyxy)
-            for i in range(num_detections):
-                detection: Detection2D
-                if result.keypoints is not None:
-                    # Pose detection with keypoints
-                    detection = Detection2DPerson.from_ultralytics_result(result, i, image)
-                else:
-                    # Regular bbox detection
-                    detection = Detection2DBBox.from_ultralytics_result(result, i, image)
-                if detection.is_valid():
-                    detections.append(detection)
-
-        return cls(image=image, detections=detections)
diff --git a/dimos/perception/detection/type/detection2d/imageDetections2D.py b/dimos/perception/detection/type/detection2d/imageDetections2D.py
new file mode 100644
index 0000000000..74854dae47
--- /dev/null
+++ b/dimos/perception/detection/type/detection2d/imageDetections2D.py
@@ -0,0 +1,79 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import List
+
+from dimos_lcm.vision_msgs import Detection2DArray
+from ultralytics.engine.results import Results
+
+from dimos.msgs.sensor_msgs import Image
+from dimos.perception.detection.type.detection2d.base import Detection2D
+from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox
+from dimos.perception.detection.type.imageDetections import ImageDetections
+
+
+class ImageDetections2D(ImageDetections[Detection2D]):
+    @classmethod
+    def from_ros_detection2d_array(
+        cls, image: Image, ros_detections: Detection2DArray, **kwargs
+    ) -> "ImageDetections2D":
+        """Convert from ROS Detection2DArray message to ImageDetections2D object."""
+        detections: List[Detection2D] = []
+        for ros_det in ros_detections.detections:
+            detection = Detection2DBBox.from_ros_detection2d(ros_det, image=image, **kwargs)
+            if detection.is_valid():  # type: ignore[attr-defined]
+                detections.append(detection)
+
+        return cls(image=image, detections=detections)
+
+    @classmethod
+    def from_ultralytics_result(
+        cls, image: Image, results: List[Results], **kwargs
+    ) -> "ImageDetections2D":
+        """Create ImageDetections2D from ultralytics Results.
+
+        Dispatches to appropriate Detection2D subclass based on result type:
+        - If keypoints present: creates Detection2DPerson
+        - Otherwise: creates Detection2DBBox
+
+        Args:
+            image: Source image
+            results: List of ultralytics Results objects
+            **kwargs: Additional arguments passed to detection constructors
+
+        Returns:
+            ImageDetections2D containing appropriate detection types
+        """
+        from dimos.perception.detection.type.detection2d.person import Detection2DPerson
+
+        detections: List[Detection2D] = []
+        for result in results:
+            if result.boxes is None:
+                continue
+
+            num_detections = len(result.boxes.xyxy)
+            for i in range(num_detections):
+                detection: Detection2D
+                if result.keypoints is not None:
+                    # Pose detection with keypoints
+                    detection = Detection2DPerson.from_ultralytics_result(result, i, image)
+                else:
+                    # Regular bbox detection
+                    detection = Detection2DBBox.from_ultralytics_result(result, i, image)
+                if detection.is_valid():
+                    detections.append(detection)
+
+        return cls(image=image, detections=detections)
diff --git a/dimos/perception/detection/type/detection2d/person.py b/dimos/perception/detection/type/detection2d/person.py
index 4390437ede..1c6fee5cae 100644
--- a/dimos/perception/detection/type/detection2d/person.py
+++ b/dimos/perception/detection/type/detection2d/person.py
@@ -172,6 +172,25 @@ def from_yolo(cls, result: "Results", idx: int, image: Image) -> "Detection2DPer
         """Alias for from_ultralytics_result for backward compatibility."""
         return cls.from_ultralytics_result(result, idx, image)
 
+    @classmethod
+    def from_ros_detection2d(cls, *args, **kwargs) -> "Detection2DPerson":
+        """Conversion from ROS Detection2D is not supported for Detection2DPerson.
+
+        The ROS Detection2D message format does not include keypoint data,
+        which is required for Detection2DPerson. Use Detection2DBBox for
+        round-trip ROS conversions, or store keypoints separately.
+
+        Raises:
+            NotImplementedError: Always raised as this conversion is impossible
+        """
+        raise NotImplementedError(
+            "Cannot convert from ROS Detection2D to Detection2DPerson. "
+            "The ROS Detection2D message format does not contain keypoint data "
+            "(keypoints and keypoint_scores) which are required fields for Detection2DPerson. "
+            "Consider using Detection2DBBox for ROS conversions, or implement a custom "
+            "message format that includes pose keypoints."
+        )
+
     def get_keypoint(self, name: str) -> Tuple[np.ndarray, float]:
         """Get specific keypoint by name.
         Returns:
diff --git a/dimos/perception/detection/type/test_detection2d.py b/dimos/perception/detection/type/detection2d/test_bbox.py
similarity index 69%
rename from dimos/perception/detection/type/test_detection2d.py
rename to dimos/perception/detection/type/detection2d/test_bbox.py
index db1e88a403..3bf37c0fb6 100644
--- a/dimos/perception/detection/type/test_detection2d.py
+++ b/dimos/perception/detection/type/detection2d/test_bbox.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 import pytest
 
-from dimos.perception.detection.type import ImageDetections2D
-
 
 def test_detection2d(detection2d):
     # def test_detection_basic_properties(detection2d):
@@ -87,39 +85,3 @@ def test_detection2d(detection2d):
     assert ros_bbox.center.position.y == pytest.approx(center_y, abs=0.001)
     assert ros_bbox.size_x == pytest.approx(width, abs=0.001)
     assert ros_bbox.size_y == pytest.approx(height, abs=0.001)
-
-
-def test_from_ros_detection2d_array(get_moment_2d):
-    moment = get_moment_2d()
-
-    detections2d = moment["detections2d"]
-
-    test_image = detections2d.image
-
-    # Convert to ROS detection array
-    ros_array = detections2d.to_ros_detection2d_array()
-
-    # Convert back to ImageDetections2D
-    recovered = ImageDetections2D.from_ros_detection2d_array(test_image, ros_array)
-
-    # Verify we got the same number of detections
-    assert len(recovered.detections) == len(detections2d.detections)
-
-    # Verify the detection matches
-    original_det = detections2d.detections[0]
-    recovered_det = recovered.detections[0]
-
-    # Check bbox is approximately the same (allow 1 pixel tolerance due to float conversion)
-    for orig_val, rec_val in zip(original_det.bbox, recovered_det.bbox):
-        assert orig_val == pytest.approx(rec_val, abs=1.0)
-
-    # Check other properties
-    assert recovered_det.track_id == original_det.track_id
-    assert recovered_det.class_id == original_det.class_id
-    assert recovered_det.confidence == pytest.approx(original_det.confidence, abs=0.01)
-
-    print(f"\nSuccessfully round-tripped detection through ROS format:")
-    print(f"  Original bbox: {original_det.bbox}")
-    print(f"  Recovered bbox: {recovered_det.bbox}")
-    print(f"  Track ID: {recovered_det.track_id}")
-    print(f"  Confidence: {recovered_det.confidence:.3f}")
diff --git a/dimos/perception/detection/type/detection2d/test_imageDetections2D.py b/dimos/perception/detection/type/detection2d/test_imageDetections2D.py
new file mode 100644
index 0000000000..6731b7b0c7
--- /dev/null
+++ b/dimos/perception/detection/type/detection2d/test_imageDetections2D.py
@@ -0,0 +1,52 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+
+from dimos.perception.detection.type import ImageDetections2D
+
+
+def test_from_ros_detection2d_array(get_moment_2d):
+    moment = get_moment_2d()
+
+    detections2d = moment["detections2d"]
+
+    test_image = detections2d.image
+
+    # Convert to ROS detection array
+    ros_array = detections2d.to_ros_detection2d_array()
+
+    # Convert back to ImageDetections2D
+    recovered = ImageDetections2D.from_ros_detection2d_array(test_image, ros_array)
+
+    # Verify we got the same number of detections
+    assert len(recovered.detections) == len(detections2d.detections)
+
+    # Verify the detection matches
+    original_det = detections2d.detections[0]
+    recovered_det = recovered.detections[0]
+
+    # Check bbox is approximately the same (allow 1 pixel tolerance due to float conversion)
+    for orig_val, rec_val in zip(original_det.bbox, recovered_det.bbox):
+        assert orig_val == pytest.approx(rec_val, abs=1.0)
+
+    # Check other properties
+    assert recovered_det.track_id == original_det.track_id
+    assert recovered_det.class_id == original_det.class_id
+    assert recovered_det.confidence == pytest.approx(original_det.confidence, abs=0.01)
+
+    print(f"\nSuccessfully round-tripped detection through ROS format:")
+    print(f"  Original bbox: {original_det.bbox}")
+    print(f"  Recovered bbox: {recovered_det.bbox}")
+    print(f"  Track ID: {recovered_det.track_id}")
+    print(f"  Confidence: {recovered_det.confidence:.3f}")
diff --git a/dimos/perception/detection/type/detection2d/test_person.py b/dimos/perception/detection/type/detection2d/test_person.py
new file mode 100644
index 0000000000..ba930fd299
--- /dev/null
+++ b/dimos/perception/detection/type/detection2d/test_person.py
@@ -0,0 +1,71 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+
+
+def test_person_ros_confidence():
+    """Test that Detection2DPerson preserves confidence when converting to ROS format."""
+
+    from dimos.msgs.sensor_msgs import Image
+    from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector
+    from dimos.perception.detection.type.detection2d.person import Detection2DPerson
+    from dimos.utils.data import get_data
+
+    # Load test image
+    image_path = get_data("cafe.jpg")
+    image = Image.from_file(image_path)
+
+    # Run pose detection
+    detector = YoloPersonDetector(device="cpu")
+    detections = detector.process_image(image)
+
+    # Find a Detection2DPerson (should have at least one person in cafe.jpg)
+    person_detections = [d for d in detections.detections if isinstance(d, Detection2DPerson)]
+    assert len(person_detections) > 0, "No person detections found in cafe.jpg"
+
+    # Test each person detection
+    for person_det in person_detections:
+        original_confidence = person_det.confidence
+        assert 0.0 <= original_confidence <= 1.0, "Confidence should be between 0 and 1"
+
+        # Convert to ROS format
+        ros_det = person_det.to_ros_detection2d()
+
+        # Extract confidence from ROS message
+        assert len(ros_det.results) > 0, "ROS detection should have results"
+        ros_confidence = ros_det.results[0].hypothesis.score
+
+        # Verify confidence is preserved (allow small floating point tolerance)
+        assert original_confidence == pytest.approx(ros_confidence, abs=0.001), (
+            f"Confidence mismatch: {original_confidence} != {ros_confidence}"
+        )
+
+        print("\nSuccessfully preserved confidence in ROS conversion for Detection2DPerson:")
+        print(f"  Original confidence: {original_confidence:.3f}")
+        print(f"  ROS confidence: {ros_confidence:.3f}")
+        print(f"  Track ID: {person_det.track_id}")
+        print(f"  Visible keypoints: {len(person_det.get_visible_keypoints(threshold=0.3))}/17")
+
+
+def test_person_from_ros_raises():
+    """Test that Detection2DPerson.from_ros_detection2d() raises NotImplementedError."""
+    from dimos.perception.detection.type.detection2d.person import Detection2DPerson
+
+    with pytest.raises(NotImplementedError) as exc_info:
+        Detection2DPerson.from_ros_detection2d()
+
+    # Verify the error message is informative
+    error_msg = str(exc_info.value)
+    assert "keypoint data" in error_msg.lower()
+    assert "Detection2DBBox" in error_msg
diff --git a/dimos/perception/detection/type/detection3d/__init__.py b/dimos/perception/detection/type/detection3d/__init__.py
index e9e1950abf..a8d11ca87f 100644
--- a/dimos/perception/detection/type/detection3d/__init__.py
+++ b/dimos/perception/detection/type/detection3d/__init__.py
@@ -14,9 +14,9 @@
 
 from dimos.perception.detection.type.detection3d.base import Detection3D
 from dimos.perception.detection.type.detection3d.bbox import Detection3DBBox
-from dimos.perception.detection.type.detection3d.pointcloud import (
-    Detection3DPC,
-    ImageDetections3DPC,
+from dimos.perception.detection.type.detection3d.imageDetections3DPC import ImageDetections3DPC
+from dimos.perception.detection.type.detection3d.pointcloud import Detection3DPC
+from dimos.perception.detection.type.detection3d.pointcloud_filters import (
     PointCloudFilter,
     height_filter,
     radius_outlier,
diff --git a/dimos/perception/detection/type/detection3d/imageDetections3DPC.py b/dimos/perception/detection/type/detection3d/imageDetections3DPC.py
new file mode 100644
index 0000000000..efad114a2c
--- /dev/null
+++ b/dimos/perception/detection/type/detection3d/imageDetections3DPC.py
@@ -0,0 +1,45 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from lcm_msgs.foxglove_msgs import SceneUpdate
+
+from dimos.perception.detection.type.detection3d.pointcloud import Detection3DPC
+from dimos.perception.detection.type.imageDetections import ImageDetections
+
+
+class ImageDetections3DPC(ImageDetections[Detection3DPC]):
+    """Specialized class for 3D detections in an image."""
+
+    def to_foxglove_scene_update(self) -> "SceneUpdate":
+        """Convert all detections to a Foxglove SceneUpdate message.
+
+        Returns:
+            SceneUpdate containing SceneEntity objects for all detections
+        """
+
+        # Create SceneUpdate message with all detections
+        scene_update = SceneUpdate()
+        scene_update.deletions_length = 0
+        scene_update.deletions = []
+        scene_update.entities = []
+
+        # Process each detection
+        for i, detection in enumerate(self.detections):
+            entity = detection.to_foxglove_scene_entity(entity_id=f"detection_{detection.name}_{i}")
+            scene_update.entities.append(entity)
+
+        scene_update.entities_length = len(scene_update.entities)
+        return scene_update
diff --git a/dimos/perception/detection/type/detection3d/pointcloud.py b/dimos/perception/detection/type/detection3d/pointcloud.py
index 6f9e4c2e05..e5fb82549c 100644
--- a/dimos/perception/detection/type/detection3d/pointcloud.py
+++ b/dimos/perception/detection/type/detection3d/pointcloud.py
@@ -16,7 +16,7 @@
 
 import functools
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, Optional, TypeVar
+from typing import Any, Dict, Optional
 
 import numpy as np
 from dimos_lcm.sensor_msgs import CameraInfo
@@ -28,70 +28,16 @@
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import PointCloud2
-from dimos.perception.detection.type.detection2d import Detection2D, Detection2DBBox
+from dimos.perception.detection.type.detection2d import Detection2DBBox
 from dimos.perception.detection.type.detection3d.base import Detection3D
-from dimos.perception.detection.type.imageDetections import ImageDetections
+from dimos.perception.detection.type.detection3d.pointcloud_filters import (
+    PointCloudFilter,
+    radius_outlier,
+    raycast,
+    statistical,
+)
 from dimos.types.timestamped import to_ros_stamp
 
-# Filters take Detection2DBBox, PointCloud2, CameraInfo, Transform and return filtered PointCloud2 or None
-PointCloudFilter = Callable[
-    [Detection2DBBox, PointCloud2, CameraInfo, Transform], Optional[PointCloud2]
-]
-
-
-def height_filter(height=0.1) -> PointCloudFilter:
-    return lambda det, pc, ci, tf: pc.filter_by_height(height)
-
-
-def statistical(nb_neighbors=40, std_ratio=0.5) -> PointCloudFilter:
-    def filter_func(
-        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
-    ) -> Optional[PointCloud2]:
-        try:
-            statistical, removed = pc.pointcloud.remove_statistical_outlier(
-                nb_neighbors=nb_neighbors, std_ratio=std_ratio
-            )
-            return PointCloud2(statistical, pc.frame_id, pc.ts)
-        except Exception as e:
-            # print("statistical filter failed:", e)
-            return None
-
-    return filter_func
-
-
-def raycast() -> PointCloudFilter:
-    def filter_func(
-        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
-    ) -> Optional[PointCloud2]:
-        try:
-            camera_pos = tf.inverse().translation
-            camera_pos_np = camera_pos.to_numpy()
-            _, visible_indices = pc.pointcloud.hidden_point_removal(camera_pos_np, radius=100.0)
-            visible_pcd = pc.pointcloud.select_by_index(visible_indices)
-            return PointCloud2(visible_pcd, pc.frame_id, pc.ts)
-        except Exception as e:
-            # print("raycast filter failed:", e)
-            return None
-
-    return filter_func
-
-
-def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> PointCloudFilter:
-    """
-    Remove isolated points: keep only points that have at least `min_neighbors`
-    neighbors within `radius` meters (same units as your point cloud).
-    """
-
-    def filter_func(
-        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
-    ) -> Optional[PointCloud2]:
-        filtered_pcd, removed = pc.pointcloud.remove_radius_outlier(
-            nb_points=min_neighbors, radius=radius
-        )
-        return PointCloud2(filtered_pcd, pc.frame_id, pc.ts)
-
-    return filter_func
-
 
 @dataclass
 class Detection3DPC(Detection3D):
@@ -377,28 +323,3 @@ def from_2d(  # type: ignore[override]
             transform=world_to_optical_transform,
             frame_id=world_pointcloud.frame_id,
         )
-
-
-class ImageDetections3DPC(ImageDetections[Detection3DPC]):
-    """Specialized class for 3D detections in an image."""
-
-    def to_foxglove_scene_update(self) -> "SceneUpdate":
-        """Convert all detections to a Foxglove SceneUpdate message.
-
-        Returns:
-            SceneUpdate containing SceneEntity objects for all detections
-        """
-
-        # Create SceneUpdate message with all detections
-        scene_update = SceneUpdate()
-        scene_update.deletions_length = 0
-        scene_update.deletions = []
-        scene_update.entities = []
-
-        # Process each detection
-        for i, detection in enumerate(self.detections):
-            entity = detection.to_foxglove_scene_entity(entity_id=f"detection_{detection.name}_{i}")
-            scene_update.entities.append(entity)
-
-        scene_update.entities_length = len(scene_update.entities)
-        return scene_update
diff --git a/dimos/perception/detection/type/detection3d/pointcloud_filters.py b/dimos/perception/detection/type/detection3d/pointcloud_filters.py
new file mode 100644
index 0000000000..51cf3d7f33
--- /dev/null
+++ b/dimos/perception/detection/type/detection3d/pointcloud_filters.py
@@ -0,0 +1,82 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Callable, Optional
+
+from dimos_lcm.sensor_msgs import CameraInfo
+
+from dimos.msgs.geometry_msgs import Transform
+from dimos.msgs.sensor_msgs import PointCloud2
+from dimos.perception.detection.type.detection2d import Detection2DBBox
+
+# Filters take Detection2DBBox, PointCloud2, CameraInfo, Transform and return filtered PointCloud2 or None
+PointCloudFilter = Callable[
+    [Detection2DBBox, PointCloud2, CameraInfo, Transform], Optional[PointCloud2]
+]
+
+
+def height_filter(height=0.1) -> PointCloudFilter:
+    return lambda det, pc, ci, tf: pc.filter_by_height(height)
+
+
+def statistical(nb_neighbors=40, std_ratio=0.5) -> PointCloudFilter:
+    def filter_func(
+        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
+    ) -> Optional[PointCloud2]:
+        try:
+            statistical, removed = pc.pointcloud.remove_statistical_outlier(
+                nb_neighbors=nb_neighbors, std_ratio=std_ratio
+            )
+            return PointCloud2(statistical, pc.frame_id, pc.ts)
+        except Exception as e:
+            # print("statistical filter failed:", e)
+            return None
+
+    return filter_func
+
+
+def raycast() -> PointCloudFilter:
+    def filter_func(
+        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
+    ) -> Optional[PointCloud2]:
+        try:
+            camera_pos = tf.inverse().translation
+            camera_pos_np = camera_pos.to_numpy()
+            _, visible_indices = pc.pointcloud.hidden_point_removal(camera_pos_np, radius=100.0)
+            visible_pcd = pc.pointcloud.select_by_index(visible_indices)
+            return PointCloud2(visible_pcd, pc.frame_id, pc.ts)
+        except Exception as e:
+            # print("raycast filter failed:", e)
+            return None
+
+    return filter_func
+
+
+def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> PointCloudFilter:
+    """
+    Remove isolated points: keep only points that have at least `min_neighbors`
+    neighbors within `radius` meters (same units as your point cloud).
+    """
+
+    def filter_func(
+        det: Detection2DBBox, pc: PointCloud2, ci: CameraInfo, tf: Transform
+    ) -> Optional[PointCloud2]:
+        filtered_pcd, removed = pc.pointcloud.remove_radius_outlier(
+            nb_points=min_neighbors, radius=radius
+        )
+        return PointCloud2(filtered_pcd, pc.frame_id, pc.ts)
+
+    return filter_func
diff --git a/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
new file mode 100644
index 0000000000..5173646953
--- /dev/null
+++ b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
@@ -0,0 +1,36 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_to_foxglove_scene_update(get_moment_3dpc):
+    """Test conversion of ImageDetections3DPC to Foxglove SceneUpdate."""
+    moment = get_moment_3dpc(seek=10.0)
+    detections3dpc = moment["detections3dpc"]
+
+    # Convert to scene update
+    scene_update = detections3dpc.to_foxglove_scene_update()
+
+    # Verify scene update structure
+    assert scene_update is not None
+    assert scene_update.deletions_length == 0
+    assert len(scene_update.deletions) == 0
+    assert scene_update.entities_length == len(detections3dpc.detections)
+    assert len(scene_update.entities) == len(detections3dpc.detections)
+
+    # Verify each entity corresponds to a detection
+    for i, (entity, detection) in enumerate(zip(scene_update.entities, detections3dpc.detections)):
+        assert entity.id == str(detection.track_id)
+        assert entity.frame_id == detection.frame_id
+        assert entity.cubes_length == 1
+        assert entity.texts_length == 1
diff --git a/dimos/perception/detection/type/test_detection3dpc.py b/dimos/perception/detection/type/detection3d/test_pointcloud.py
similarity index 100%
rename from dimos/perception/detection/type/test_detection3dpc.py
rename to dimos/perception/detection/type/detection3d/test_pointcloud.py
diff --git a/dimos/robot/unitree_webrtc/modular/connection_module.py b/dimos/robot/unitree_webrtc/modular/connection_module.py
index 57f508b552..1d67e4f596 100644
--- a/dimos/robot/unitree_webrtc/modular/connection_module.py
+++ b/dimos/robot/unitree_webrtc/modular/connection_module.py
@@ -314,13 +314,13 @@ def deploy_connection(dimos: DimosCluster, **kwargs):
 
     connection.odom.transport = LCMTransport("/odom", PoseStamped)
 
-    #    connection.video.transport = pSHMTransport(
-    #        "/image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE
-    #    )
+    connection.video.transport = pSHMTransport(
+        "/image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE
+    )
 
-    #    connection.lidar.transport = pSHMTransport(
-    #        "/lidar", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE
-    #    )
+    connection.lidar.transport = pSHMTransport(
+        "/lidar", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE
+    )
 
     connection.video.transport = LCMTransport("/image", Image)
     connection.lidar.transport = LCMTransport("/lidar", LidarMessage)
diff --git a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
index 95ace0c423..410ad86ad7 100644
--- a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
+++ b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
@@ -22,10 +22,13 @@
 
 # from dimos.msgs.detection2d import Detection2DArray
 from dimos.msgs.foxglove_msgs import ImageAnnotations
+from dimos.msgs.geometry_msgs import PoseStamped
 from dimos.msgs.sensor_msgs import Image, PointCloud2
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection.module2D import Detection2DModule
 from dimos.perception.detection.module3D import Detection3DModule
+from dimos.perception.detection.person_tracker import PersonTracker
+from dimos.perception.detection.reid import ReidModule
 from dimos.protocol.pubsub import lcm
 from dimos.robot.foxglove_bridge import FoxgloveBridge
 from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation
@@ -36,7 +39,7 @@
 
 
 def detection_unitree():
-    dimos = start(6)
+    dimos = start(8)
     connection = deploy_connection(dimos)
     # mapper = deploy_navigation(dimos, connection)
     # mapper.start()
@@ -45,44 +48,48 @@ def goto(pose):
         print("NAVIGATION REQUESTED:", pose)
         return True
 
-    module3D = dimos.deploy(
+    detector = dimos.deploy(
         Detection2DModule,
         # goto=goto,
         camera_info=ConnectionModule._camera_info(),
     )
 
-    module3D.image.connect(connection.video)
-    # module3D.pointcloud.connect(mapper.global_map)
-    # module3D.pointcloud.connect(connection.lidar)
+    detector.image.connect(connection.video)
+    # detector.pointcloud.connect(mapper.global_map)
+    # detector.pointcloud.connect(connection.lidar)
 
-    module3D.annotations.transport = LCMTransport("/annotations", ImageAnnotations)
-    module3D.detections.transport = LCMTransport("/detections", Detection2DArray)
+    detector.annotations.transport = LCMTransport("/annotations", ImageAnnotations)
+    detector.detections.transport = LCMTransport("/detections", Detection2DArray)
 
-    # module3D.detected_pointcloud_0.transport = LCMTransport("/detected/pointcloud/0", PointCloud2)
-    # module3D.detected_pointcloud_1.transport = LCMTransport("/detected/pointcloud/1", PointCloud2)
-    # module3D.detected_pointcloud_2.transport = LCMTransport("/detected/pointcloud/2", PointCloud2)
+    # detector.detected_pointcloud_0.transport = LCMTransport("/detected/pointcloud/0", PointCloud2)
+    # detector.detected_pointcloud_1.transport = LCMTransport("/detected/pointcloud/1", PointCloud2)
+    # detector.detected_pointcloud_2.transport = LCMTransport("/detected/pointcloud/2", PointCloud2)
 
-    module3D.detected_image_0.transport = LCMTransport("/detected/image/0", Image)
-    module3D.detected_image_1.transport = LCMTransport("/detected/image/1", Image)
-    module3D.detected_image_2.transport = LCMTransport("/detected/image/2", Image)
-    # module3D.scene_update.transport = LCMTransport("/scene_update", SceneUpdate)
+    detector.detected_image_0.transport = LCMTransport("/detected/image/0", Image)
+    detector.detected_image_1.transport = LCMTransport("/detected/image/1", Image)
+    detector.detected_image_2.transport = LCMTransport("/detected/image/2", Image)
+    # detector.scene_update.transport = LCMTransport("/scene_update", SceneUpdate)
 
-    module3D.start()
+    # reidModule = dimos.deploy(ReidModule)
+
+    # reidModule.image.connect(connection.video)
+    # reidModule.detections.connect(detector.detections)
+    # reidModule.annotations.transport = LCMTransport("/reid/annotations", ImageAnnotations)
+
+    person_tracker = dimos.deploy(PersonTracker, cameraInfo=ConnectionModule._camera_info())
+    person_tracker.image.connect(connection.video)
+    person_tracker.detections.connect(detector.detections)
+    person_tracker.target.transport = LCMTransport("/target", PoseStamped)
+
+    detector.start()
+    person_tracker.start()
     connection.start()
-    bridge = FoxgloveBridge(
-        #        shm_channels=[
-        #            "/image#sensor_msgs.Image",
-        #            "/lidar#sensor_msgs.PointCloud2",
-        #        ]
-    )
-    # bridge = FoxgloveBridge()
-    bridge.start()
 
     from dimos.agents2 import Agent, Output, Reducer, Stream, skill
     from dimos.agents2.cli.human import HumanInput
 
     agent = Agent(
-        system_prompt="You are a helpful assistant for controlling a Unitree Go2 robot. ",
+        system_prompt="You are a helpful assistant for controlling a Unitree Go2 robot.",
         model=Model.GPT_4O,  # Could add CLAUDE models to enum
         provider=Provider.OPENAI,  # Would need ANTHROPIC provider
     )
@@ -90,7 +97,17 @@ def goto(pose):
     human_input = dimos.deploy(HumanInput)
     agent.register_skills(human_input)
     # agent.register_skills(connection)
-    agent.register_skills(module3D)
+    agent.register_skills(detector)
+
+    bridge = FoxgloveBridge(
+        shm_channels=[
+            "/image#sensor_msgs.Image",
+            "/lidar#sensor_msgs.PointCloud2",
+        ]
+    )
+    # bridge = FoxgloveBridge()
+    time.sleep(1)
+    bridge.start()
 
     # agent.run_implicit_skill("video_stream_tool")
     # agent.run_implicit_skill("human")

From 342d9affefe9b86bbfc8eec971cfeed4224878d6 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Mon, 13 Oct 2025 19:04:23 -0700
Subject: [PATCH 28/47] person tracker cleanup

---
 dimos/perception/detection/person_tracker.py | 34 ++++----------------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/dimos/perception/detection/person_tracker.py b/dimos/perception/detection/person_tracker.py
index 265b3a4c9b..83a62cd092 100644
--- a/dimos/perception/detection/person_tracker.py
+++ b/dimos/perception/detection/person_tracker.py
@@ -12,30 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from abc import ABC, abstractmethod
-from typing import Any, Callable, Generic, Optional, Tuple, TypeVar
-
-import numpy as np
-import torch
-from dimos_lcm.foxglove_msgs.ImageAnnotations import (
-    ImageAnnotations,
-    TextAnnotation,
-)
-from dimos_lcm.foxglove_msgs.Point2 import Point2
+from typing import Tuple
+
 from reactivex import operators as ops
 from reactivex.observable import Observable
 
-from dimos.agents2 import skill
-from dimos.core import In, Module, ModuleConfig, Out, rpc
-from dimos.msgs.foxglove_msgs.Color import Color
-from dimos.msgs.geometry_msgs import PoseStamped, Vector3
+from dimos.core import In, Module, Out, rpc
+from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3
 from dimos.msgs.sensor_msgs import CameraInfo, Image
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection.reid.base import EmbeddingModel
-from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
-from dimos.perception.detection.reid.trackAssociator import TrackAssociator
 from dimos.perception.detection.type import ImageDetections2D
-from dimos.types.timestamped import Timestamped, align_timestamped, to_ros_stamp
+from dimos.types.timestamped import align_timestamped
 from dimos.utils.reactive import backpressure
 
 
@@ -106,7 +93,6 @@ def track(self, detections2D: ImageDetections2D):
             return
 
         target = max(detections2D.detections, key=lambda det: det.bbox_2d_volume())
-
         vector = self.center_to_3d(target.center_bbox, self.camera_info, 1.0)
 
         pose_in_camera = PoseStamped(
@@ -115,20 +101,12 @@ def track(self, detections2D: ImageDetections2D):
             frame_id="camera_link",
         )
 
-        print("Pose in camera frame:", pose_in_camera)
-
-        tf_world_to_camera = self.tf.get("world", "camera_link", detections2D.ts, 2)
+        tf_world_to_camera = self.tf.get("world", "camera_link", detections2D.ts, 0.5)
         if not tf_world_to_camera:
-            print("no tf")
             return
 
-        # Transform the pose from camera frame to world frame
-        # Convert pose to transform, compose with world-to-camera, then convert back
-        from dimos.msgs.geometry_msgs import Transform
-
         tf_camera_to_target = Transform.from_pose("target", pose_in_camera)
         tf_world_to_target = tf_world_to_camera + tf_camera_to_target
         pose_in_world = tf_world_to_target.to_pose(ts=detections2D.ts)
 
-        print("Target at", pose_in_world)
         self.target.publish(pose_in_world)

From 8cdc92a30e7477c9f189c78daf8139d924185b81 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Tue, 14 Oct 2025 17:22:02 -0700
Subject: [PATCH 29/47] clip/mobileclip standardized implementation

---
 dimos/models/embedding/__init__.py            |  12 ++
 dimos/models/embedding/clip.py                | 109 ++++++++++
 .../reid => models/embedding}/mobileclip.py   |  17 +-
 .../embedding/test_embedding_models.py}       | 191 ++++++++++++++----
 .../reid/base.py => models/embedding/type.py} |  14 +-
 dimos/msgs/sensor_msgs/Image.py               |   7 +-
 dimos/perception/detection/conftest.py        |   7 +
 dimos/perception/detection/module2D.py        |   4 +-
 dimos/perception/detection/person_tracker.py  |   4 +-
 dimos/perception/detection/reid/__init__.py   |  22 +-
 .../reid/{reidModule.py => module.py}         |  60 +++---
 .../perception/detection/reid/test_module.py  |  48 +++++
 .../detection/reid/test_trackAssociator.py    |  17 +-
 .../detection/reid/trackAssociator.py         |  10 +-
 dimos/perception/detection/reid/type.py       | 150 ++++++++++++++
 dimos/robot/unitree_webrtc/connection.py      |  18 +-
 .../modular/connection_module.py              |   3 +-
 .../unitree_webrtc/modular/ivan_unitree.py    |  12 +-
 .../unitree_webrtc/modular/navigation.py      |  15 +-
 19 files changed, 586 insertions(+), 134 deletions(-)
 create mode 100644 dimos/models/embedding/__init__.py
 create mode 100644 dimos/models/embedding/clip.py
 rename dimos/{perception/detection/reid => models/embedding}/mobileclip.py (88%)
 rename dimos/{perception/detection/reid/test_mobileclip.py => models/embedding/test_embedding_models.py} (57%)
 rename dimos/{perception/detection/reid/base.py => models/embedding/type.py} (93%)
 rename dimos/perception/detection/reid/{reidModule.py => module.py} (67%)
 create mode 100644 dimos/perception/detection/reid/test_module.py
 create mode 100644 dimos/perception/detection/reid/type.py

diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
new file mode 100644
index 0000000000..cad8cd4255
--- /dev/null
+++ b/dimos/models/embedding/__init__.py
@@ -0,0 +1,12 @@
+from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
+from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
+from dimos.models.embedding.type import Embedding, EmbeddingModel
+
+__all__ = [
+    "Embedding",
+    "EmbeddingModel",
+    "CLIPEmbedding",
+    "CLIPModel",
+    "MobileCLIPEmbedding",
+    "MobileCLIPModel",
+]
diff --git a/dimos/models/embedding/clip.py b/dimos/models/embedding/clip.py
new file mode 100644
index 0000000000..4bb3ce5ec4
--- /dev/null
+++ b/dimos/models/embedding/clip.py
@@ -0,0 +1,109 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+from PIL import Image as PILImage
+from transformers import CLIPModel as HFCLIPModel
+from transformers import CLIPProcessor
+
+from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.msgs.sensor_msgs import Image
+
+
+class CLIPEmbedding(Embedding): ...
+
+
+class CLIPModel(EmbeddingModel[CLIPEmbedding]):
+    """CLIP embedding model for vision-language re-identification."""
+
+    def __init__(
+        self,
+        model_name: str = "openai/clip-vit-base-patch32",
+        device: str | None = None,
+        normalize: bool = True,
+    ):
+        """
+        Initialize CLIP model.
+
+        Args:
+            model_name: HuggingFace model name (e.g., "openai/clip-vit-base-patch32")
+            device: Device to run on (cuda/cpu), auto-detects if None
+            normalize: Whether to L2 normalize embeddings
+        """
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.normalize = normalize
+
+        print(f"[DEBUG] CLIPModel.__init__: model_name={model_name}, device={self.device}")
+        # Load model and processor
+        self.model = HFCLIPModel.from_pretrained(model_name).eval().to(self.device)
+        self.processor = CLIPProcessor.from_pretrained(model_name)
+        print(f"[DEBUG] CLIPModel.__init__: COMPLETE")
+
+    def embed(self, *images: Image) -> CLIPEmbedding | list[CLIPEmbedding]:
+        """Embed one or more images.
+
+        Returns embeddings as torch.Tensor on device for efficient GPU comparisons.
+        """
+        # Convert to PIL images
+        pil_images = [PILImage.fromarray(img.to_opencv()) for img in images]
+
+        # Process images
+        with torch.inference_mode():
+            inputs = self.processor(images=pil_images, return_tensors="pt").to(self.device)
+            image_features = self.model.get_image_features(**inputs)
+
+            if self.normalize:
+                image_features = F.normalize(image_features, dim=-1)
+
+        # Create embeddings (keep as torch.Tensor on device)
+        embeddings = []
+        for i, feat in enumerate(image_features):
+            timestamp = images[i].ts
+            embeddings.append(CLIPEmbedding(vector=feat, timestamp=timestamp))
+
+        return embeddings[0] if len(images) == 1 else embeddings
+
+    def embed_text(self, *texts: str) -> CLIPEmbedding | list[CLIPEmbedding]:
+        """Embed one or more text strings.
+
+        Returns embeddings as torch.Tensor on device for efficient GPU comparisons.
+        """
+        with torch.inference_mode():
+            inputs = self.processor(text=list(texts), return_tensors="pt", padding=True).to(
+                self.device
+            )
+            text_features = self.model.get_text_features(**inputs)
+
+            if self.normalize:
+                text_features = F.normalize(text_features, dim=-1)
+
+        # Create embeddings (keep as torch.Tensor on device)
+        embeddings = []
+        for feat in text_features:
+            embeddings.append(CLIPEmbedding(vector=feat))
+
+        return embeddings[0] if len(texts) == 1 else embeddings
+
+    def warmup(self) -> None:
+        """Warmup the model with a dummy forward pass."""
+        dummy_image = torch.randn(1, 3, 224, 224).to(self.device)
+        dummy_text_inputs = self.processor(text=["warmup"], return_tensors="pt", padding=True).to(
+            self.device
+        )
+
+        with torch.inference_mode():
+            # Use pixel_values directly for image warmup
+            self.model.get_image_features(pixel_values=dummy_image)
+            self.model.get_text_features(**dummy_text_inputs)
diff --git a/dimos/perception/detection/reid/mobileclip.py b/dimos/models/embedding/mobileclip.py
similarity index 88%
rename from dimos/perception/detection/reid/mobileclip.py
rename to dimos/models/embedding/mobileclip.py
index 7cb16fcdab..d952196a48 100644
--- a/dimos/perception/detection/reid/mobileclip.py
+++ b/dimos/models/embedding/mobileclip.py
@@ -14,27 +14,16 @@
 
 from pathlib import Path
 
-import numpy as np
 import open_clip
 import torch
 import torch.nn.functional as F
 from PIL import Image as PILImage
 
+from dimos.models.embedding.type import Embedding, EmbeddingModel
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.reid.base import Embedding, EmbeddingModel
 
 
-class MobileCLIPEmbedding(Embedding):
-    """Embedding produced by MobileCLIP model.
-
-    Keeps embeddings as torch.Tensor on device for efficient GPU comparisons.
-    """
-
-    def __init__(self, vector: torch.Tensor | np.ndarray, timestamp: float = 0.0):
-        self.vector = vector
-        # Set timestamp from parent Timestamped class
-        if timestamp > 0:
-            self.timestamp = timestamp
+class MobileCLIPEmbedding(Embedding): ...
 
 
 class MobileCLIPModel(EmbeddingModel[MobileCLIPEmbedding]):
@@ -59,6 +48,7 @@ def __init__(
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.normalize = normalize
 
+        print(f"[DEBUG] MobileCLIPModel.__init__: model_name={model_name}, model_path={model_path}, device={self.device}")
         # Load model
         pretrained = str(model_path) if model_path else None
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
@@ -66,6 +56,7 @@ def __init__(
         )
         self.tokenizer = open_clip.get_tokenizer(model_name)
         self.model = self.model.eval().to(self.device)
+        print(f"[DEBUG] MobileCLIPModel.__init__: COMPLETE")
 
     def embed(self, *images: Image) -> MobileCLIPEmbedding | list[MobileCLIPEmbedding]:
         """Embed one or more images.
diff --git a/dimos/perception/detection/reid/test_mobileclip.py b/dimos/models/embedding/test_embedding_models.py
similarity index 57%
rename from dimos/perception/detection/reid/test_mobileclip.py
rename to dimos/models/embedding/test_embedding_models.py
index 11282fbd79..f9ec892137 100644
--- a/dimos/perception/detection/reid/test_mobileclip.py
+++ b/dimos/models/embedding/test_embedding_models.py
@@ -15,16 +15,25 @@
 import numpy as np
 import pytest
 
+from dimos.models.embedding.clip import CLIPModel
+from dimos.models.embedding.mobileclip import MobileCLIPModel
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
 from dimos.utils.data import get_data
 
 
-@pytest.fixture(scope="session")
-def mobileclip_model():
-    """Load MobileCLIP model once for all tests."""
-    model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
-    model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
+@pytest.fixture(scope="session", params=["mobileclip", "clip"])
+def embedding_model(request):
+    """Load embedding model once for all tests. Parametrized for different models."""
+    if request.param == "mobileclip":
+        model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
+        model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
+        model.warmup()
+    elif request.param == "clip":
+        model = CLIPModel(model_name="openai/clip-vit-base-patch32")
+        model.warmup()
+    else:
+        raise ValueError(f"Unknown model: {request.param}")
+
     model.warmup()
     return model
 
@@ -36,9 +45,9 @@ def test_image():
 
 
 @pytest.mark.heavy
-def test_single_image_embedding(mobileclip_model, test_image):
+def test_single_image_embedding(embedding_model, test_image):
     """Test embedding a single image."""
-    embedding = mobileclip_model.embed(test_image)
+    embedding = embedding_model.embed(test_image)
 
     # Embedding should be torch.Tensor on device
     import torch
@@ -61,9 +70,9 @@ def test_single_image_embedding(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_batch_image_embedding(mobileclip_model, test_image):
+def test_batch_image_embedding(embedding_model, test_image):
     """Test embedding multiple images at once."""
-    embeddings = mobileclip_model.embed(test_image, test_image, test_image)
+    embeddings = embedding_model.embed(test_image, test_image, test_image)
 
     assert isinstance(embeddings, list), "Batch embedding should return list"
     assert len(embeddings) == 3, "Should return 3 embeddings"
@@ -79,11 +88,11 @@ def test_batch_image_embedding(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_single_text_embedding(mobileclip_model):
+def test_single_text_embedding(embedding_model):
     """Test embedding a single text string."""
     import torch
 
-    embedding = mobileclip_model.embed_text("a cafe")
+    embedding = embedding_model.embed_text("a cafe")
 
     # Should be torch.Tensor
     assert isinstance(embedding.vector, torch.Tensor), "Text embedding should be torch.Tensor"
@@ -101,11 +110,11 @@ def test_single_text_embedding(mobileclip_model):
 
 
 @pytest.mark.heavy
-def test_batch_text_embedding(mobileclip_model):
+def test_batch_text_embedding(embedding_model):
     """Test embedding multiple text strings at once."""
     import torch
 
-    embeddings = mobileclip_model.embed_text("a cafe", "a person", "a dog")
+    embeddings = embedding_model.embed_text("a cafe", "a person", "a dog")
 
     assert isinstance(embeddings, list), "Batch text embedding should return list"
     assert len(embeddings) == 3, "Should return 3 text embeddings"
@@ -118,13 +127,13 @@ def test_batch_text_embedding(mobileclip_model):
 
 
 @pytest.mark.heavy
-def test_text_image_similarity(mobileclip_model, test_image):
+def test_text_image_similarity(embedding_model, test_image):
     """Test cross-modal text-image similarity using @ operator."""
-    img_embedding = mobileclip_model.embed(test_image)
+    img_embedding = embedding_model.embed(test_image)
 
     # Embed text queries
     queries = ["a cafe", "a person", "a car", "a dog", "potato", "food"]
-    text_embeddings = mobileclip_model.embed_text(*queries)
+    text_embeddings = embedding_model.embed_text(*queries)
 
     # Compute similarities using @ operator
     similarities = {}
@@ -139,10 +148,10 @@ def test_text_image_similarity(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_cosine_distance(mobileclip_model, test_image):
+def test_cosine_distance(embedding_model, test_image):
     """Test cosine distance computation (1 - similarity)."""
-    emb1 = mobileclip_model.embed(test_image)
-    emb2 = mobileclip_model.embed(test_image)
+    emb1 = embedding_model.embed(test_image)
+    emb2 = embedding_model.embed(test_image)
 
     # Similarity using @ operator
     similarity = emb1 @ emb2
@@ -158,17 +167,17 @@ def test_cosine_distance(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_query_functionality(mobileclip_model, test_image):
+def test_query_functionality(embedding_model, test_image):
     """Test query method for top-k retrieval."""
     # Create a query and some candidates
-    query_text = mobileclip_model.embed_text("a cafe")
+    query_text = embedding_model.embed_text("a cafe")
 
     # Create candidate embeddings
     candidate_texts = ["a cafe", "a restaurant", "a person", "a dog", "a car"]
-    candidates = mobileclip_model.embed_text(*candidate_texts)
+    candidates = embedding_model.embed_text(*candidate_texts)
 
     # Query for top-3
-    results = mobileclip_model.query(query_text, candidates, top_k=3)
+    results = embedding_model.query(query_text, candidates, top_k=3)
 
     print("\nTop-3 results:")
     for idx, sim in results:
@@ -181,10 +190,10 @@ def test_query_functionality(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_embedding_operator(mobileclip_model, test_image):
+def test_embedding_operator(embedding_model, test_image):
     """Test that @ operator works on embeddings."""
-    emb1 = mobileclip_model.embed(test_image)
-    emb2 = mobileclip_model.embed(test_image)
+    emb1 = embedding_model.embed(test_image)
+    emb2 = embedding_model.embed(test_image)
 
     # Use @ operator
     similarity = emb1 @ emb2
@@ -195,25 +204,25 @@ def test_embedding_operator(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_warmup(mobileclip_model):
+def test_warmup(embedding_model):
     """Test that warmup runs without error."""
     # Warmup is already called in fixture, but test it explicitly
-    mobileclip_model.warmup()
+    embedding_model.warmup()
     # Just verify no exceptions raised
     assert True
 
 
 @pytest.mark.heavy
-def test_compare_one_to_many(mobileclip_model, test_image):
+def test_compare_one_to_many(embedding_model, test_image):
     """Test GPU-accelerated one-to-many comparison."""
     import torch
 
     # Create query and gallery
-    query_emb = mobileclip_model.embed(test_image)
-    gallery_embs = mobileclip_model.embed(test_image, test_image, test_image)
+    query_emb = embedding_model.embed(test_image)
+    gallery_embs = embedding_model.embed(test_image, test_image, test_image)
 
     # Compare on GPU
-    similarities = mobileclip_model.compare_one_to_many(query_emb, gallery_embs)
+    similarities = embedding_model.compare_one_to_many(query_emb, gallery_embs)
 
     print(f"\nOne-to-many similarities: {similarities}")
 
@@ -228,16 +237,16 @@ def test_compare_one_to_many(mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_compare_many_to_many(mobileclip_model):
+def test_compare_many_to_many(embedding_model):
     """Test GPU-accelerated many-to-many comparison."""
     import torch
 
     # Create queries and candidates
-    queries = mobileclip_model.embed_text("a cafe", "a person")
-    candidates = mobileclip_model.embed_text("a cafe", "a restaurant", "a dog")
+    queries = embedding_model.embed_text("a cafe", "a person")
+    candidates = embedding_model.embed_text("a cafe", "a restaurant", "a dog")
 
     # Compare on GPU
-    similarities = mobileclip_model.compare_many_to_many(queries, candidates)
+    similarities = embedding_model.compare_many_to_many(queries, candidates)
 
     print(f"\nMany-to-many similarities:\n{similarities}")
 
@@ -252,17 +261,17 @@ def test_compare_many_to_many(mobileclip_model):
 
 
 @pytest.mark.heavy
-def test_gpu_query_performance(mobileclip_model, test_image):
+def test_gpu_query_performance(embedding_model, test_image):
     """Test that query method uses GPU acceleration."""
     # Create a larger gallery
     gallery_size = 20
     gallery_images = [test_image] * gallery_size
-    gallery_embs = mobileclip_model.embed(*gallery_images)
+    gallery_embs = embedding_model.embed(*gallery_images)
 
-    query_emb = mobileclip_model.embed(test_image)
+    query_emb = embedding_model.embed(test_image)
 
     # Query should use GPU-accelerated comparison
-    results = mobileclip_model.query(query_emb, gallery_embs, top_k=5)
+    results = embedding_model.query(query_emb, gallery_embs, top_k=5)
 
     print(f"\nTop-5 results from gallery of {gallery_size}")
     for idx, sim in results:
@@ -272,3 +281,103 @@ def test_gpu_query_performance(mobileclip_model, test_image):
     # All should be high similarity (same image, allow some variation for image preprocessing)
     for idx, sim in results:
         assert sim > 0.90, f"Same images should have high similarity, got {sim}"
+
+
+@pytest.mark.heavy
+def test_embedding_performance(embedding_model):
+    """Measure embedding performance over multiple real video frames."""
+    import time
+
+    from dimos.utils.testing import TimedSensorReplay
+
+    # Load actual video frames
+    data_dir = "unitree_go2_lidar_corrected"
+    get_data(data_dir)
+
+    video_replay = TimedSensorReplay(f"{data_dir}/video")
+
+    # Collect 10 real frames from the video
+    test_images = []
+    for ts, frame in video_replay.iterate_ts(duration=1.0):
+        test_images.append(frame.to_rgb())
+        if len(test_images) >= 10:
+            break
+
+    if len(test_images) < 10:
+        pytest.skip(f"Not enough video frames found (got {len(test_images)})")
+
+    # Measure single image embedding time
+    times = []
+    for img in test_images:
+        start = time.perf_counter()
+        _ = embedding_model.embed(img)
+        end = time.perf_counter()
+        elapsed_ms = (end - start) * 1000
+        times.append(elapsed_ms)
+
+    # Calculate statistics
+    avg_time = sum(times) / len(times)
+    min_time = min(times)
+    max_time = max(times)
+    std_time = (sum((t - avg_time) ** 2 for t in times) / len(times)) ** 0.5
+
+    print("\n" + "=" * 60)
+    print("Embedding Performance Statistics:")
+    print("=" * 60)
+    print(f"Number of images: {len(test_images)}")
+    print(f"Average time: {avg_time:.2f} ms")
+    print(f"Min time: {min_time:.2f} ms")
+    print(f"Max time: {max_time:.2f} ms")
+    print(f"Std dev: {std_time:.2f} ms")
+    print(f"Throughput: {1000 / avg_time:.1f} images/sec")
+    print("=" * 60)
+
+    # Also test batch embedding performance
+    start = time.perf_counter()
+    batch_embeddings = embedding_model.embed(*test_images)
+    end = time.perf_counter()
+    batch_time = (end - start) * 1000
+    batch_per_image = batch_time / len(test_images)
+
+    print("\nBatch Embedding Performance:")
+    print(f"Total batch time: {batch_time:.2f} ms")
+    print(f"Time per image (batched): {batch_per_image:.2f} ms")
+    print(f"Batch throughput: {1000 / batch_per_image:.1f} images/sec")
+    print(f"Speedup vs single: {avg_time / batch_per_image:.2f}x")
+    print("=" * 60)
+
+    # Verify embeddings are valid
+    assert len(batch_embeddings) == len(test_images)
+    assert all(e.vector is not None for e in batch_embeddings)
+
+    # Sanity check: verify embeddings are meaningful by testing text-image similarity
+    print("\n" + "=" * 60)
+    print("Sanity Check: Text-Image Similarity on First Frame")
+    print("=" * 60)
+    first_frame_emb = batch_embeddings[0]
+
+    # Test common object/scene queries
+    test_queries = [
+        "indoor scene",
+        "outdoor scene",
+        "a person",
+        "a dog",
+        "a robot",
+        "grass and trees",
+        "furniture",
+        "a car",
+    ]
+
+    text_embeddings = embedding_model.embed_text(*test_queries)
+    similarities = []
+    for query, text_emb in zip(test_queries, text_embeddings):
+        sim = first_frame_emb @ text_emb
+        similarities.append((query, sim))
+
+    # Sort by similarity
+    similarities.sort(key=lambda x: x[1], reverse=True)
+
+    print("Top matching concepts:")
+    for query, sim in similarities[:5]:
+        print(f"  '{query}': {sim:.4f}")
+    print("=" * 60)
diff --git a/dimos/perception/detection/reid/base.py b/dimos/models/embedding/type.py
similarity index 93%
rename from dimos/perception/detection/reid/base.py
rename to dimos/models/embedding/type.py
index 4ca17f35d6..5a87b2d2d9 100644
--- a/dimos/perception/detection/reid/base.py
+++ b/dimos/models/embedding/type.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+import time
 from abc import ABC, abstractmethod
-from typing import Generic, TypeVar
+from typing import Generic, Optional, TypeVar
 
 import numpy as np
 import torch
@@ -31,6 +34,13 @@ class Embedding(Timestamped):
 
     vector: torch.Tensor | np.ndarray
 
+    def __init__(self, vector: torch.Tensor | np.ndarray, timestamp: Optional[float] = None):
+        self.vector = vector
+        if timestamp:
+            self.timestamp = timestamp
+        else:
+            self.timestamp = time.time()
+
     def __matmul__(self, other: "Embedding") -> float:
         """Compute cosine similarity via @ operator."""
         if isinstance(self.vector, torch.Tensor):
@@ -50,7 +60,7 @@ def to_torch(self, device: str | torch.device | None = None) -> torch.Tensor:
         if isinstance(self.vector, np.ndarray):
             tensor = torch.from_numpy(self.vector)
             return tensor.to(device) if device else tensor
-        # Already a tensor
+
         if device is not None and self.vector.device != torch.device(device):
             return self.vector.to(device)
         return self.vector
diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py
index 30c74fd243..7a124e5d32 100644
--- a/dimos/msgs/sensor_msgs/Image.py
+++ b/dimos/msgs/sensor_msgs/Image.py
@@ -21,14 +21,11 @@
 
 import cv2
 import numpy as np
-import reactivex as rx
 from dimos_lcm.sensor_msgs.Image import Image as LCMImage
 from dimos_lcm.std_msgs.Header import Header
-from reactivex import operators as ops
 from reactivex.observable import Observable
-from reactivex.scheduler import ThreadPoolScheduler
 
-from dimos.types.timestamped import Timestamped, TimestampedBufferCollection, to_human_readable
+from dimos.types.timestamped import Timestamped, to_human_readable
 from dimos.utils.reactive import quality_barrier
 
 try:
@@ -301,7 +298,7 @@ def crop(self, x: int, y: int, width: int, height: int) -> "Image":
             ts=self.ts,
         )
 
-    @functools.cached_property
+    @property
     def sharpness(self) -> float:
         """
         Compute the Tenengrad focus measure for an image.
diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index e6e69ce0af..73abf489cd 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -178,6 +178,13 @@ def publisher(moment: Moment | Moment2D | Moment3D):
     return publisher
 
 
+@pytest.fixture(scope="session")
+def imageDetections2d(get_moment_2d) -> ImageDetections2D:
+    moment = get_moment_2d()
+    assert len(moment["detections2d"]) > 0, "No detections found in the moment"
+    return moment["detections2d"]
+
+
 @pytest.fixture(scope="session")
 def detection2d(get_moment_2d) -> Detection2D:
     moment = get_moment_2d()
diff --git a/dimos/perception/detection/module2D.py b/dimos/perception/detection/module2D.py
index 86dcfd2ab3..d0b2956539 100644
--- a/dimos/perception/detection/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -40,7 +40,7 @@
 
 @dataclass
 class Config(ModuleConfig):
-    max_freq: float = 10  # hz
+    max_freq: float = 10
     detector: Optional[Callable[[Any], Detector]] = YoloPersonDetector
     camera_info: CameraInfo = CameraInfo()
 
@@ -81,7 +81,7 @@ def sharp_image_stream(self) -> Observable[Image]:
 
     @simple_mcache
     def detection_stream_2d(self) -> Observable[ImageDetections2D]:
-        return backpressure(self.image.observable().pipe(ops.map(self.process_image_frame)))
+        return backpressure(self.sharp_image_stream().pipe(ops.map(self.process_image_frame)))
 
     def pixel_to_3d(
         self,
diff --git a/dimos/perception/detection/person_tracker.py b/dimos/perception/detection/person_tracker.py
index 83a62cd092..04173071e3 100644
--- a/dimos/perception/detection/person_tracker.py
+++ b/dimos/perception/detection/person_tracker.py
@@ -93,7 +93,7 @@ def track(self, detections2D: ImageDetections2D):
             return
 
         target = max(detections2D.detections, key=lambda det: det.bbox_2d_volume())
-        vector = self.center_to_3d(target.center_bbox, self.camera_info, 1.0)
+        vector = self.center_to_3d(target.center_bbox, self.camera_info, 2.0)
 
         pose_in_camera = PoseStamped(
             ts=detections2D.ts,
@@ -101,7 +101,7 @@ def track(self, detections2D: ImageDetections2D):
             frame_id="camera_link",
         )
 
-        tf_world_to_camera = self.tf.get("world", "camera_link", detections2D.ts, 0.5)
+        tf_world_to_camera = self.tf.get("world", "camera_link", detections2D.ts, 5.0)
         if not tf_world_to_camera:
             return
 
diff --git a/dimos/perception/detection/reid/__init__.py b/dimos/perception/detection/reid/__init__.py
index 6ac0295caf..f4145897b3 100644
--- a/dimos/perception/detection/reid/__init__.py
+++ b/dimos/perception/detection/reid/__init__.py
@@ -1 +1,21 @@
-from dimos.perception.detection.reid.reidModule import ReidModule as ReidModule
+from dimos.perception.detection.reid.module import Config, ReidModule
+from dimos.perception.detection.reid.type import (
+    EmbeddingFeatureExtractor,
+    EmbeddingIDSystem,
+    FeatureExtractor,
+    IDSystem,
+    PassthroughIDSystem,
+)
+
+__all__ = [
+    # Feature Extractors
+    "FeatureExtractor",
+    "EmbeddingFeatureExtractor",
+    # ID Systems
+    "IDSystem",
+    "EmbeddingIDSystem",
+    "PassthroughIDSystem",
+    # Module
+    "ReidModule",
+    "Config",
+]
diff --git a/dimos/perception/detection/reid/reidModule.py b/dimos/perception/detection/reid/module.py
similarity index 67%
rename from dimos/perception/detection/reid/reidModule.py
rename to dimos/perception/detection/reid/module.py
index 2335fdde35..b70b01399e 100644
--- a/dimos/perception/detection/reid/reidModule.py
+++ b/dimos/perception/detection/reid/module.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Callable, Optional
-
 from dimos_lcm.foxglove_msgs.ImageAnnotations import (
     ImageAnnotations,
     TextAnnotation,
@@ -23,20 +21,22 @@
 from reactivex.observable import Observable
 
 from dimos.core import In, Module, ModuleConfig, Out, rpc
+from dimos.models.embedding import MobileCLIPModel
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection.reid.base import EmbeddingModel
-from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
-from dimos.perception.detection.reid.trackAssociator import TrackAssociator
+from dimos.perception.detection.reid.type import (
+    EmbeddingFeatureExtractor,
+    EmbeddingIDSystem,
+    IDSystem,
+)
 from dimos.perception.detection.type import ImageDetections2D
 from dimos.types.timestamped import align_timestamped, to_ros_stamp
 from dimos.utils.reactive import backpressure
 
 
 class Config(ModuleConfig):
-    embedding_model: Optional[Callable[..., "EmbeddingModel"]] = None
-    similarity_threshold: float = 0.99
+    idsystem: IDSystem
 
 
 class ReidModule(Module):
@@ -46,19 +46,21 @@ class ReidModule(Module):
     image: In[Image] = None  # type: ignore
     annotations: Out[ImageAnnotations] = None  # type: ignore
 
-    def __init__(self, **kwargs):
+    def __init__(self, idsystem: IDSystem | None = None, warmup: bool = True, **kwargs):
         super().__init__(**kwargs)
-        self.config = Config(**kwargs)
-        self.embedding_model = (
-            self.config.embedding_model() if self.config.embedding_model else MobileCLIPModel()
-        )
-        self.associator = (
-            TrackAssociator(
-                model=self.embedding_model, similarity_threshold=self.config.similarity_threshold
+
+        # Create default MobileCLIP-based IDSystem if none provided
+        if idsystem is None:
+            mobileclip_model = MobileCLIPModel()
+            if warmup:
+                mobileclip_model.warmup()
+            feature_extractor = EmbeddingFeatureExtractor(model=mobileclip_model, padding=20)
+            idsystem = EmbeddingIDSystem(
+                feature_extractor=feature_extractor,  # type: ignore[arg-type]
+                similarity_threshold=0.75,
             )
-            if self.embedding_model
-            else None
-        )
+
+        self.idsystem = idsystem
 
     def detections_stream(self) -> Observable[ImageDetections2D]:
         return backpressure(
@@ -77,27 +79,11 @@ def start(self):
         self.detections_stream().subscribe(self.ingress)
 
     def ingress(self, imageDetections: ImageDetections2D):
-        if not self.associator or not self.embedding_model:
-            print("No embedding model or associator configured")
-            return
-
-        track_ids = []
-
-        # Update embeddings for all detections
-        for detection in imageDetections:
-            embedding = self.embedding_model.embed(detection.cropped_image(padding=0))
-            # embed() with single image returns single Embedding
-            assert not isinstance(embedding, list), "Expected single embedding"
-            self.associator.update_embedding(detection.track_id, embedding)
-            track_ids.append(detection.track_id)
-
-        # Record negative constraints (co-occurrence = different objects)
-        self.associator.add_negative_constraints(track_ids)
-
-        # Associate and create annotations
         text_annotations = []
+
         for detection in imageDetections:
-            long_term_id = self.associator.associate(detection.track_id)
+            # Register detection and get long-term ID
+            long_term_id = self.idsystem.register_detection(detection)
             print(
                 f"track_id={detection.track_id} -> long_term_id={long_term_id} "
                 f"({detection.name}, conf={detection.confidence:.2f})"
diff --git a/dimos/perception/detection/reid/test_module.py b/dimos/perception/detection/reid/test_module.py
new file mode 100644
index 0000000000..8bd63be65f
--- /dev/null
+++ b/dimos/perception/detection/reid/test_module.py
@@ -0,0 +1,48 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import pytest
+import torch
+
+from dimos.core import LCMTransport, start
+from dimos.models.embedding import CLIPModel
+from dimos.msgs.foxglove_msgs import ImageAnnotations
+from dimos.msgs.sensor_msgs import Image
+from dimos.msgs.vision_msgs import Detection2DArray
+from dimos.perception.detection.reid.module import ReidModule
+from dimos.perception.detection.reid.type import (
+    EmbeddingFeatureExtractor,
+    EmbeddingIDSystem,
+)
+
+
+def test_reid_ingress():
+    # Clear GPU cache before loading CLIP to avoid OOM
+
+    # Create CLIP-based IDSystem for testing
+    clip_model = CLIPModel(model_name="openai/clip-vit-base-patch32")
+    clip_model.warmup()
+    # feature_extractor = EmbeddingFeatureExtractor(model=clip_model, padding=20)
+    # idsystem = EmbeddingIDSystem(
+    #     feature_extractor=feature_extractor,  # type: ignore[arg-type]
+    #     similarity_threshold=0.75,
+    # )
+
+    # reid_module = ReidModule(idsystem=idsystem, warmup=False)
+    # print("Processing detections through ReidModule...")
+    # reid_module.annotations._transport = LCMTransport("/annotations", ImageAnnotations)
+    # reid_module.ingress(imageDetections2d)
+    # reid_module._close_module()
+    # print("✓ ReidModule ingress test completed successfully")
diff --git a/dimos/perception/detection/reid/test_trackAssociator.py b/dimos/perception/detection/reid/test_trackAssociator.py
index 76f868bd7b..9c0783af61 100644
--- a/dimos/perception/detection/reid/test_trackAssociator.py
+++ b/dimos/perception/detection/reid/test_trackAssociator.py
@@ -15,8 +15,8 @@
 import pytest
 import torch
 
+from dimos.models.embedding.mobileclip import MobileCLIPModel
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.reid.mobileclip import MobileCLIPModel
 from dimos.perception.detection.reid.trackAssociator import TrackAssociator
 from dimos.utils.data import get_data
 
@@ -31,9 +31,9 @@ def mobileclip_model():
 
 
 @pytest.fixture
-def track_associator(mobileclip_model):
+def track_associator():
     """Create fresh TrackAssociator for each test."""
-    return TrackAssociator(model=mobileclip_model, similarity_threshold=0.75)
+    return TrackAssociator(similarity_threshold=0.75)
 
 
 @pytest.fixture(scope="session")
@@ -214,21 +214,22 @@ def test_gpu_performance(track_associator, mobileclip_model, test_image):
     emb_vec = track_associator.track_embeddings[1]
     assert isinstance(emb_vec, torch.Tensor)
     # Device comparison (handle "cuda" vs "cuda:0")
-    assert emb_vec.device.type == torch.device(track_associator.device).type
+    expected_device = mobileclip_model.device
+    assert emb_vec.device.type == torch.device(expected_device).type
 
     # Running average should happen on GPU
     embedding2 = mobileclip_model.embed(test_image)
     track_associator.update_embedding(track_id=1, new_embedding=embedding2)
 
     avg_vec = track_associator.track_embeddings[1]
-    assert avg_vec.device.type == torch.device(track_associator.device).type
+    assert avg_vec.device.type == torch.device(expected_device).type
 
 
 @pytest.mark.heavy
-def test_similarity_threshold_configurable(mobileclip_model):
+def test_similarity_threshold_configurable():
     """Test that similarity threshold is configurable."""
-    associator_strict = TrackAssociator(model=mobileclip_model, similarity_threshold=0.95)
-    associator_loose = TrackAssociator(model=mobileclip_model, similarity_threshold=0.50)
+    associator_strict = TrackAssociator(similarity_threshold=0.95)
+    associator_loose = TrackAssociator(similarity_threshold=0.50)
 
     assert associator_strict.similarity_threshold == 0.95
     assert associator_loose.similarity_threshold == 0.50
diff --git a/dimos/perception/detection/reid/trackAssociator.py b/dimos/perception/detection/reid/trackAssociator.py
index 44b93392e7..2c3b45aee7 100644
--- a/dimos/perception/detection/reid/trackAssociator.py
+++ b/dimos/perception/detection/reid/trackAssociator.py
@@ -17,7 +17,7 @@
 import torch
 import torch.nn.functional as F
 
-from dimos.perception.detection.reid.base import Embedding, EmbeddingModel
+from dimos.models.embedding.type import Embedding
 
 
 class TrackAssociator:
@@ -29,15 +29,13 @@ class TrackAssociator:
     - Mapping from track_id to unique long-term ID
     """
 
-    def __init__(self, model: EmbeddingModel, similarity_threshold: float = 0.75):
+    def __init__(self, similarity_threshold: float = 0.75):
         """Initialize track associator.
 
         Args:
             model: Embedding model for GPU-accelerated comparisons
             similarity_threshold: Minimum similarity for associating tracks (0-1)
         """
-        self.model = model
-        self.device = model.device
         self.similarity_threshold = similarity_threshold
 
         # Track embeddings (running average, kept on GPU)
@@ -61,8 +59,8 @@ def update_embedding(self, track_id: int, new_embedding: Embedding) -> None:
             track_id: Short-term track ID from detector
             new_embedding: New embedding to incorporate into average
         """
-        # Convert to torch on device (no-op if already on device)
-        new_vec = new_embedding.to_torch(self.device)
+        # Convert to torch (infer device from embedding)
+        new_vec = new_embedding.to_torch()
 
         # Debug: check embedding diversity
         print(
diff --git a/dimos/perception/detection/reid/type.py b/dimos/perception/detection/reid/type.py
new file mode 100644
index 0000000000..6fc1d2ff3c
--- /dev/null
+++ b/dimos/perception/detection/reid/type.py
@@ -0,0 +1,150 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
+
+
+E = TypeVar("E", bound="Embedding")
+F = TypeVar("F")  # Generic feature type
+
+
+class FeatureExtractor(ABC, Generic[F]):
+    """Abstract base class for extracting features from detections."""
+
+    @abstractmethod
+    def extract(self, detection: Detection2DBBox) -> F:
+        """
+        Extract feature from a detection.
+
+        Args:
+            detection: Detection to extract features from
+
+        Returns:
+            Extracted feature of type F
+        """
+        pass
+
+
+class EmbeddingFeatureExtractor(FeatureExtractor[E], Generic[E]):
+    """Feature extractor that uses an embedding model to extract features from detection crops."""
+
+    def __init__(self, model: EmbeddingModel[E], padding: int = 20):
+        """
+        Initialize embedding feature extractor.
+
+        Args:
+            model: Embedding model to use for feature extraction
+            padding: Padding to add around detection bbox when cropping (default: 0)
+        """
+        self.model = model
+        self.padding = padding
+
+    def extract(self, detection: Detection2DBBox) -> E:
+        """
+        Extract embedding from detection's cropped image.
+
+        Args:
+            detection: Detection to extract embedding from
+
+        Returns:
+            Embedding feature
+        """
+        cropped_image = detection.cropped_image(padding=self.padding)
+        embedding = self.model.embed(cropped_image)
+        assert not isinstance(embedding, list), "Expected single embedding for single image"
+        return embedding
+
+
+class IDSystem(ABC, Generic[F]):
+    """Abstract base class for ID assignment systems using features."""
+
+    def __init__(self, feature_extractor: FeatureExtractor[F]):
+        """
+        Initialize ID system with feature extractor.
+
+        Args:
+            feature_extractor: Feature extractor to use for detection features
+        """
+        self.feature_extractor = feature_extractor
+
+    def register_detections(self, detections: ImageDetections2D) -> None:
+        """Register multiple detections."""
+        for detection in detections.detections:
+            if isinstance(detection, Detection2DBBox):
+                self.register_detection(detection)
+
+    @abstractmethod
+    def register_detection(self, detection: Detection2DBBox) -> int:
+        """
+        Register a single detection, returning assigned (long term) ID.
+
+        Args:
+            detection: Detection to register
+
+        Returns:
+            Long-term unique ID for this detection
+        """
+        ...
+
+
+class PassthroughIDSystem(IDSystem[F]):
+    """Simple ID system that returns track_id with no object permanence."""
+
+    def __init__(self, feature_extractor: FeatureExtractor[F] | None = None):
+        """
+        Initialize passthrough ID system.
+
+        Args:
+            feature_extractor: Optional feature extractor (not used, for interface compatibility)
+        """
+        # Don't call super().__init__ since we don't need feature_extractor
+        self.feature_extractor = feature_extractor  # type: ignore
+
+    def register_detection(self, detection: Detection2DBBox) -> int:
+        """Return detection's track_id as long-term ID (no permanence)."""
+        return detection.track_id
+
+
+class EmbeddingIDSystem(IDSystem[Embedding]):
+    """ID system using embedding similarity for object permanence."""
+
+    def __init__(
+        self,
+        feature_extractor: FeatureExtractor[Embedding],
+        similarity_threshold: float = 0.75,
+    ):
+        """
+        Initialize embedding-based ID system.
+
+        Args:
+            feature_extractor: Feature extractor for embeddings
+            similarity_threshold: Minimum similarity for associating tracks (0-1)
+        """
+        super().__init__(feature_extractor)
+
+        # Import here to avoid circular dependency
+        from dimos.perception.detection.reid.trackAssociator import TrackAssociator
+
+        self.associator = TrackAssociator(similarity_threshold=similarity_threshold)
+
+    def register_detection(self, detection: Detection2DBBox) -> int:
+        embedding = self.feature_extractor.extract(detection)
+        self.associator.update_embedding(detection.track_id, embedding)
+        return self.associator.associate(detection.track_id)
diff --git a/dimos/robot/unitree_webrtc/connection.py b/dimos/robot/unitree_webrtc/connection.py
index 75d3bdd13d..353881b887 100644
--- a/dimos/robot/unitree_webrtc/connection.py
+++ b/dimos/robot/unitree_webrtc/connection.py
@@ -37,6 +37,7 @@
 from dimos.robot.unitree_webrtc.type.lidar import LidarMessage
 from dimos.robot.unitree_webrtc.type.lowstate import LowStateMsg
 from dimos.robot.unitree_webrtc.type.odometry import Odometry
+from dimos.utils.decorators.decorators import simple_mcache
 from dimos.utils.reactive import backpressure, callback_to_observable
 
 VideoMessage: TypeAlias = np.ndarray[tuple[int, int, Literal[3]], np.uint8]
@@ -197,15 +198,15 @@ def publish_request(self, topic: str, data: dict):
         )
         return future.result()
 
-    @functools.cache
+    @simple_mcache
     def raw_lidar_stream(self) -> Subject[LidarMessage]:
         return backpressure(self.unitree_sub_stream(RTC_TOPIC["ULIDAR_ARRAY"]))
 
-    @functools.cache
+    @simple_mcache
     def raw_odom_stream(self) -> Subject[Pose]:
         return backpressure(self.unitree_sub_stream(RTC_TOPIC["ROBOTODOM"]))
 
-    @functools.cache
+    @simple_mcache
     def lidar_stream(self) -> Subject[LidarMessage]:
         return backpressure(
             self.raw_lidar_stream().pipe(
@@ -213,22 +214,23 @@ def lidar_stream(self) -> Subject[LidarMessage]:
             )
         )
 
-    @functools.cache
+    @simple_mcache
     def tf_stream(self) -> Subject[Transform]:
         base_link = functools.partial(Transform.from_pose, "base_link")
         return backpressure(self.odom_stream().pipe(ops.map(base_link)))
 
-    @functools.cache
+    @simple_mcache
     def odom_stream(self) -> Subject[Pose]:
         return backpressure(self.raw_odom_stream().pipe(ops.map(Odometry.from_msg)))
 
-    @functools.cache
+    @simple_mcache
     def video_stream(self) -> Observable[Image]:
         return backpressure(
             self.raw_video_stream().pipe(
                 ops.filter(lambda frame: frame is not None),
                 ops.map(
                     lambda frame: Image.from_numpy(
+                        # np.ascontiguousarray(frame.to_ndarray("rgb24")),
                         frame.to_ndarray(format="rgb24"),
                         frame_id="camera_optical",
                     )
@@ -236,7 +238,7 @@ def video_stream(self) -> Observable[Image]:
             )
         )
 
-    @functools.cache
+    @simple_mcache
     def lowstate_stream(self) -> Subject[LowStateMsg]:
         return backpressure(self.unitree_sub_stream(RTC_TOPIC["LOW_STATE"]))
 
@@ -279,7 +281,7 @@ def color(self, color: VUI_COLOR = VUI_COLOR.RED, colortime: int = 60) -> bool:
             },
         )
 
-    @functools.lru_cache(maxsize=None)
+    @simple_mcache
     def raw_video_stream(self) -> Observable[VideoMessage]:
         subject: Subject[VideoMessage] = Subject()
         stop_event = threading.Event()
diff --git a/dimos/robot/unitree_webrtc/modular/connection_module.py b/dimos/robot/unitree_webrtc/modular/connection_module.py
index 1d67e4f596..0a81beed18 100644
--- a/dimos/robot/unitree_webrtc/modular/connection_module.py
+++ b/dimos/robot/unitree_webrtc/modular/connection_module.py
@@ -324,8 +324,7 @@ def deploy_connection(dimos: DimosCluster, **kwargs):
 
     connection.video.transport = LCMTransport("/image", Image)
     connection.lidar.transport = LCMTransport("/lidar", LidarMessage)
-
-    connection.movecmd.transport = LCMTransport("/cmd_vel", Vector3)
+    connection.movecmd.transport = LCMTransport("/cmd_vel", Twist)
     connection.camera_info.transport = LCMTransport("/camera_info", CameraInfo)
 
     return connection
diff --git a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
index 410ad86ad7..4cb57908ef 100644
--- a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
+++ b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
@@ -41,8 +41,6 @@
 def detection_unitree():
     dimos = start(8)
     connection = deploy_connection(dimos)
-    # mapper = deploy_navigation(dimos, connection)
-    # mapper.start()
 
     def goto(pose):
         print("NAVIGATION REQUESTED:", pose)
@@ -76,14 +74,22 @@ def goto(pose):
     # reidModule.detections.connect(detector.detections)
     # reidModule.annotations.transport = LCMTransport("/reid/annotations", ImageAnnotations)
 
+    nav = deploy_navigation(dimos, connection)
+
     person_tracker = dimos.deploy(PersonTracker, cameraInfo=ConnectionModule._camera_info())
     person_tracker.image.connect(connection.video)
     person_tracker.detections.connect(detector.detections)
-    person_tracker.target.transport = LCMTransport("/target", PoseStamped)
+    person_tracker.target.transport = LCMTransport("/goal_request", PoseStamped)
+
+    reid = dimos.deploy(ReidModule)
+
+    reid.image.connect(connection.video)
+    reid.detections.connect(detector.detections)
 
     detector.start()
     person_tracker.start()
     connection.start()
+    reid.start()
 
     from dimos.agents2 import Agent, Output, Reducer, Stream, skill
     from dimos.agents2.cli.human import HumanInput
diff --git a/dimos/robot/unitree_webrtc/modular/navigation.py b/dimos/robot/unitree_webrtc/modular/navigation.py
index c37cac700a..f16fd29816 100644
--- a/dimos/robot/unitree_webrtc/modular/navigation.py
+++ b/dimos/robot/unitree_webrtc/modular/navigation.py
@@ -15,7 +15,7 @@
 from dimos_lcm.std_msgs import Bool, String
 
 from dimos.core import LCMTransport
-from dimos.msgs.geometry_msgs import PoseStamped, Vector3
+from dimos.msgs.geometry_msgs import PoseStamped, Twist, Vector3
 from dimos.msgs.nav_msgs import OccupancyGrid, Path
 from dimos.navigation.bt_navigator.navigator import BehaviorTreeNavigator
 from dimos.navigation.frontier_exploration import WavefrontFrontierExplorer
@@ -27,7 +27,7 @@
 
 
 def deploy_navigation(dimos, connection):
-    mapper = dimos.deploy(Map, voxel_size=0.5, cost_resolution=0.05, global_publish_interval=0.5)
+    mapper = dimos.deploy(Map, voxel_size=0.5, cost_resolution=0.05, global_publish_interval=2.5)
     mapper.lidar.connect(connection.lidar)
     mapper.global_map.transport = LCMTransport("/global_map", LidarMessage)
     mapper.global_costmap.transport = LCMTransport("/global_costmap", OccupancyGrid)
@@ -49,7 +49,7 @@ def deploy_navigation(dimos, connection):
     navigator.navigation_state.transport = LCMTransport("/navigation_state", String)
     navigator.global_costmap.transport = LCMTransport("/global_costmap", OccupancyGrid)
     global_planner.path.transport = LCMTransport("/global_path", Path)
-    local_planner.cmd_vel.transport = LCMTransport("/cmd_vel", Vector3)
+    local_planner.cmd_vel.transport = LCMTransport("/cmd_vel", Twist)
     frontier_explorer.goal_request.transport = LCMTransport("/goal_request", PoseStamped)
     frontier_explorer.goal_reached.transport = LCMTransport("/goal_reached", Bool)
     frontier_explorer.explore_cmd.transport = LCMTransport("/explore_cmd", Bool)
@@ -83,4 +83,11 @@ def deploy_navigation(dimos, connection):
     navigator.start()
     websocket_vis.start()
 
-    return mapper
+    return {
+        "mapper": mapper,
+        "global_planner": global_planner,
+        "local_planner": local_planner,
+        "navigator": navigator,
+        "frontier_explorer": frontier_explorer,
+        "websocket_vis": websocket_vis,
+    }

From 9e6c6d1b944eb8d742a2f2f680c1c3f07e0e5f21 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Tue, 14 Oct 2025 20:08:59 -0700
Subject: [PATCH 30/47] reid experiment

---
 dimos/models/embedding/__init__.py            |   3 +
 dimos/models/embedding/clip.py                |  20 +-
 dimos/models/embedding/mobileclip.py          |   2 -
 .../models/embedding/test_embedding_models.py |  84 +++++---
 dimos/models/embedding/treid.py               | 120 +++++++++++
 dimos/models/embedding/type.py                |   6 +
 .../detection/detectors/person/yolo.py        |  42 ++--
 dimos/perception/detection/detectors/yolo.py  |  26 +--
 dimos/perception/detection/module2D.py        |   2 +-
 dimos/perception/detection/reid/module.py     |  33 +--
 .../perception/detection/reid/test_module.py  |  12 +-
 .../detection/reid/trackAssociator.py         | 189 +++++++++++-------
 dimos/perception/detection/reid/type.py       |   7 +-
 .../unitree_webrtc/modular/ivan_unitree.py    |  13 +-
 dimos/robot/unitree_webrtc/unitree_go2.py     |  56 +++---
 15 files changed, 410 insertions(+), 205 deletions(-)
 create mode 100644 dimos/models/embedding/treid.py

diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
index cad8cd4255..ed6fc69a65 100644
--- a/dimos/models/embedding/__init__.py
+++ b/dimos/models/embedding/__init__.py
@@ -1,5 +1,6 @@
 from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
 from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
+from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
 from dimos.models.embedding.type import Embedding, EmbeddingModel
 
 __all__ = [
@@ -9,4 +10,6 @@
     "CLIPModel",
     "MobileCLIPEmbedding",
     "MobileCLIPModel",
+    "TorchReIDEmbedding",
+    "TorchReIDModel",
 ]
diff --git a/dimos/models/embedding/clip.py b/dimos/models/embedding/clip.py
index 4bb3ce5ec4..ca1cc2fc30 100644
--- a/dimos/models/embedding/clip.py
+++ b/dimos/models/embedding/clip.py
@@ -21,6 +21,8 @@
 from dimos.models.embedding.type import Embedding, EmbeddingModel
 from dimos.msgs.sensor_msgs import Image
 
+_CUDA_INITIALIZED = False
+
 
 class CLIPEmbedding(Embedding): ...
 
@@ -32,7 +34,7 @@ def __init__(
         self,
         model_name: str = "openai/clip-vit-base-patch32",
         device: str | None = None,
-        normalize: bool = True,
+        normalize: bool = False,
     ):
         """
         Initialize CLIP model.
@@ -45,11 +47,9 @@ def __init__(
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.normalize = normalize
 
-        print(f"[DEBUG] CLIPModel.__init__: model_name={model_name}, device={self.device}")
         # Load model and processor
         self.model = HFCLIPModel.from_pretrained(model_name).eval().to(self.device)
         self.processor = CLIPProcessor.from_pretrained(model_name)
-        print(f"[DEBUG] CLIPModel.__init__: COMPLETE")
 
     def embed(self, *images: Image) -> CLIPEmbedding | list[CLIPEmbedding]:
         """Embed one or more images.
@@ -98,6 +98,20 @@ def embed_text(self, *texts: str) -> CLIPEmbedding | list[CLIPEmbedding]:
 
     def warmup(self) -> None:
         """Warmup the model with a dummy forward pass."""
+        # WORKAROUND: HuggingFace CLIP fails with CUBLAS_STATUS_ALLOC_FAILED when it's
+        # the first model to use CUDA. Initialize CUDA context with a dummy operation.
+        # This only needs to happen once per process.
+        global _CUDA_INITIALIZED
+        if self.device == "cuda" and not _CUDA_INITIALIZED:
+            try:
+                # Initialize CUDA with a small matmul operation to setup cuBLAS properly
+                _ = torch.zeros(1, 1, device="cuda") @ torch.zeros(1, 1, device="cuda")
+                torch.cuda.synchronize()
+                _CUDA_INITIALIZED = True
+            except Exception:
+                # If initialization fails, continue anyway - the warmup might still work
+                pass
+
         dummy_image = torch.randn(1, 3, 224, 224).to(self.device)
         dummy_text_inputs = self.processor(text=["warmup"], return_tensors="pt", padding=True).to(
             self.device
diff --git a/dimos/models/embedding/mobileclip.py b/dimos/models/embedding/mobileclip.py
index d952196a48..f3175f8398 100644
--- a/dimos/models/embedding/mobileclip.py
+++ b/dimos/models/embedding/mobileclip.py
@@ -48,7 +48,6 @@ def __init__(
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.normalize = normalize
 
-        print(f"[DEBUG] MobileCLIPModel.__init__: model_name={model_name}, model_path={model_path}, device={self.device}")
         # Load model
         pretrained = str(model_path) if model_path else None
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
@@ -56,7 +55,6 @@ def __init__(
         )
         self.tokenizer = open_clip.get_tokenizer(model_name)
         self.model = self.model.eval().to(self.device)
-        print(f"[DEBUG] MobileCLIPModel.__init__: COMPLETE")
 
     def embed(self, *images: Image) -> MobileCLIPEmbedding | list[MobileCLIPEmbedding]:
         """Embed one or more images.
diff --git a/dimos/models/embedding/test_embedding_models.py b/dimos/models/embedding/test_embedding_models.py
index f9ec892137..bb4403d1eb 100644
--- a/dimos/models/embedding/test_embedding_models.py
+++ b/dimos/models/embedding/test_embedding_models.py
@@ -17,20 +17,21 @@
 
 from dimos.models.embedding.clip import CLIPModel
 from dimos.models.embedding.mobileclip import MobileCLIPModel
+from dimos.models.embedding.treid import TorchReIDModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.utils.data import get_data
 
 
-@pytest.fixture(scope="session", params=["mobileclip", "clip"])
+@pytest.fixture(scope="session", params=["mobileclip", "clip", "treid"])
 def embedding_model(request):
     """Load embedding model once for all tests. Parametrized for different models."""
     if request.param == "mobileclip":
         model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
         model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
-        model.warmup()
     elif request.param == "clip":
         model = CLIPModel(model_name="openai/clip-vit-base-patch32")
-        model.warmup()
+    elif request.param == "treid":
+        model = TorchReIDModel(model_name="osnet_x1_0")
     else:
         raise ValueError(f"Unknown model: {request.param}")
 
@@ -92,6 +93,9 @@ def test_single_text_embedding(embedding_model):
     """Test embedding a single text string."""
     import torch
 
+    if isinstance(embedding_model, TorchReIDModel):
+        pytest.skip("TorchReID does not support text embeddings")
+
     embedding = embedding_model.embed_text("a cafe")
 
     # Should be torch.Tensor
@@ -114,6 +118,9 @@ def test_batch_text_embedding(embedding_model):
     """Test embedding multiple text strings at once."""
     import torch
 
+    if isinstance(embedding_model, TorchReIDModel):
+        pytest.skip("TorchReID does not support text embeddings")
+
     embeddings = embedding_model.embed_text("a cafe", "a person", "a dog")
 
     assert isinstance(embeddings, list), "Batch text embedding should return list"
@@ -129,6 +136,9 @@ def test_batch_text_embedding(embedding_model):
 @pytest.mark.heavy
 def test_text_image_similarity(embedding_model, test_image):
     """Test cross-modal text-image similarity using @ operator."""
+    if isinstance(embedding_model, TorchReIDModel):
+        pytest.skip("TorchReID does not support text embeddings")
+
     img_embedding = embedding_model.embed(test_image)
 
     # Embed text queries
@@ -169,6 +179,9 @@ def test_cosine_distance(embedding_model, test_image):
 @pytest.mark.heavy
 def test_query_functionality(embedding_model, test_image):
     """Test query method for top-k retrieval."""
+    if isinstance(embedding_model, TorchReIDModel):
+        pytest.skip("TorchReID does not support text embeddings")
+
     # Create a query and some candidates
     query_text = embedding_model.embed_text("a cafe")
 
@@ -241,6 +254,9 @@ def test_compare_many_to_many(embedding_model):
     """Test GPU-accelerated many-to-many comparison."""
     import torch
 
+    if isinstance(embedding_model, TorchReIDModel):
+        pytest.skip("TorchReID does not support text embeddings")
+
     # Create queries and candidates
     queries = embedding_model.embed_text("a cafe", "a person")
     candidates = embedding_model.embed_text("a cafe", "a restaurant", "a dog")
@@ -351,33 +367,35 @@ def test_embedding_performance(embedding_model):
     assert all(e.vector is not None for e in batch_embeddings)
 
     # Sanity check: verify embeddings are meaningful by testing text-image similarity
-    print("\n" + "=" * 60)
-    print("Sanity Check: Text-Image Similarity on First Frame")
-    print("=" * 60)
-    first_frame_emb = batch_embeddings[0]
-
-    # Test common object/scene queries
-    test_queries = [
-        "indoor scene",
-        "outdoor scene",
-        "a person",
-        "a dog",
-        "a robot",
-        "grass and trees",
-        "furniture",
-        "a car",
-    ]
-
-    text_embeddings = embedding_model.embed_text(*test_queries)
-    similarities = []
-    for query, text_emb in zip(test_queries, text_embeddings):
-        sim = first_frame_emb @ text_emb
-        similarities.append((query, sim))
-
-    # Sort by similarity
-    similarities.sort(key=lambda x: x[1], reverse=True)
-
-    print("Top matching concepts:")
-    for query, sim in similarities[:5]:
-        print(f"  '{query}': {sim:.4f}")
-    print("=" * 60)
+    # Skip for TorchReID since it doesn't support text embeddings
+    if not isinstance(embedding_model, TorchReIDModel):
+        print("\n" + "=" * 60)
+        print("Sanity Check: Text-Image Similarity on First Frame")
+        print("=" * 60)
+        first_frame_emb = batch_embeddings[0]
+
+        # Test common object/scene queries
+        test_queries = [
+            "indoor scene",
+            "outdoor scene",
+            "a person",
+            "a dog",
+            "a robot",
+            "grass and trees",
+            "furniture",
+            "a car",
+        ]
+
+        text_embeddings = embedding_model.embed_text(*test_queries)
+        similarities = []
+        for query, text_emb in zip(test_queries, text_embeddings):
+            sim = first_frame_emb @ text_emb
+            similarities.append((query, sim))
+
+        # Sort by similarity
+        similarities.sort(key=lambda x: x[1], reverse=True)
+
+        print("Top matching concepts:")
+        for query, sim in similarities[:5]:
+            print(f"  '{query}': {sim:.4f}")
+        print("=" * 60)
diff --git a/dimos/models/embedding/treid.py b/dimos/models/embedding/treid.py
new file mode 100644
index 0000000000..50d69135a0
--- /dev/null
+++ b/dimos/models/embedding/treid.py
@@ -0,0 +1,120 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+from torchreid import utils as torchreid_utils
+
+from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.msgs.sensor_msgs import Image
+
+_CUDA_INITIALIZED = False
+
+
+class TorchReIDEmbedding(Embedding): ...
+
+
+class TorchReIDModel(EmbeddingModel[TorchReIDEmbedding]):
+    """TorchReID embedding model for person re-identification."""
+
+    def __init__(
+        self,
+        model_name: str = "se_resnext101_32x4d",
+        model_path: Path | str | None = None,
+        device: str | None = None,
+        normalize: bool = False,
+    ):
+        """
+        Initialize TorchReID model.
+
+        Args:
+            model_name: Name of the model architecture (e.g., "osnet_x1_0", "osnet_x0_75")
+            model_path: Path to pretrained weights (.pth.tar file)
+            device: Device to run on (cuda/cpu), auto-detects if None
+            normalize: Whether to L2 normalize embeddings
+        """
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.normalize = normalize
+
+        # Load model using torchreid's FeatureExtractor
+        model_path_str = str(model_path) if model_path else ""
+        self.extractor = torchreid_utils.FeatureExtractor(
+            model_name=model_name,
+            model_path=model_path_str,
+            device=self.device,
+        )
+
+    def embed(self, *images: Image) -> TorchReIDEmbedding | list[TorchReIDEmbedding]:
+        """Embed one or more images.
+
+        Returns embeddings as torch.Tensor on device for efficient GPU comparisons.
+        """
+        # Convert to numpy arrays - torchreid expects numpy arrays or file paths
+        np_images = [img.to_opencv() for img in images]
+
+        # Extract features
+        with torch.inference_mode():
+            features = self.extractor(np_images)
+
+            # torchreid may return either numpy array or torch tensor depending on configuration
+            if isinstance(features, torch.Tensor):
+                features_tensor = features.to(self.device)
+            else:
+                features_tensor = torch.from_numpy(features).to(self.device)
+
+            if self.normalize:
+                features_tensor = F.normalize(features_tensor, dim=-1)
+
+        # Create embeddings (keep as torch.Tensor on device)
+        embeddings = []
+        for i, feat in enumerate(features_tensor):
+            timestamp = images[i].ts
+            embeddings.append(TorchReIDEmbedding(vector=feat, timestamp=timestamp))
+
+        return embeddings[0] if len(images) == 1 else embeddings
+
+    def embed_text(self, *texts: str) -> TorchReIDEmbedding | list[TorchReIDEmbedding]:
+        """Text embedding not supported for ReID models.
+
+        TorchReID models are vision-only person re-identification models
+        and do not support text embeddings.
+        """
+        raise NotImplementedError(
+            "TorchReID models are vision-only and do not support text embeddings. "
+            "Use CLIP or MobileCLIP for text-image similarity."
+        )
+
+    def warmup(self) -> None:
+        """Warmup the model with a dummy forward pass."""
+        # WORKAROUND: TorchReID can fail with CUBLAS errors when it's the first model to use CUDA.
+        # Initialize CUDA context with a dummy operation. This only needs to happen once per process.
+        global _CUDA_INITIALIZED
+        if self.device == "cuda" and not _CUDA_INITIALIZED:
+            try:
+                # Initialize CUDA with a small matmul operation to setup cuBLAS properly
+                _ = torch.zeros(1, 1, device="cuda") @ torch.zeros(1, 1, device="cuda")
+                torch.cuda.synchronize()
+                _CUDA_INITIALIZED = True
+            except Exception:
+                # If initialization fails, continue anyway - the warmup might still work
+                pass
+
+        # Create a dummy 256x128 image (typical person ReID input size) as numpy array
+        import numpy as np
+
+        dummy_image = np.random.randint(0, 256, (256, 128, 3), dtype=np.uint8)
+        with torch.inference_mode():
+            _ = self.extractor([dummy_image])
diff --git a/dimos/models/embedding/type.py b/dimos/models/embedding/type.py
index 5a87b2d2d9..7f2e1896b9 100644
--- a/dimos/models/embedding/type.py
+++ b/dimos/models/embedding/type.py
@@ -65,6 +65,12 @@ def to_torch(self, device: str | torch.device | None = None) -> torch.Tensor:
             return self.vector.to(device)
         return self.vector
 
+    def to_cpu(self) -> "Embedding":
+        """Move embedding to CPU, returning self for chaining."""
+        if isinstance(self.vector, torch.Tensor):
+            self.vector = self.vector.cpu()
+        return self
+
 
 E = TypeVar("E", bound="Embedding")
 
diff --git a/dimos/perception/detection/detectors/person/yolo.py b/dimos/perception/detection/detectors/person/yolo.py
index 4c0799dafe..05e79fa22f 100644
--- a/dimos/perception/detection/detectors/person/yolo.py
+++ b/dimos/perception/detection/detectors/person/yolo.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import onnxruntime
 from ultralytics import YOLO
 
 from dimos.msgs.sensor_msgs import Image
@@ -26,32 +25,21 @@
 
 
 class YoloPersonDetector(Detector):
-    def __init__(self, model_path="models_yolo", model_name="yolo11s-pose.pt", device: str = None):
-        """Initialize the YOLO person detector.
+    def __init__(self, model_path="models_yolo", model_name="yolo11n-pose.pt", device: str = None):
+        self.model = YOLO(get_data(model_path) / model_name, task="track")
 
-        Args:
-            model_path (str): Path to the YOLO model weights in tests/data LFS directory
-            model_name (str): Name of the YOLO model weights file
-            device (str): Device to run inference on ('cuda' or 'cpu')
-        """
-        self.model = YOLO(
-            get_data(model_path) / model_name,
-            task="track",
-        )
         self.tracker = get_data(model_path) / "botsort.yaml"
 
         if device:
             self.device = device
             return
+
+        if is_cuda_available():
+            self.device = "cuda"
+            logger.info("Using CUDA for YOLO person detector")
         else:
-            if is_cuda_available():
-                if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
-                    onnxruntime.preload_dlls(cuda=True, cudnn=True)
-                self.device = "cuda"
-                logger.info("Using CUDA for YOLO person detector")
-            else:
-                self.device = "cpu"
-                logger.info("Using CPU for YOLO person detector")
+            self.device = "cpu"
+            logger.info("Using CPU for YOLO person detector")
 
     def process_image(self, image: Image) -> ImageDetections2D:
         """Process image and return detection results.
@@ -71,3 +59,17 @@ def process_image(self, image: Image) -> ImageDetections2D:
             device=self.device,
         )
         return ImageDetections2D.from_ultralytics_result(image, results)
+
+    def stop(self):
+        """
+        Clean up resources used by the detector, including tracker threads.
+        """
+        if hasattr(self.model, "predictor") and self.model.predictor is not None:
+            predictor = self.model.predictor
+            if hasattr(predictor, "trackers") and predictor.trackers:
+                for tracker in predictor.trackers:
+                    if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"):
+                        gmc = tracker.tracker.gmc
+                        if hasattr(gmc, "executor") and gmc.executor is not None:
+                            gmc.executor.shutdown(wait=True)
+            self.model.predictor = None
diff --git a/dimos/perception/detection/detectors/yolo.py b/dimos/perception/detection/detectors/yolo.py
index 459da20579..a338d3c8de 100644
--- a/dimos/perception/detection/detectors/yolo.py
+++ b/dimos/perception/detection/detectors/yolo.py
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-
-import cv2
-import onnxruntime
 from ultralytics import YOLO
 
 from dimos.msgs.sensor_msgs import Image
@@ -29,26 +25,17 @@
 
 
 class Yolo2DDetector(Detector):
-    def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device: str = None):
-        """
-        Initialize the YOLO detector.
-
-        Args:
-            model_path (str): Path to the YOLO model weights in tests/data LFS directory
-            model_name (str): Name of the YOLO model weights file
-            device (str): Device to run inference on ('cuda' or 'cpu')
-        """
-        self.model = YOLO(get_data(model_path) / model_name, task="detect")
-
-        module_dir = os.path.dirname(__file__)
-        self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml")
+    def __init__(self, model_path="models_yolo", model_name="yolo11n.pt", device: str = None):
+        self.model = YOLO(
+            get_data(model_path) / model_name,
+            task="detect",
+        )
 
         if device:
             self.device = device
             return
+
         if is_cuda_available():
-            if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
-                onnxruntime.preload_dlls(cuda=True, cudnn=True)
             self.device = "cuda"
             logger.debug("Using CUDA for YOLO 2d detector")
         else:
@@ -72,7 +59,6 @@ def process_image(self, image: Image) -> ImageDetections2D:
             iou=0.6,
             persist=True,
             verbose=False,
-            tracker=self.tracker_config,
         )
 
         return ImageDetections2D.from_ultralytics_result(image, results)
diff --git a/dimos/perception/detection/module2D.py b/dimos/perception/detection/module2D.py
index d0b2956539..c4b0ba5a43 100644
--- a/dimos/perception/detection/module2D.py
+++ b/dimos/perception/detection/module2D.py
@@ -81,7 +81,7 @@ def sharp_image_stream(self) -> Observable[Image]:
 
     @simple_mcache
     def detection_stream_2d(self) -> Observable[ImageDetections2D]:
-        return backpressure(self.sharp_image_stream().pipe(ops.map(self.process_image_frame)))
+        return backpressure(self.image.observable().pipe(ops.map(self.process_image_frame)))
 
     def pixel_to_3d(
         self,
diff --git a/dimos/perception/detection/reid/module.py b/dimos/perception/detection/reid/module.py
index b70b01399e..722c3e8a38 100644
--- a/dimos/perception/detection/reid/module.py
+++ b/dimos/perception/detection/reid/module.py
@@ -21,7 +21,7 @@
 from reactivex.observable import Observable
 
 from dimos.core import In, Module, ModuleConfig, Out, rpc
-from dimos.models.embedding import MobileCLIPModel
+from dimos.models.embedding import TorchReIDModel
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.vision_msgs import Detection2DArray
@@ -49,12 +49,14 @@ class ReidModule(Module):
     def __init__(self, idsystem: IDSystem | None = None, warmup: bool = True, **kwargs):
         super().__init__(**kwargs)
 
-        # Create default MobileCLIP-based IDSystem if none provided
+        # Create default TorchReID-based IDSystem if none provided
         if idsystem is None:
-            mobileclip_model = MobileCLIPModel()
+            # osnet_x1_0
+            # se_resnet50
+            reid_model = TorchReIDModel()
             if warmup:
-                mobileclip_model.warmup()
-            feature_extractor = EmbeddingFeatureExtractor(model=mobileclip_model, padding=20)
+                reid_model.warmup()
+            feature_extractor = EmbeddingFeatureExtractor(model=reid_model, padding=20)
             idsystem = EmbeddingIDSystem(
                 feature_extractor=feature_extractor,  # type: ignore[arg-type]
                 similarity_threshold=0.75,
@@ -89,6 +91,10 @@ def ingress(self, imageDetections: ImageDetections2D):
                 f"({detection.name}, conf={detection.confidence:.2f})"
             )
 
+            # Skip annotation if not ready yet (long_term_id == -1)
+            if long_term_id == -1:
+                continue
+
             # Create text annotation for long_term_id above the detection
             x1, y1, _, _ = detection.bbox
             font_size = imageDetections.image.width / 60
@@ -104,12 +110,11 @@ def ingress(self, imageDetections: ImageDetections2D):
                 )
             )
 
-        # Publish annotations
-        if text_annotations:
-            annotations = ImageAnnotations(
-                texts=text_annotations,
-                texts_length=len(text_annotations),
-                points=[],
-                points_length=0,
-            )
-            self.annotations.publish(annotations)
+        # Publish annotations (even if empty to clear previous annotations)
+        annotations = ImageAnnotations(
+            texts=text_annotations,
+            texts_length=len(text_annotations),
+            points=[],
+            points_length=0,
+        )
+        self.annotations.publish(annotations)
diff --git a/dimos/perception/detection/reid/test_module.py b/dimos/perception/detection/reid/test_module.py
index 8bd63be65f..05c0ba797d 100644
--- a/dimos/perception/detection/reid/test_module.py
+++ b/dimos/perception/detection/reid/test_module.py
@@ -17,7 +17,7 @@
 import torch
 
 from dimos.core import LCMTransport, start
-from dimos.models.embedding import CLIPModel
+from dimos.models.embedding import TorchReIDModel
 from dimos.msgs.foxglove_msgs import ImageAnnotations
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.vision_msgs import Detection2DArray
@@ -29,12 +29,10 @@
 
 
 def test_reid_ingress():
-    # Clear GPU cache before loading CLIP to avoid OOM
-
-    # Create CLIP-based IDSystem for testing
-    clip_model = CLIPModel(model_name="openai/clip-vit-base-patch32")
-    clip_model.warmup()
-    # feature_extractor = EmbeddingFeatureExtractor(model=clip_model, padding=20)
+    # Create TorchReID-based IDSystem for testing
+    reid_model = TorchReIDModel(model_name="osnet_x1_0")
+    reid_model.warmup()
+    # feature_extractor = EmbeddingFeatureExtractor(model=reid_model, padding=20)
     # idsystem = EmbeddingIDSystem(
     #     feature_extractor=feature_extractor,  # type: ignore[arg-type]
     #     similarity_threshold=0.75,
diff --git a/dimos/perception/detection/reid/trackAssociator.py b/dimos/perception/detection/reid/trackAssociator.py
index 2c3b45aee7..f7d3a53c22 100644
--- a/dimos/perception/detection/reid/trackAssociator.py
+++ b/dimos/perception/detection/reid/trackAssociator.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, List, Set
+from typing import Dict, List, Literal, Set
 
-import torch
-import torch.nn.functional as F
+import numpy as np
 
 from dimos.models.embedding.type import Embedding
 
@@ -24,23 +23,39 @@ class TrackAssociator:
     """Associates short-term track_ids to long-term unique detection IDs via embedding similarity.
 
     Maintains:
-    - Running average embeddings per track_id (on GPU)
+    - All embeddings per track_id (as numpy arrays) for robust group comparison
     - Negative constraints from co-occurrence (tracks in same frame = different objects)
     - Mapping from track_id to unique long-term ID
     """
 
-    def __init__(self, similarity_threshold: float = 0.75):
+    def __init__(
+        self,
+        similarity_threshold: float = 0.8,
+        comparison_mode: Literal["max", "mean", "top_k_mean"] = "top_k_mean",
+        top_k: int = 10,
+        max_embeddings_per_track: int = 500,
+        min_embeddings_for_matching: int = 10,
+    ):
         """Initialize track associator.
 
         Args:
-            model: Embedding model for GPU-accelerated comparisons
             similarity_threshold: Minimum similarity for associating tracks (0-1)
+            comparison_mode: How to aggregate similarities between embedding groups
+                - "max": Use maximum similarity between any pair
+                - "mean": Use mean of all pairwise similarities
+                - "top_k_mean": Use mean of top-k similarities
+            top_k: Number of top similarities to average (if using top_k_mean)
+            max_embeddings_per_track: Maximum number of embeddings to keep per track
+            min_embeddings_for_matching: Minimum embeddings before attempting to match tracks
         """
-        self.similarity_threshold = similarity_threshold
+        self.similarity_threshold = 0.7
+        self.comparison_mode = comparison_mode
+        self.top_k = top_k
+        self.max_embeddings_per_track = max_embeddings_per_track
+        self.min_embeddings_for_matching = min_embeddings_for_matching
 
-        # Track embeddings (running average, kept on GPU)
-        self.track_embeddings: Dict[int, torch.Tensor] = {}
-        self.embedding_counts: Dict[int, int] = {}
+        # Track embeddings (list of all embeddings as numpy arrays)
+        self.track_embeddings: Dict[int, List[np.ndarray]] = {}
 
         # Negative constraints (track_ids that co-occurred = different objects)
         self.negative_pairs: Dict[int, Set[int]] = {}
@@ -53,37 +68,66 @@ def __init__(self, similarity_threshold: float = 0.75):
         self.similarity_history: List[float] = []
 
     def update_embedding(self, track_id: int, new_embedding: Embedding) -> None:
-        """Update running average embedding for a track_id.
+        """Add new embedding to track's embedding collection.
 
         Args:
             track_id: Short-term track ID from detector
-            new_embedding: New embedding to incorporate into average
+            new_embedding: New embedding to add to collection
         """
-        # Convert to torch (infer device from embedding)
-        new_vec = new_embedding.to_torch()
+        # Convert to numpy array (already on CPU from feature extractor)
+        new_vec = new_embedding.to_numpy()
 
-        # Debug: check embedding diversity
-        print(
-            f"Track {track_id}: embedding norm={new_vec.norm().item():.3f}, first 3 values={new_vec[:3].cpu().tolist()}"
-        )
+        # Ensure normalized for cosine similarity
+        norm = np.linalg.norm(new_vec)
+        if norm > 0:
+            new_vec = new_vec / norm
 
-        if track_id in self.track_embeddings:
-            # Running average
-            count = self.embedding_counts[track_id]
-            old_avg = self.track_embeddings[track_id]
+        if track_id not in self.track_embeddings:
+            self.track_embeddings[track_id] = []
+
+        embeddings = self.track_embeddings[track_id]
+        embeddings.append(new_vec)
+
+        # Keep only most recent embeddings if limit exceeded
+        if len(embeddings) > self.max_embeddings_per_track:
+            embeddings.pop(0)  # Remove oldest
+
+    def _compute_group_similarity(
+        self, query_embeddings: List[np.ndarray], candidate_embeddings: List[np.ndarray]
+    ) -> float:
+        """Compute similarity between two groups of embeddings.
+
+        Args:
+            query_embeddings: List of embeddings for query track
+            candidate_embeddings: List of embeddings for candidate track
 
-            # Compute average on GPU
-            new_avg = (old_avg * count + new_vec) / (count + 1)
+        Returns:
+            Aggregated similarity score
+        """
+        # Compute all pairwise similarities efficiently
+        query_matrix = np.stack(query_embeddings)  # [M, D]
+        candidate_matrix = np.stack(candidate_embeddings)  # [N, D]
+
+        # Cosine similarity via matrix multiplication (already normalized)
+        similarities = query_matrix @ candidate_matrix.T  # [M, N]
 
-            # Re-normalize (important for cosine similarity)
-            new_avg = F.normalize(new_avg, dim=-1)
+        if self.comparison_mode == "max":
+            # Maximum similarity across all pairs
+            return float(np.max(similarities))
+
+        elif self.comparison_mode == "mean":
+            # Mean of all pairwise similarities
+            return float(np.mean(similarities))
+
+        elif self.comparison_mode == "top_k_mean":
+            # Mean of top-k similarities
+            flat_sims = similarities.flatten()
+            k = min(self.top_k, len(flat_sims))
+            top_k_sims = np.partition(flat_sims, -k)[-k:]
+            return float(np.mean(top_k_sims))
 
-            self.track_embeddings[track_id] = new_avg
-            self.embedding_counts[track_id] += 1
         else:
-            # First embedding for this track (normalize for consistency)
-            self.track_embeddings[track_id] = F.normalize(new_vec, dim=-1)
-            self.embedding_counts[track_id] = 1
+            raise ValueError(f"Unknown comparison mode: {self.comparison_mode}")
 
     def add_negative_constraints(self, track_ids: List[int]) -> None:
         """Record that these track_ids co-occurred in same frame (different objects).
@@ -104,70 +148,81 @@ def associate(self, track_id: int) -> int:
             track_id: Short-term track ID to associate
 
         Returns:
-            Long-term unique detection ID, or -1 if not ready yet
+            Long-term unique detection ID
         """
         # Already has assignment
         if track_id in self.track_to_long_term:
             return self.track_to_long_term[track_id]
 
-        # Need embedding to compare
-        if track_id not in self.track_embeddings:
-            return -1  # Not ready yet
+        # Need embeddings to compare
+        if track_id not in self.track_embeddings or not self.track_embeddings[track_id]:
+            # Create new ID if no embeddings yet
+            new_id = self.long_term_counter
+            self.long_term_counter += 1
+            self.track_to_long_term[track_id] = new_id
+            return new_id
 
-        # Build candidate list (only tracks with assigned long_term_ids)
-        query_vec = self.track_embeddings[track_id]
+        # Get query embeddings
+        query_embeddings = self.track_embeddings[track_id]
+
+        # Don't attempt matching until we have enough embeddings for the query track
+        if len(query_embeddings) < self.min_embeddings_for_matching:
+            # Not ready yet - return -1
+            return -1
 
-        candidates = []
-        candidate_track_ids = []
+        # Build candidate list (only tracks with assigned long_term_ids)
+        best_similarity = -1.0
+        best_track_id = None
 
-        for other_tid, other_vec in self.track_embeddings.items():
+        for other_tid, other_embeddings in self.track_embeddings.items():
             # Skip self
             if other_tid == track_id:
                 continue
+
             # Skip if negative constraint (co-occurred)
             if other_tid in self.negative_pairs.get(track_id, set()):
                 continue
+
             # Skip if no long_term_id yet
             if other_tid not in self.track_to_long_term:
                 continue
 
-            candidates.append(other_vec)
-            candidate_track_ids.append(other_tid)
-
-        if candidates:
-            # GPU-accelerated comparison (single matrix multiplication)
-            candidate_stack = torch.stack(candidates)  # [N, D]
-            similarities = query_vec @ candidate_stack.T  # [N]
-
-            # Find best match
-            best_sim, best_idx = similarities.max(dim=0)
-            best_sim_value = best_sim.item()  # Move to CPU only for comparison
+            # Skip if not enough embeddings
+            if len(other_embeddings) < self.min_embeddings_for_matching:
+                continue
 
-            # Debug: show similarity values and check for exact match
-            matched_track_id = candidate_track_ids[best_idx]
-            matched_long_term_id = self.track_to_long_term[matched_track_id]
+            # Compute group similarity
+            similarity = self._compute_group_similarity(query_embeddings, other_embeddings)
 
-            # Check if embeddings are actually identical
-            matched_vec = self.track_embeddings[matched_track_id]
-            diff = (query_vec - matched_vec).abs().max().item()
+            if similarity > best_similarity:
+                best_similarity = similarity
+                best_track_id = other_tid
 
+        # Check if best match exceeds threshold
+        if best_track_id is not None and best_similarity >= self.similarity_threshold:
+            matched_long_term_id = self.track_to_long_term[best_track_id]
             print(
-                f"Track {track_id}: best similarity = {best_sim_value:.6f} with track {matched_track_id} "
-                f"(long_term_id={matched_long_term_id}, max_diff={diff:.6f}, counts: {self.embedding_counts[track_id]} vs {self.embedding_counts[matched_track_id]})"
+                f"Track {track_id}: matched with track {best_track_id} "
+                f"(long_term_id={matched_long_term_id}, similarity={best_similarity:.4f}, "
+                f"mode={self.comparison_mode}, embeddings: {len(query_embeddings)} vs {len(self.track_embeddings[best_track_id])}), threshold: {self.similarity_threshold}"
             )
 
-            # Track similarity distribution (for future adaptive thresholding)
-            self.similarity_history.append(best_sim_value)
+            # Track similarity history
+            self.similarity_history.append(best_similarity)
 
-            if best_sim_value >= self.similarity_threshold:
-                # Associate with existing long_term_id
-                matched_track_id = candidate_track_ids[best_idx]
-                long_term_id = self.track_to_long_term[matched_track_id]
-                self.track_to_long_term[track_id] = long_term_id
-                return long_term_id
+            # Associate with existing long_term_id
+            self.track_to_long_term[track_id] = matched_long_term_id
+            return matched_long_term_id
 
         # Create new unique detection ID
         new_id = self.long_term_counter
         self.long_term_counter += 1
         self.track_to_long_term[track_id] = new_id
+
+        if best_track_id is not None:
+            print(
+                f"Track {track_id}: creating new ID {new_id} "
+                f"(best similarity={best_similarity:.4f} below threshold={self.similarity_threshold})"
+            )
+
         return new_id
diff --git a/dimos/perception/detection/reid/type.py b/dimos/perception/detection/reid/type.py
index 6fc1d2ff3c..1cb10724a0 100644
--- a/dimos/perception/detection/reid/type.py
+++ b/dimos/perception/detection/reid/type.py
@@ -20,7 +20,6 @@
 from dimos.models.embedding.type import Embedding, EmbeddingModel
 from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
 
-
 E = TypeVar("E", bound="Embedding")
 F = TypeVar("F")  # Generic feature type
 
@@ -45,7 +44,7 @@ def extract(self, detection: Detection2DBBox) -> F:
 class EmbeddingFeatureExtractor(FeatureExtractor[E], Generic[E]):
     """Feature extractor that uses an embedding model to extract features from detection crops."""
 
-    def __init__(self, model: EmbeddingModel[E], padding: int = 20):
+    def __init__(self, model: EmbeddingModel[E], padding: int = 0):
         """
         Initialize embedding feature extractor.
 
@@ -64,11 +63,13 @@ def extract(self, detection: Detection2DBBox) -> E:
             detection: Detection to extract embedding from
 
         Returns:
-            Embedding feature
+            Embedding feature (moved to CPU to save GPU memory)
         """
         cropped_image = detection.cropped_image(padding=self.padding)
         embedding = self.model.embed(cropped_image)
         assert not isinstance(embedding, list), "Expected single embedding for single image"
+        # Move embedding to CPU immediately to free GPU memory
+        embedding = embedding.to_cpu()
         return embedding
 
 
diff --git a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
index 4cb57908ef..948dccaa16 100644
--- a/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
+++ b/dimos/robot/unitree_webrtc/modular/ivan_unitree.py
@@ -74,20 +74,21 @@ def goto(pose):
     # reidModule.detections.connect(detector.detections)
     # reidModule.annotations.transport = LCMTransport("/reid/annotations", ImageAnnotations)
 
-    nav = deploy_navigation(dimos, connection)
+    # nav = deploy_navigation(dimos, connection)
 
-    person_tracker = dimos.deploy(PersonTracker, cameraInfo=ConnectionModule._camera_info())
-    person_tracker.image.connect(connection.video)
-    person_tracker.detections.connect(detector.detections)
-    person_tracker.target.transport = LCMTransport("/goal_request", PoseStamped)
+    # person_tracker = dimos.deploy(PersonTracker, cameraInfo=ConnectionModule._camera_info())
+    # person_tracker.image.connect(connection.video)
+    # person_tracker.detections.connect(detector.detections)
+    # person_tracker.target.transport = LCMTransport("/goal_request", PoseStamped)
 
     reid = dimos.deploy(ReidModule)
 
     reid.image.connect(connection.video)
     reid.detections.connect(detector.detections)
+    reid.annotations.transport = LCMTransport("/reid/annotations", ImageAnnotations)
 
     detector.start()
-    person_tracker.start()
+    # person_tracker.start()
     connection.start()
     reid.start()
 
diff --git a/dimos/robot/unitree_webrtc/unitree_go2.py b/dimos/robot/unitree_webrtc/unitree_go2.py
index 3c05062149..529207913d 100644
--- a/dimos/robot/unitree_webrtc/unitree_go2.py
+++ b/dimos/robot/unitree_webrtc/unitree_go2.py
@@ -22,50 +22,48 @@
 import warnings
 from typing import Optional
 
+from dimos_lcm.sensor_msgs import CameraInfo
+from dimos_lcm.std_msgs import Bool, String
 from reactivex import Observable
 
 from dimos import core
 from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE
 from dimos.core import In, Module, Out, rpc
 from dimos.mapping.types import LatLon
-from dimos.msgs.std_msgs import Header
-from dimos.msgs.geometry_msgs import PoseStamped, Transform, Twist, Vector3, Quaternion
+from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Twist, Vector3
 from dimos.msgs.nav_msgs import OccupancyGrid, Path
 from dimos.msgs.sensor_msgs import Image
+from dimos.msgs.std_msgs import Header
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos_lcm.std_msgs import String
-from dimos_lcm.sensor_msgs import CameraInfo
-from dimos.perception.spatial_perception import SpatialMemory
+from dimos.navigation.bbox_navigation import BBoxNavigationModule
+from dimos.navigation.bt_navigator.navigator import BehaviorTreeNavigator, NavigatorState
+from dimos.navigation.frontier_exploration import WavefrontFrontierExplorer
+from dimos.navigation.global_planner import AstarPlanner
+from dimos.navigation.local_planner.holonomic_local_planner import HolonomicLocalPlanner
 from dimos.perception.common.utils import (
     load_camera_info,
     load_camera_info_opencv,
     rectify_image,
 )
+from dimos.perception.object_tracker_2d import ObjectTracker2D
+from dimos.perception.spatial_perception import SpatialMemory
 from dimos.protocol import pubsub
 from dimos.protocol.pubsub.lcmpubsub import LCM, Topic
 from dimos.protocol.tf import TF
 from dimos.robot.foxglove_bridge import FoxgloveBridge
-from dimos.utils.monitoring import UtilizationModule
-from dimos.web.websocket_vis.websocket_vis_module import WebsocketVisModule
-from dimos.navigation.global_planner import AstarPlanner
-from dimos.navigation.local_planner.holonomic_local_planner import HolonomicLocalPlanner
-from dimos.navigation.bt_navigator.navigator import BehaviorTreeNavigator, NavigatorState
-from dimos.navigation.frontier_exploration import WavefrontFrontierExplorer
+from dimos.robot.robot import UnitreeRobot
 from dimos.robot.unitree_webrtc.connection import UnitreeWebRTCConnection
 from dimos.robot.unitree_webrtc.type.lidar import LidarMessage
 from dimos.robot.unitree_webrtc.type.map import Map
 from dimos.robot.unitree_webrtc.type.odometry import Odometry
 from dimos.robot.unitree_webrtc.unitree_skills import MyUnitreeSkills
 from dimos.skills.skills import AbstractRobotSkill, SkillLibrary
+from dimos.types.robot_capabilities import RobotCapability
 from dimos.utils.data import get_data
 from dimos.utils.logging_config import setup_logger
+from dimos.utils.monitoring import UtilizationModule
 from dimos.utils.testing import TimedSensorReplay
-from dimos.perception.object_tracker_2d import ObjectTracker2D
-from dimos.navigation.bbox_navigation import BBoxNavigationModule
-from dimos_lcm.std_msgs import Bool
-from dimos.robot.robot import UnitreeRobot
-from dimos.types.robot_capabilities import RobotCapability
-
+from dimos.web.websocket_vis.websocket_vis_module import WebsocketVisModule
 
 logger = setup_logger("dimos.robot.unitree_webrtc.unitree_go2", level=logging.INFO)
 
@@ -387,10 +385,10 @@ def start(self):
         self._deploy_connection()
         self._deploy_mapping()
         self._deploy_navigation()
-        # self._deploy_visualization()
+        self._deploy_visualization()
         self._deploy_foxglove_bridge()
-        self._deploy_perception()
         self._deploy_camera()
+        # self._deploy_perception()
 
         self._start_modules()
 
@@ -568,11 +566,11 @@ def _deploy_camera(self):
             logger.info("Object tracker connected to camera")
 
         # Connect bbox navigator inputs
-        if self.bbox_navigator:
-            self.bbox_navigator.detection2d.connect(self.object_tracker.detection2darray)
-            self.bbox_navigator.camera_info.connect(self.connection.camera_info)
-            self.bbox_navigator.goal_request.connect(self.navigator.goal_request)
-            logger.info("BBox navigator connected")
+        # if self.bbox_navigator:
+        #    self.bbox_navigator.detection2d.connect(self.object_tracker.detection2darray)
+        #    self.bbox_navigator.camera_info.connect(self.connection.camera_info)
+        #    self.bbox_navigator.goal_request.connect(self.navigator.goal_request)
+        #    logger.info("BBox navigator connected")
 
     def _start_modules(self):
         """Start all deployed modules in the correct order."""
@@ -582,12 +580,12 @@ def _start_modules(self):
         self.local_planner.start()
         self.navigator.start()
         self.frontier_explorer.start()
-        # self.websocket_vis.start()
+        self.websocket_vis.start()
         self.foxglove_bridge.start()
-        self.spatial_memory_module.start()
-        self.object_tracker.start()
-        self.bbox_navigator.start()
-        self.utilization_module.start()
+        # self.spatial_memory_module.start()
+        # self.object_tracker.start()
+        # self.bbox_navigator.start()
+        # self.utilization_module.start()
 
         # Initialize skills after connection is established
         if self.skill_library is not None:

From a2813e8ff3d5fba47a98d324b17392f77ed714e5 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Tue, 14 Oct 2025 21:50:38 -0700
Subject: [PATCH 31/47] reid simplification

---
 dimos/perception/detection/reid/__init__.py   |  14 +--
 ...ckAssociator.py => embedding_id_system.py} |  49 ++++++--
 dimos/perception/detection/reid/module.py     |  26 +----
 .../perception/detection/reid/test_module.py  |   9 +-
 .../detection/reid/test_trackAssociator.py    |  14 +--
 dimos/perception/detection/reid/type.py       | 107 +-----------------
 6 files changed, 62 insertions(+), 157 deletions(-)
 rename dimos/perception/detection/reid/{trackAssociator.py => embedding_id_system.py} (82%)

diff --git a/dimos/perception/detection/reid/__init__.py b/dimos/perception/detection/reid/__init__.py
index f4145897b3..b76741a7eb 100644
--- a/dimos/perception/detection/reid/__init__.py
+++ b/dimos/perception/detection/reid/__init__.py
@@ -1,20 +1,12 @@
 from dimos.perception.detection.reid.module import Config, ReidModule
-from dimos.perception.detection.reid.type import (
-    EmbeddingFeatureExtractor,
-    EmbeddingIDSystem,
-    FeatureExtractor,
-    IDSystem,
-    PassthroughIDSystem,
-)
+from dimos.perception.detection.reid.embedding_id_system import EmbeddingIDSystem
+from dimos.perception.detection.reid.type import IDSystem, PassthroughIDSystem
 
 __all__ = [
-    # Feature Extractors
-    "FeatureExtractor",
-    "EmbeddingFeatureExtractor",
     # ID Systems
     "IDSystem",
-    "EmbeddingIDSystem",
     "PassthroughIDSystem",
+    "EmbeddingIDSystem",
     # Module
     "ReidModule",
     "Config",
diff --git a/dimos/perception/detection/reid/trackAssociator.py b/dimos/perception/detection/reid/embedding_id_system.py
similarity index 82%
rename from dimos/perception/detection/reid/trackAssociator.py
rename to dimos/perception/detection/reid/embedding_id_system.py
index f7d3a53c22..15ee5a44d6 100644
--- a/dimos/perception/detection/reid/trackAssociator.py
+++ b/dimos/perception/detection/reid/embedding_id_system.py
@@ -12,14 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, List, Literal, Set
+from typing import Callable, Dict, List, Literal, Set
 
 import numpy as np
 
-from dimos.models.embedding.type import Embedding
+from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.perception.detection.reid.type import IDSystem
+from dimos.perception.detection.type import Detection2DBBox
 
 
-class TrackAssociator:
+class EmbeddingIDSystem(IDSystem):
     """Associates short-term track_ids to long-term unique detection IDs via embedding similarity.
 
     Maintains:
@@ -30,15 +32,19 @@ class TrackAssociator:
 
     def __init__(
         self,
-        similarity_threshold: float = 0.8,
+        model: Callable[[], EmbeddingModel[Embedding]],
+        padding: int = 0,
+        similarity_threshold: float = 0.63,
         comparison_mode: Literal["max", "mean", "top_k_mean"] = "top_k_mean",
-        top_k: int = 10,
+        top_k: int = 30,
         max_embeddings_per_track: int = 500,
         min_embeddings_for_matching: int = 10,
     ):
         """Initialize track associator.
 
         Args:
+            model: Callable (class or function) that returns an embedding model for feature extraction
+            padding: Padding to add around detection bbox when cropping (default: 0)
             similarity_threshold: Minimum similarity for associating tracks (0-1)
             comparison_mode: How to aggregate similarities between embedding groups
                 - "max": Use maximum similarity between any pair
@@ -48,7 +54,15 @@ def __init__(
             max_embeddings_per_track: Maximum number of embeddings to keep per track
             min_embeddings_for_matching: Minimum embeddings before attempting to match tracks
         """
-        self.similarity_threshold = 0.7
+        # Call model factory (class or function) to get model instance
+        self.model = model()
+
+        # Call warmup if available
+        if hasattr(self.model, "warmup"):
+            self.model.warmup()
+
+        self.padding = padding
+        self.similarity_threshold = similarity_threshold
         self.comparison_mode = comparison_mode
         self.top_k = top_k
         self.max_embeddings_per_track = max_embeddings_per_track
@@ -67,6 +81,27 @@ def __init__(
         # Similarity history for optional adaptive thresholding
         self.similarity_history: List[float] = []
 
+    def register_detection(self, detection: Detection2DBBox) -> int:
+        """
+        Register detection and return long-term ID.
+
+        Args:
+            detection: Detection to register
+
+        Returns:
+            Long-term unique ID for this detection
+        """
+        # Extract embedding from detection's cropped image
+        cropped_image = detection.cropped_image(padding=self.padding)
+        embedding = self.model.embed(cropped_image)
+        assert not isinstance(embedding, list), "Expected single embedding for single image"
+        # Move embedding to CPU immediately to free GPU memory
+        embedding = embedding.to_cpu()
+
+        # Update and associate track
+        self.update_embedding(detection.track_id, embedding)
+        return self.associate(detection.track_id)
+
     def update_embedding(self, track_id: int, new_embedding: Embedding) -> None:
         """Add new embedding to track's embedding collection.
 
@@ -222,7 +257,7 @@ def associate(self, track_id: int) -> int:
         if best_track_id is not None:
             print(
                 f"Track {track_id}: creating new ID {new_id} "
-                f"(best similarity={best_similarity:.4f} below threshold={self.similarity_threshold})"
+                f"(best similarity={best_similarity:.4f} with id={self.track_to_long_term[best_track_id]} below threshold={self.similarity_threshold})"
             )
 
         return new_id
diff --git a/dimos/perception/detection/reid/module.py b/dimos/perception/detection/reid/module.py
index 722c3e8a38..ac5003a2eb 100644
--- a/dimos/perception/detection/reid/module.py
+++ b/dimos/perception/detection/reid/module.py
@@ -25,11 +25,8 @@
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection.reid.type import (
-    EmbeddingFeatureExtractor,
-    EmbeddingIDSystem,
-    IDSystem,
-)
+from dimos.perception.detection.reid.embedding_id_system import EmbeddingIDSystem
+from dimos.perception.detection.reid.type import IDSystem
 from dimos.perception.detection.type import ImageDetections2D
 from dimos.types.timestamped import align_timestamped, to_ros_stamp
 from dimos.utils.reactive import backpressure
@@ -46,21 +43,10 @@ class ReidModule(Module):
     image: In[Image] = None  # type: ignore
     annotations: Out[ImageAnnotations] = None  # type: ignore
 
-    def __init__(self, idsystem: IDSystem | None = None, warmup: bool = True, **kwargs):
+    def __init__(self, idsystem: IDSystem | None = None, **kwargs):
         super().__init__(**kwargs)
-
-        # Create default TorchReID-based IDSystem if none provided
         if idsystem is None:
-            # osnet_x1_0
-            # se_resnet50
-            reid_model = TorchReIDModel()
-            if warmup:
-                reid_model.warmup()
-            feature_extractor = EmbeddingFeatureExtractor(model=reid_model, padding=20)
-            idsystem = EmbeddingIDSystem(
-                feature_extractor=feature_extractor,  # type: ignore[arg-type]
-                similarity_threshold=0.75,
-            )
+            idsystem = EmbeddingIDSystem(model=TorchReIDModel, padding=0)
 
         self.idsystem = idsystem
 
@@ -86,10 +72,6 @@ def ingress(self, imageDetections: ImageDetections2D):
         for detection in imageDetections:
             # Register detection and get long-term ID
             long_term_id = self.idsystem.register_detection(detection)
-            print(
-                f"track_id={detection.track_id} -> long_term_id={long_term_id} "
-                f"({detection.name}, conf={detection.confidence:.2f})"
-            )
 
             # Skip annotation if not ready yet (long_term_id == -1)
             if long_term_id == -1:
diff --git a/dimos/perception/detection/reid/test_module.py b/dimos/perception/detection/reid/test_module.py
index 05c0ba797d..9747ce5cbe 100644
--- a/dimos/perception/detection/reid/test_module.py
+++ b/dimos/perception/detection/reid/test_module.py
@@ -22,19 +22,16 @@
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.vision_msgs import Detection2DArray
 from dimos.perception.detection.reid.module import ReidModule
-from dimos.perception.detection.reid.type import (
-    EmbeddingFeatureExtractor,
-    EmbeddingIDSystem,
-)
+from dimos.perception.detection.reid.embedding_id_system import EmbeddingIDSystem
 
 
 def test_reid_ingress():
     # Create TorchReID-based IDSystem for testing
     reid_model = TorchReIDModel(model_name="osnet_x1_0")
     reid_model.warmup()
-    # feature_extractor = EmbeddingFeatureExtractor(model=reid_model, padding=20)
     # idsystem = EmbeddingIDSystem(
-    #     feature_extractor=feature_extractor,  # type: ignore[arg-type]
+    #     model=lambda: reid_model,
+    #     padding=20,
     #     similarity_threshold=0.75,
     # )
 
diff --git a/dimos/perception/detection/reid/test_trackAssociator.py b/dimos/perception/detection/reid/test_trackAssociator.py
index 9c0783af61..2aa54ee2ee 100644
--- a/dimos/perception/detection/reid/test_trackAssociator.py
+++ b/dimos/perception/detection/reid/test_trackAssociator.py
@@ -17,7 +17,7 @@
 
 from dimos.models.embedding.mobileclip import MobileCLIPModel
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.reid.trackAssociator import TrackAssociator
+from dimos.perception.detection.reid.embedding_id_system import EmbeddingIDSystem
 from dimos.utils.data import get_data
 
 
@@ -31,9 +31,9 @@ def mobileclip_model():
 
 
 @pytest.fixture
-def track_associator():
-    """Create fresh TrackAssociator for each test."""
-    return TrackAssociator(similarity_threshold=0.75)
+def track_associator(mobileclip_model):
+    """Create fresh EmbeddingIDSystem for each test."""
+    return EmbeddingIDSystem(model=lambda: mobileclip_model, similarity_threshold=0.75)
 
 
 @pytest.fixture(scope="session")
@@ -226,10 +226,10 @@ def test_gpu_performance(track_associator, mobileclip_model, test_image):
 
 
 @pytest.mark.heavy
-def test_similarity_threshold_configurable():
+def test_similarity_threshold_configurable(mobileclip_model):
     """Test that similarity threshold is configurable."""
-    associator_strict = TrackAssociator(similarity_threshold=0.95)
-    associator_loose = TrackAssociator(similarity_threshold=0.50)
+    associator_strict = EmbeddingIDSystem(model=lambda: mobileclip_model, similarity_threshold=0.95)
+    associator_loose = EmbeddingIDSystem(model=lambda: mobileclip_model, similarity_threshold=0.50)
 
     assert associator_strict.similarity_threshold == 0.95
     assert associator_loose.similarity_threshold == 0.50
diff --git a/dimos/perception/detection/reid/type.py b/dimos/perception/detection/reid/type.py
index 1cb10724a0..0ef2da961c 100644
--- a/dimos/perception/detection/reid/type.py
+++ b/dimos/perception/detection/reid/type.py
@@ -15,75 +15,12 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Generic, TypeVar
 
-from dimos.models.embedding.type import Embedding, EmbeddingModel
 from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
 
-E = TypeVar("E", bound="Embedding")
-F = TypeVar("F")  # Generic feature type
 
-
-class FeatureExtractor(ABC, Generic[F]):
-    """Abstract base class for extracting features from detections."""
-
-    @abstractmethod
-    def extract(self, detection: Detection2DBBox) -> F:
-        """
-        Extract feature from a detection.
-
-        Args:
-            detection: Detection to extract features from
-
-        Returns:
-            Extracted feature of type F
-        """
-        pass
-
-
-class EmbeddingFeatureExtractor(FeatureExtractor[E], Generic[E]):
-    """Feature extractor that uses an embedding model to extract features from detection crops."""
-
-    def __init__(self, model: EmbeddingModel[E], padding: int = 0):
-        """
-        Initialize embedding feature extractor.
-
-        Args:
-            model: Embedding model to use for feature extraction
-            padding: Padding to add around detection bbox when cropping (default: 0)
-        """
-        self.model = model
-        self.padding = padding
-
-    def extract(self, detection: Detection2DBBox) -> E:
-        """
-        Extract embedding from detection's cropped image.
-
-        Args:
-            detection: Detection to extract embedding from
-
-        Returns:
-            Embedding feature (moved to CPU to save GPU memory)
-        """
-        cropped_image = detection.cropped_image(padding=self.padding)
-        embedding = self.model.embed(cropped_image)
-        assert not isinstance(embedding, list), "Expected single embedding for single image"
-        # Move embedding to CPU immediately to free GPU memory
-        embedding = embedding.to_cpu()
-        return embedding
-
-
-class IDSystem(ABC, Generic[F]):
-    """Abstract base class for ID assignment systems using features."""
-
-    def __init__(self, feature_extractor: FeatureExtractor[F]):
-        """
-        Initialize ID system with feature extractor.
-
-        Args:
-            feature_extractor: Feature extractor to use for detection features
-        """
-        self.feature_extractor = feature_extractor
+class IDSystem(ABC):
+    """Abstract base class for ID assignment systems."""
 
     def register_detections(self, detections: ImageDetections2D) -> None:
         """Register multiple detections."""
@@ -105,47 +42,9 @@ def register_detection(self, detection: Detection2DBBox) -> int:
         ...
 
 
-class PassthroughIDSystem(IDSystem[F]):
+class PassthroughIDSystem(IDSystem):
     """Simple ID system that returns track_id with no object permanence."""
 
-    def __init__(self, feature_extractor: FeatureExtractor[F] | None = None):
-        """
-        Initialize passthrough ID system.
-
-        Args:
-            feature_extractor: Optional feature extractor (not used, for interface compatibility)
-        """
-        # Don't call super().__init__ since we don't need feature_extractor
-        self.feature_extractor = feature_extractor  # type: ignore
-
     def register_detection(self, detection: Detection2DBBox) -> int:
         """Return detection's track_id as long-term ID (no permanence)."""
         return detection.track_id
-
-
-class EmbeddingIDSystem(IDSystem[Embedding]):
-    """ID system using embedding similarity for object permanence."""
-
-    def __init__(
-        self,
-        feature_extractor: FeatureExtractor[Embedding],
-        similarity_threshold: float = 0.75,
-    ):
-        """
-        Initialize embedding-based ID system.
-
-        Args:
-            feature_extractor: Feature extractor for embeddings
-            similarity_threshold: Minimum similarity for associating tracks (0-1)
-        """
-        super().__init__(feature_extractor)
-
-        # Import here to avoid circular dependency
-        from dimos.perception.detection.reid.trackAssociator import TrackAssociator
-
-        self.associator = TrackAssociator(similarity_threshold=similarity_threshold)
-
-    def register_detection(self, detection: Detection2DBBox) -> int:
-        embedding = self.feature_extractor.extract(detection)
-        self.associator.update_embedding(detection.track_id, embedding)
-        return self.associator.associate(detection.track_id)

From 7687619aef1f6840bbd275186b8909f4b9a838d5 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Tue, 14 Oct 2025 22:10:29 -0700
Subject: [PATCH 32/47] disabling single test for now

---
 .../detection/type/detection3d/test_imageDetections3DPC.py     | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
index 5173646953..0b962e0d4a 100644
--- a/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
+++ b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
 
+
+@pytest.mark.heavy
 def test_to_foxglove_scene_update(get_moment_3dpc):
     """Test conversion of ImageDetections3DPC to Foxglove SceneUpdate."""
     moment = get_moment_3dpc(seek=10.0)

From c13395f275274fc845fa1ddd83fbc0fba3ca5129 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Tue, 14 Oct 2025 22:13:14 -0700
Subject: [PATCH 33/47] removing garbage files

---
 dimos/perception/detection/.claude/settings.local.json |  9 ---------
 .../detection/type/.claude/settings.local.json         | 10 ----------
 2 files changed, 19 deletions(-)
 delete mode 100644 dimos/perception/detection/.claude/settings.local.json
 delete mode 100644 dimos/perception/detection/type/.claude/settings.local.json

diff --git a/dimos/perception/detection/.claude/settings.local.json b/dimos/perception/detection/.claude/settings.local.json
deleted file mode 100644
index 060f1e47cd..0000000000
--- a/dimos/perception/detection/.claude/settings.local.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Read(//home/lesh/coding/dimensional/dimos/dimos/**)"
-    ],
-    "deny": [],
-    "ask": []
-  }
-}
diff --git a/dimos/perception/detection/type/.claude/settings.local.json b/dimos/perception/detection/type/.claude/settings.local.json
deleted file mode 100644
index f3e68a36e6..0000000000
--- a/dimos/perception/detection/type/.claude/settings.local.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(pytest:*)",
-      "Bash(grep:*)",
-      "Read(//home/lesh/coding/dimensional/dimos/dimos/perception/detection2d/**)"
-    ],
-    "deny": []
-  }
-}

From 451b30989f8aa021b5f4d80069e8d332b5896d4c Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Tue, 14 Oct 2025 22:18:11 -0700
Subject: [PATCH 34/47] correct test naming

---
 .../{test_trackAssociator.py => test_embedding_id_system.py}    | 0
 .../detection/type/detection3d/test_imageDetections3DPC.py      | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename dimos/perception/detection/reid/{test_trackAssociator.py => test_embedding_id_system.py} (100%)

diff --git a/dimos/perception/detection/reid/test_trackAssociator.py b/dimos/perception/detection/reid/test_embedding_id_system.py
similarity index 100%
rename from dimos/perception/detection/reid/test_trackAssociator.py
rename to dimos/perception/detection/reid/test_embedding_id_system.py
diff --git a/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
index 0b962e0d4a..fb5608b9ab 100644
--- a/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
+++ b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
@@ -15,7 +15,7 @@
 import pytest
 
 
-@pytest.mark.heavy
+@pytest.mark.skip
 def test_to_foxglove_scene_update(get_moment_3dpc):
     """Test conversion of ImageDetections3DPC to Foxglove SceneUpdate."""
     moment = get_moment_3dpc(seek=10.0)

From 5f810fbcdcb22f2345f9d9cc06fe2124f41d0371 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Wed, 15 Oct 2025 11:20:43 -0700
Subject: [PATCH 35/47] renamde type.py -> base.py for embedding models

---
 .gitignore                                             | 1 +
 dimos/models/embedding/__init__.py                     | 2 +-
 dimos/models/embedding/{type.py => base.py}            | 0
 dimos/models/embedding/clip.py                         | 2 +-
 dimos/models/embedding/mobileclip.py                   | 2 +-
 dimos/models/embedding/treid.py                        | 2 +-
 dimos/perception/detection/reid/embedding_id_system.py | 2 +-
 7 files changed, 6 insertions(+), 5 deletions(-)
 rename dimos/models/embedding/{type.py => base.py} (100%)

diff --git a/.gitignore b/.gitignore
index 12cb51509a..18fd575c85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,3 +48,4 @@ yolo11n.pt
 
 # symlink one of .envrc.* if you'd like to use
 .envrc
+.claude
diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
index ed6fc69a65..a8f3784ca5 100644
--- a/dimos/models/embedding/__init__.py
+++ b/dimos/models/embedding/__init__.py
@@ -1,7 +1,7 @@
+from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
 from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
 from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
-from dimos.models.embedding.type import Embedding, EmbeddingModel
 
 __all__ = [
     "Embedding",
diff --git a/dimos/models/embedding/type.py b/dimos/models/embedding/base.py
similarity index 100%
rename from dimos/models/embedding/type.py
rename to dimos/models/embedding/base.py
diff --git a/dimos/models/embedding/clip.py b/dimos/models/embedding/clip.py
index ca1cc2fc30..e751e9ee33 100644
--- a/dimos/models/embedding/clip.py
+++ b/dimos/models/embedding/clip.py
@@ -18,7 +18,7 @@
 from transformers import CLIPModel as HFCLIPModel
 from transformers import CLIPProcessor
 
-from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.msgs.sensor_msgs import Image
 
 _CUDA_INITIALIZED = False
diff --git a/dimos/models/embedding/mobileclip.py b/dimos/models/embedding/mobileclip.py
index f3175f8398..8421d07eac 100644
--- a/dimos/models/embedding/mobileclip.py
+++ b/dimos/models/embedding/mobileclip.py
@@ -19,7 +19,7 @@
 import torch.nn.functional as F
 from PIL import Image as PILImage
 
-from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.msgs.sensor_msgs import Image
 
 
diff --git a/dimos/models/embedding/treid.py b/dimos/models/embedding/treid.py
index 50d69135a0..b56aeab714 100644
--- a/dimos/models/embedding/treid.py
+++ b/dimos/models/embedding/treid.py
@@ -18,7 +18,7 @@
 import torch.nn.functional as F
 from torchreid import utils as torchreid_utils
 
-from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.msgs.sensor_msgs import Image
 
 _CUDA_INITIALIZED = False
diff --git a/dimos/perception/detection/reid/embedding_id_system.py b/dimos/perception/detection/reid/embedding_id_system.py
index 15ee5a44d6..7fb0a2ba40 100644
--- a/dimos/perception/detection/reid/embedding_id_system.py
+++ b/dimos/perception/detection/reid/embedding_id_system.py
@@ -16,7 +16,7 @@
 
 import numpy as np
 
-from dimos.models.embedding.type import Embedding, EmbeddingModel
+from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.perception.detection.reid.type import IDSystem
 from dimos.perception.detection.type import Detection2DBBox
 

From 9dea5ee979e3c9a278136853899b6cf29e467b53 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Wed, 15 Oct 2025 12:33:19 -0700
Subject: [PATCH 36/47] openclip optional, passing tests

---
 .../models/embedding/test_embedding_models.py | 22 +++++++++++++++++--
 .../image_impls/test_image_backends.py        |  5 ++++-
 dimos/perception/detection/module3D.py        |  4 ++++
 dimos/perception/detection/person_tracker.py  |  4 ++++
 dimos/perception/detection/reid/module.py     |  4 ++++
 pyproject.toml                                | 10 ++++++---
 6 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/dimos/models/embedding/test_embedding_models.py b/dimos/models/embedding/test_embedding_models.py
index bb4403d1eb..ee69c7cfd0 100644
--- a/dimos/models/embedding/test_embedding_models.py
+++ b/dimos/models/embedding/test_embedding_models.py
@@ -16,16 +16,34 @@
 import pytest
 
 from dimos.models.embedding.clip import CLIPModel
-from dimos.models.embedding.mobileclip import MobileCLIPModel
 from dimos.models.embedding.treid import TorchReIDModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.utils.data import get_data
 
+# Try to import MobileCLIP, skip if not available
+try:
+    from dimos.models.embedding.mobileclip import MobileCLIPModel
 
-@pytest.fixture(scope="session", params=["mobileclip", "clip", "treid"])
+    HAS_OPENCLIP = True
+except ImportError:
+    HAS_OPENCLIP = False
+    MobileCLIPModel = None
+
+
+def _get_test_params():
+    """Get test parameters based on available packages."""
+    params = ["clip", "treid"]
+    if HAS_OPENCLIP:
+        params.insert(0, "mobileclip")
+    return params
+
+
+@pytest.fixture(scope="session", params=_get_test_params())
 def embedding_model(request):
     """Load embedding model once for all tests. Parametrized for different models."""
     if request.param == "mobileclip":
+        if not HAS_OPENCLIP:
+            pytest.skip("open_clip_torch not installed. Install with: pip install dimos[openclip]")
         model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
         model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
     elif request.param == "clip":
diff --git a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
index 931a30ea5f..a87b9899a9 100644
--- a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
+++ b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
@@ -18,7 +18,7 @@
 import numpy as np
 import pytest
 
-from dimos.msgs.sensor_msgs.Image import Image, ImageFormat, HAS_CUDA
+from dimos.msgs.sensor_msgs.Image import HAS_CUDA, Image, ImageFormat
 from dimos.utils.data import get_data
 
 IMAGE_PATH = get_data("chair-image.png")
@@ -416,6 +416,9 @@ def test_perf_solvepnp(alloc_timer):
         print(f"solvePnP (avg per call) cpu={cpu_t:.6f}s")
 
 
+# this test is failing with
+#  raise RuntimeError("OpenCV CSRT tracker not available")
+@pytest.mark.skip
 def test_perf_tracker(alloc_timer):
     """Test tracker performance with NumpyImage always, add CudaImage when available."""
     # Don't check - just let it fail if CSRT isn't available
diff --git a/dimos/perception/detection/module3D.py b/dimos/perception/detection/module3D.py
index 0d6f57e080..b8fe42da9a 100644
--- a/dimos/perception/detection/module3D.py
+++ b/dimos/perception/detection/module3D.py
@@ -123,6 +123,10 @@ def detection2d_to_3d(args):
 
         self.detection_stream_3d.subscribe(self._publish_detections)
 
+    @rpc
+    def stop(self) -> None:
+        super().stop()
+
     def _publish_detections(self, detections: ImageDetections3DPC):
         if not detections:
             return
diff --git a/dimos/perception/detection/person_tracker.py b/dimos/perception/detection/person_tracker.py
index 04173071e3..fe69fbc15e 100644
--- a/dimos/perception/detection/person_tracker.py
+++ b/dimos/perception/detection/person_tracker.py
@@ -88,6 +88,10 @@ def detections_stream(self) -> Observable[ImageDetections2D]:
     def start(self):
         self.detections_stream().subscribe(self.track)
 
+    @rpc
+    def stop(self):
+        super().stop()
+
     def track(self, detections2D: ImageDetections2D):
         if len(detections2D) == 0:
             return
diff --git a/dimos/perception/detection/reid/module.py b/dimos/perception/detection/reid/module.py
index ac5003a2eb..b3019d90d0 100644
--- a/dimos/perception/detection/reid/module.py
+++ b/dimos/perception/detection/reid/module.py
@@ -66,6 +66,10 @@ def detections_stream(self) -> Observable[ImageDetections2D]:
     def start(self):
         self.detections_stream().subscribe(self.ingress)
 
+    @rpc
+    def stop(self):
+        super().stop()
+
     def ingress(self, imageDetections: ImageDetections2D):
         text_annotations = []
 
diff --git a/pyproject.toml b/pyproject.toml
index 2eab703602..f495e12d2a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -118,7 +118,7 @@ human-cli = "dimos.agents2.cli.human_cli:main"
 
 [project.optional-dependencies]
 manipulation = [
-    
+
     # Contact Graspnet Dependencies
     "h5py>=3.7.0",
     "pyrender>=0.1.45",
@@ -131,15 +131,19 @@ manipulation = [
     "tqdm>=4.65.0",
     "pyyaml>=6.0",
     "contact-graspnet-pytorch @ git+https://github.com/dimensionalOS/contact_graspnet_pytorch.git",
-    
+
     # piper arm
     "piper-sdk",
-    
+
     # Visualization (Optional)
     "kaleido>=0.2.1",
     "plotly>=5.9.0",
 ]
 
+openclip = [
+    "open_clip_torch>=3.0.0",
+]
+
 cpu = [
     # CPU inference backends
     "onnxruntime",

From 268501d6651e483486835150da97703e2d682405 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Wed, 15 Oct 2025 12:35:37 -0700
Subject: [PATCH 37/47] image backend test skip

---
 dimos/msgs/sensor_msgs/image_impls/test_image_backends.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
index a87b9899a9..7d95be7669 100644
--- a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
+++ b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
@@ -416,9 +416,6 @@ def test_perf_solvepnp(alloc_timer):
         print(f"solvePnP (avg per call) cpu={cpu_t:.6f}s")
 
 
-# this test is failing with
-#  raise RuntimeError("OpenCV CSRT tracker not available")
-@pytest.mark.skip
 def test_perf_tracker(alloc_timer):
     """Test tracker performance with NumpyImage always, add CudaImage when available."""
     # Don't check - just let it fail if CSRT isn't available
@@ -464,6 +461,9 @@ def test_perf_tracker(alloc_timer):
         print(f"tracker (avg per call) cpu={cpu_t:.6f}s")
 
 
+# this test is failing with
+#  raise RuntimeError("OpenCV CSRT tracker not available")
+@pytest.mark.skip
 def test_csrt_tracker(alloc_timer):
     """Test CSRT tracker with NumpyImage always, add CudaImage parity when available."""
     # Don't check - just let it fail if CSRT isn't available

From 5fdac3377461a0c1e037d02d5ff2e6383769649b Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Wed, 15 Oct 2025 12:50:17 -0700
Subject: [PATCH 38/47] removing .claude

---
 .../detectors/person/.claude/settings.local.json       | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 dimos/perception/detection/detectors/person/.claude/settings.local.json

diff --git a/dimos/perception/detection/detectors/person/.claude/settings.local.json b/dimos/perception/detection/detectors/person/.claude/settings.local.json
deleted file mode 100644
index 69334f84de..0000000000
--- a/dimos/perception/detection/detectors/person/.claude/settings.local.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(pytest:*)",
-      "Bash(python3:*)"
-    ],
-    "deny": [],
-    "ask": []
-  }
-}

From 07761e1aa95abb360fabf4f786691962cce8f563 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Wed, 15 Oct 2025 14:20:14 -0700
Subject: [PATCH 39/47] tests fix

---
 dimos/conftest.py                                      | 10 ++--------
 .../sensor_msgs/image_impls/test_image_backends.py     |  3 +++
 dimos/perception/detection/conftest.py                 |  9 +++++++--
 .../type/detection3d/test_imageDetections3DPC.py       |  6 +-----
 4 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/dimos/conftest.py b/dimos/conftest.py
index f34255fb49..495afa8a24 100644
--- a/dimos/conftest.py
+++ b/dimos/conftest.py
@@ -33,15 +33,9 @@ def event_loop():
 _skip_for = ["lcm", "heavy", "ros"]
 
 
-@pytest.fixture(scope="session", autouse=True)
-def track_session_threads():
+@pytest.hookimpl()
+def pytest_sessionfinish(session):
     """Track threads that exist at session start - these are not leaks."""
-    # Capture initial threads before any tests run
-    initial = threading.enumerate()
-    with _seen_threads_lock:
-        for t in initial:
-            if t.ident is not None:
-                _session_threads.add(t.ident)
 
     yield
 
diff --git a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
index 7d95be7669..0e19a24167 100644
--- a/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
+++ b/dimos/msgs/sensor_msgs/image_impls/test_image_backends.py
@@ -416,6 +416,9 @@ def test_perf_solvepnp(alloc_timer):
         print(f"solvePnP (avg per call) cpu={cpu_t:.6f}s")
 
 
+# this test is failing with
+#  raise RuntimeError("OpenCV CSRT tracker not available")
+@pytest.mark.skip
 def test_perf_tracker(alloc_timer):
     """Test tracker performance with NumpyImage always, add CudaImage when available."""
     # Don't check - just let it fail if CSRT isn't available
diff --git a/dimos/perception/detection/conftest.py b/dimos/perception/detection/conftest.py
index 73abf489cd..cdd15c1f92 100644
--- a/dimos/perception/detection/conftest.py
+++ b/dimos/perception/detection/conftest.py
@@ -193,10 +193,15 @@ def detection2d(get_moment_2d) -> Detection2D:
 
 
 @pytest.fixture(scope="session")
-def detection3dpc(get_moment_3dpc) -> Detection3DPC:
+def detections3dpc(get_moment_3dpc) -> Detection3DPC:
     moment = get_moment_3dpc(seek=10.0)
     assert len(moment["detections3dpc"]) > 0, "No detections found in the moment"
-    return moment["detections3dpc"][0]
+    return moment["detections3dpc"]
+
+
+@pytest.fixture(scope="session")
+def detection3dpc(detections3dpc) -> Detection3DPC:
+    return detections3dpc[0]
 
 
 @pytest.fixture(scope="session")
diff --git a/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
index fb5608b9ab..31e44dad91 100644
--- a/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
+++ b/dimos/perception/detection/type/detection3d/test_imageDetections3DPC.py
@@ -16,11 +16,7 @@
 
 
 @pytest.mark.skip
-def test_to_foxglove_scene_update(get_moment_3dpc):
-    """Test conversion of ImageDetections3DPC to Foxglove SceneUpdate."""
-    moment = get_moment_3dpc(seek=10.0)
-    detections3dpc = moment["detections3dpc"]
-
+def test_to_foxglove_scene_update(detections3dpc):
     # Convert to scene update
     scene_update = detections3dpc.to_foxglove_scene_update()
 

From 2c5565cce7401d90028758d1c0dced36442ba22e Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Wed, 15 Oct 2025 15:30:37 -0700
Subject: [PATCH 40/47] mobile clip optional

---
 dimos/models/embedding/__init__.py   | 11 ++++++++---
 dimos/models/embedding/mobileclip.py | 14 +++++++++++++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
index a8f3784ca5..587f49576c 100644
--- a/dimos/models/embedding/__init__.py
+++ b/dimos/models/embedding/__init__.py
@@ -1,6 +1,5 @@
 from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
-from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
 from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
 
 __all__ = [
@@ -8,8 +7,14 @@
     "EmbeddingModel",
     "CLIPEmbedding",
     "CLIPModel",
-    "MobileCLIPEmbedding",
-    "MobileCLIPModel",
     "TorchReIDEmbedding",
     "TorchReIDModel",
 ]
+
+# Optional: MobileCLIP (requires open-clip-torch)
+try:
+    from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
+
+    __all__.extend(["MobileCLIPEmbedding", "MobileCLIPModel"])
+except ImportError:
+    pass
diff --git a/dimos/models/embedding/mobileclip.py b/dimos/models/embedding/mobileclip.py
index 8421d07eac..755010d5a7 100644
--- a/dimos/models/embedding/mobileclip.py
+++ b/dimos/models/embedding/mobileclip.py
@@ -14,7 +14,13 @@
 
 from pathlib import Path
 
-import open_clip
+try:
+    import open_clip
+
+    OPEN_CLIP_AVAILABLE = True
+except ImportError:
+    OPEN_CLIP_AVAILABLE = False
+
 import torch
 import torch.nn.functional as F
 from PIL import Image as PILImage
@@ -45,6 +51,12 @@ def __init__(
             device: Device to run on (cuda/cpu), auto-detects if None
             normalize: Whether to L2 normalize embeddings
         """
+        if not OPEN_CLIP_AVAILABLE:
+            raise ImportError(
+                "open_clip is required for MobileCLIPModel. "
+                "Install it with: pip install open-clip-torch"
+            )
+
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.normalize = normalize
 

From 3cd564172274aa08571d0863ed6aed8a78b66853 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Thu, 16 Oct 2025 11:55:03 -0700
Subject: [PATCH 41/47] torch reid import issues fix

---
 dimos/models/embedding/__init__.py            | 12 +++++--
 .../models/embedding/test_embedding_models.py | 16 +++++++--
 dimos/perception/detection/reid/module.py     |  3 +-
 .../perception/detection/reid/test_module.py  | 36 +++++++++----------
 4 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
index 587f49576c..5efe1c8107 100644
--- a/dimos/models/embedding/__init__.py
+++ b/dimos/models/embedding/__init__.py
@@ -1,14 +1,11 @@
 from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
-from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
 
 __all__ = [
     "Embedding",
     "EmbeddingModel",
     "CLIPEmbedding",
     "CLIPModel",
-    "TorchReIDEmbedding",
-    "TorchReIDModel",
 ]
 
 # Optional: MobileCLIP (requires open-clip-torch)
@@ -18,3 +15,12 @@
     __all__.extend(["MobileCLIPEmbedding", "MobileCLIPModel"])
 except ImportError:
     pass
+
+
+# Optional: TorchReid (requires torchreid)
+try:
+    from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
+
+    __all__.extend(["TorchReIDEmbedding", "TorchReIDModel"])
+except ImportError:
+    pass
diff --git a/dimos/models/embedding/test_embedding_models.py b/dimos/models/embedding/test_embedding_models.py
index ee69c7cfd0..0338b8dbe2 100644
--- a/dimos/models/embedding/test_embedding_models.py
+++ b/dimos/models/embedding/test_embedding_models.py
@@ -16,7 +16,6 @@
 import pytest
 
 from dimos.models.embedding.clip import CLIPModel
-from dimos.models.embedding.treid import TorchReIDModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.utils.data import get_data
 
@@ -29,12 +28,23 @@
     HAS_OPENCLIP = False
     MobileCLIPModel = None
 
+# Try to import MobileCLIP, skip if not available
+try:
+    from dimos.models.embedding.treid import TorchReIDModel
+
+    HAS_TORCHREID = True
+except ImportError:
+    HAS_TORCHREID = False
+    TorchReIDModel = None
+
 
 def _get_test_params():
     """Get test parameters based on available packages."""
-    params = ["clip", "treid"]
+    params = ["clip"]
     if HAS_OPENCLIP:
-        params.insert(0, "mobileclip")
+        params.append("mobileclip")
+    if HAS_TORCHREID:
+        params.append("treid")
     return params
 
 
diff --git a/dimos/perception/detection/reid/module.py b/dimos/perception/detection/reid/module.py
index b3019d90d0..cf22d1b573 100644
--- a/dimos/perception/detection/reid/module.py
+++ b/dimos/perception/detection/reid/module.py
@@ -21,7 +21,6 @@
 from reactivex.observable import Observable
 
 from dimos.core import In, Module, ModuleConfig, Out, rpc
-from dimos.models.embedding import TorchReIDModel
 from dimos.msgs.foxglove_msgs.Color import Color
 from dimos.msgs.sensor_msgs import Image
 from dimos.msgs.vision_msgs import Detection2DArray
@@ -46,6 +45,8 @@ class ReidModule(Module):
     def __init__(self, idsystem: IDSystem | None = None, **kwargs):
         super().__init__(**kwargs)
         if idsystem is None:
+            from dimos.models.embedding import TorchReIDModel
+
             idsystem = EmbeddingIDSystem(model=TorchReIDModel, padding=0)
 
         self.idsystem = idsystem
diff --git a/dimos/perception/detection/reid/test_module.py b/dimos/perception/detection/reid/test_module.py
index 9747ce5cbe..71fffa1d8f 100644
--- a/dimos/perception/detection/reid/test_module.py
+++ b/dimos/perception/detection/reid/test_module.py
@@ -11,33 +11,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import time
 
 import pytest
-import torch
 
-from dimos.core import LCMTransport, start
-from dimos.models.embedding import TorchReIDModel
+from dimos.core import LCMTransport
 from dimos.msgs.foxglove_msgs import ImageAnnotations
-from dimos.msgs.sensor_msgs import Image
-from dimos.msgs.vision_msgs import Detection2DArray
-from dimos.perception.detection.reid.module import ReidModule
 from dimos.perception.detection.reid.embedding_id_system import EmbeddingIDSystem
+from dimos.perception.detection.reid.module import ReidModule
+
 
+@pytest.mark.tool
+def test_reid_ingress(imageDetections2d):
+    from dimos.models.embedding import TorchReIDModel
 
-def test_reid_ingress():
     # Create TorchReID-based IDSystem for testing
     reid_model = TorchReIDModel(model_name="osnet_x1_0")
     reid_model.warmup()
-    # idsystem = EmbeddingIDSystem(
-    #     model=lambda: reid_model,
-    #     padding=20,
-    #     similarity_threshold=0.75,
-    # )
+    idsystem = EmbeddingIDSystem(
+        model=lambda: reid_model,
+        padding=20,
+        similarity_threshold=0.75,
+    )
 
-    # reid_module = ReidModule(idsystem=idsystem, warmup=False)
-    # print("Processing detections through ReidModule...")
-    # reid_module.annotations._transport = LCMTransport("/annotations", ImageAnnotations)
-    # reid_module.ingress(imageDetections2d)
-    # reid_module._close_module()
-    # print("✓ ReidModule ingress test completed successfully")
+    reid_module = ReidModule(idsystem=idsystem, warmup=False)
+    print("Processing detections through ReidModule...")
+    reid_module.annotations._transport = LCMTransport("/annotations", ImageAnnotations)
+    reid_module.ingress(imageDetections2d)
+    reid_module._close_module()
+    print("✓ ReidModule ingress test completed successfully")

From a29e1569eee5d9a9bf3d5b5d12b0b28408990132 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Thu, 16 Oct 2025 14:40:33 -0700
Subject: [PATCH 42/47] removing package optionality for now

---
 dimos/models/embedding/__init__.py            | 23 ++++---------
 .../models/embedding/test_embedding_models.py | 34 ++-----------------
 dimos/perception/detection/reid/module.py     | 11 ++++--
 .../perception/detection/reid/test_module.py  |  5 ++-
 pyproject.toml                                | 11 +++---
 5 files changed, 26 insertions(+), 58 deletions(-)

diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
index 5efe1c8107..a8f3784ca5 100644
--- a/dimos/models/embedding/__init__.py
+++ b/dimos/models/embedding/__init__.py
@@ -1,26 +1,15 @@
 from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
+from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
+from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
 
 __all__ = [
     "Embedding",
     "EmbeddingModel",
     "CLIPEmbedding",
     "CLIPModel",
+    "MobileCLIPEmbedding",
+    "MobileCLIPModel",
+    "TorchReIDEmbedding",
+    "TorchReIDModel",
 ]
-
-# Optional: MobileCLIP (requires open-clip-torch)
-try:
-    from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
-
-    __all__.extend(["MobileCLIPEmbedding", "MobileCLIPModel"])
-except ImportError:
-    pass
-
-
-# Optional: TorchReid (requires torchreid)
-try:
-    from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
-
-    __all__.extend(["TorchReIDEmbedding", "TorchReIDModel"])
-except ImportError:
-    pass
diff --git a/dimos/models/embedding/test_embedding_models.py b/dimos/models/embedding/test_embedding_models.py
index 0338b8dbe2..6126138d1c 100644
--- a/dimos/models/embedding/test_embedding_models.py
+++ b/dimos/models/embedding/test_embedding_models.py
@@ -16,44 +16,16 @@
 import pytest
 
 from dimos.models.embedding.clip import CLIPModel
+from dimos.models.embedding.mobileclip import MobileCLIPModel
+from dimos.models.embedding.treid import TorchReIDModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.utils.data import get_data
 
-# Try to import MobileCLIP, skip if not available
-try:
-    from dimos.models.embedding.mobileclip import MobileCLIPModel
 
-    HAS_OPENCLIP = True
-except ImportError:
-    HAS_OPENCLIP = False
-    MobileCLIPModel = None
-
-# Try to import MobileCLIP, skip if not available
-try:
-    from dimos.models.embedding.treid import TorchReIDModel
-
-    HAS_TORCHREID = True
-except ImportError:
-    HAS_TORCHREID = False
-    TorchReIDModel = None
-
-
-def _get_test_params():
-    """Get test parameters based on available packages."""
-    params = ["clip"]
-    if HAS_OPENCLIP:
-        params.append("mobileclip")
-    if HAS_TORCHREID:
-        params.append("treid")
-    return params
-
-
-@pytest.fixture(scope="session", params=_get_test_params())
+@pytest.fixture(scope="session", params=["clip", "mobileclip", "treid"])
 def embedding_model(request):
     """Load embedding model once for all tests. Parametrized for different models."""
     if request.param == "mobileclip":
-        if not HAS_OPENCLIP:
-            pytest.skip("open_clip_torch not installed. Install with: pip install dimos[openclip]")
         model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
         model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
     elif request.param == "clip":
diff --git a/dimos/perception/detection/reid/module.py b/dimos/perception/detection/reid/module.py
index cf22d1b573..64769b1038 100644
--- a/dimos/perception/detection/reid/module.py
+++ b/dimos/perception/detection/reid/module.py
@@ -45,9 +45,14 @@ class ReidModule(Module):
     def __init__(self, idsystem: IDSystem | None = None, **kwargs):
         super().__init__(**kwargs)
         if idsystem is None:
-            from dimos.models.embedding import TorchReIDModel
-
-            idsystem = EmbeddingIDSystem(model=TorchReIDModel, padding=0)
+            try:
+                from dimos.models.embedding import TorchReIDModel
+
+                idsystem = EmbeddingIDSystem(model=TorchReIDModel, padding=0)
+            except Exception as e:
+                raise RuntimeError(
+                    "TorchReIDModel not available. Please install with: pip install dimos[torchreid]"
+                ) from e
 
         self.idsystem = idsystem
 
diff --git a/dimos/perception/detection/reid/test_module.py b/dimos/perception/detection/reid/test_module.py
index 71fffa1d8f..6c977e13a5 100644
--- a/dimos/perception/detection/reid/test_module.py
+++ b/dimos/perception/detection/reid/test_module.py
@@ -22,7 +22,10 @@
 
 @pytest.mark.tool
 def test_reid_ingress(imageDetections2d):
-    from dimos.models.embedding import TorchReIDModel
+    try:
+        from dimos.models.embedding import TorchReIDModel
+    except Exception:
+        pytest.skip("TorchReIDModel not available")
 
     # Create TorchReID-based IDSystem for testing
     reid_model = TorchReIDModel(model_name="osnet_x1_0")
diff --git a/pyproject.toml b/pyproject.toml
index f495e12d2a..7a71035d27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,8 +80,7 @@ dependencies = [
     "transformers[torch]==4.49.0",
     
     # Vector Embedding
-    "sentence_transformers",
-    
+    "sentence_transformers",    
     
     # Perception Dependencies
     "ultralytics>=8.3.70",
@@ -99,7 +98,6 @@ dependencies = [
     "googlemaps>=4.10.0",
 
     # Inference
-
     "onnx",
 
     # Multiprocess 
@@ -140,9 +138,6 @@ manipulation = [
     "plotly>=5.9.0",
 ]
 
-openclip = [
-    "open_clip_torch>=3.0.0",
-]
 
 cpu = [
     # CPU inference backends
@@ -169,6 +164,10 @@ cuda = [
     "nltk",
     "clip @ git+https://github.com/openai/CLIP.git",
     "detectron2 @ git+https://github.com/facebookresearch/detectron2.git@v0.6",
+
+    # embedding models
+    "open_clip_torch>=3.0.0",
+    "torchreid==0.2.5",
 ]
 
 dev = [

From 338693f69e2a2c5fdb5284734a6610aa3de98e3c Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Thu, 16 Oct 2025 15:57:56 -0700
Subject: [PATCH 43/47] embedding models heavy tests import fix

---
 .../models/embedding/test_embedding_models.py | 33 ++++++++++---------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/dimos/models/embedding/test_embedding_models.py b/dimos/models/embedding/test_embedding_models.py
index 6126138d1c..52e9fd08af 100644
--- a/dimos/models/embedding/test_embedding_models.py
+++ b/dimos/models/embedding/test_embedding_models.py
@@ -15,9 +15,6 @@
 import numpy as np
 import pytest
 
-from dimos.models.embedding.clip import CLIPModel
-from dimos.models.embedding.mobileclip import MobileCLIPModel
-from dimos.models.embedding.treid import TorchReIDModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.utils.data import get_data
 
@@ -26,11 +23,17 @@
 def embedding_model(request):
     """Load embedding model once for all tests. Parametrized for different models."""
     if request.param == "mobileclip":
+        from dimos.models.embedding.mobileclip import MobileCLIPModel
+
         model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
         model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
     elif request.param == "clip":
+        from dimos.models.embedding.clip import CLIPModel
+
         model = CLIPModel(model_name="openai/clip-vit-base-patch32")
     elif request.param == "treid":
+        from dimos.models.embedding.treid import TorchReIDModel
+
         model = TorchReIDModel(model_name="osnet_x1_0")
     else:
         raise ValueError(f"Unknown model: {request.param}")
@@ -93,8 +96,8 @@ def test_single_text_embedding(embedding_model):
     """Test embedding a single text string."""
     import torch
 
-    if isinstance(embedding_model, TorchReIDModel):
-        pytest.skip("TorchReID does not support text embeddings")
+    if not hasattr(embedding_model, "embed_text"):
+        pytest.skip("Model does not support text embeddings")
 
     embedding = embedding_model.embed_text("a cafe")
 
@@ -118,8 +121,8 @@ def test_batch_text_embedding(embedding_model):
     """Test embedding multiple text strings at once."""
     import torch
 
-    if isinstance(embedding_model, TorchReIDModel):
-        pytest.skip("TorchReID does not support text embeddings")
+    if not hasattr(embedding_model, "embed_text"):
+        pytest.skip("Model does not support text embeddings")
 
     embeddings = embedding_model.embed_text("a cafe", "a person", "a dog")
 
@@ -136,8 +139,8 @@ def test_batch_text_embedding(embedding_model):
 @pytest.mark.heavy
 def test_text_image_similarity(embedding_model, test_image):
     """Test cross-modal text-image similarity using @ operator."""
-    if isinstance(embedding_model, TorchReIDModel):
-        pytest.skip("TorchReID does not support text embeddings")
+    if not hasattr(embedding_model, "embed_text"):
+        pytest.skip("Model does not support text embeddings")
 
     img_embedding = embedding_model.embed(test_image)
 
@@ -179,8 +182,8 @@ def test_cosine_distance(embedding_model, test_image):
 @pytest.mark.heavy
 def test_query_functionality(embedding_model, test_image):
     """Test query method for top-k retrieval."""
-    if isinstance(embedding_model, TorchReIDModel):
-        pytest.skip("TorchReID does not support text embeddings")
+    if not hasattr(embedding_model, "embed_text"):
+        pytest.skip("Model does not support text embeddings")
 
     # Create a query and some candidates
     query_text = embedding_model.embed_text("a cafe")
@@ -254,8 +257,8 @@ def test_compare_many_to_many(embedding_model):
     """Test GPU-accelerated many-to-many comparison."""
     import torch
 
-    if isinstance(embedding_model, TorchReIDModel):
-        pytest.skip("TorchReID does not support text embeddings")
+    if not hasattr(embedding_model, "embed_text"):
+        pytest.skip("Model does not support text embeddings")
 
     # Create queries and candidates
     queries = embedding_model.embed_text("a cafe", "a person")
@@ -367,8 +370,8 @@ def test_embedding_performance(embedding_model):
     assert all(e.vector is not None for e in batch_embeddings)
 
     # Sanity check: verify embeddings are meaningful by testing text-image similarity
-    # Skip for TorchReID since it doesn't support text embeddings
-    if not isinstance(embedding_model, TorchReIDModel):
+    # Skip for models that don't support text embeddings
+    if hasattr(embedding_model, "embed_text"):
         print("\n" + "=" * 60)
         print("Sanity Check: Text-Image Similarity on First Frame")
         print("=" * 60)

From 8d5c0ae50068537f07591f1e4b97bf57a78ed2cb Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Thu, 16 Oct 2025 16:03:06 -0700
Subject: [PATCH 44/47] resolved import issues

---
 dimos/models/embedding/__init__.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
index a8f3784ca5..f286dfe27b 100644
--- a/dimos/models/embedding/__init__.py
+++ b/dimos/models/embedding/__init__.py
@@ -1,7 +1,6 @@
 from dimos.models.embedding.base import Embedding, EmbeddingModel
 from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
 from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
-from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
 
 __all__ = [
     "Embedding",
@@ -10,6 +9,12 @@
     "CLIPModel",
     "MobileCLIPEmbedding",
     "MobileCLIPModel",
-    "TorchReIDEmbedding",
-    "TorchReIDModel",
 ]
+
+# Optional: TorchReID support (requires torchreid package)
+try:
+    from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
+
+    __all__.extend(["TorchReIDEmbedding", "TorchReIDModel"])
+except ImportError:
+    pass

From c4ebc93100c9de8d114feba5677a1bfb98f38237 Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Thu, 16 Oct 2025 16:08:36 -0700
Subject: [PATCH 45/47] unified import resolution strategy

---
 dimos/models/embedding/__init__.py            | 24 +++++++++++++------
 dimos/models/embedding/mobileclip.py          |  8 +------
 dimos/models/embedding/treid.py               |  5 ++++
 .../reid/test_embedding_id_system.py          |  3 ++-
 4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/dimos/models/embedding/__init__.py b/dimos/models/embedding/__init__.py
index f286dfe27b..981e25e5c2 100644
--- a/dimos/models/embedding/__init__.py
+++ b/dimos/models/embedding/__init__.py
@@ -1,17 +1,27 @@
 from dimos.models.embedding.base import Embedding, EmbeddingModel
-from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
-from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
 
 __all__ = [
     "Embedding",
     "EmbeddingModel",
-    "CLIPEmbedding",
-    "CLIPModel",
-    "MobileCLIPEmbedding",
-    "MobileCLIPModel",
 ]
 
-# Optional: TorchReID support (requires torchreid package)
+# Optional: CLIP support
+try:
+    from dimos.models.embedding.clip import CLIPEmbedding, CLIPModel
+
+    __all__.extend(["CLIPEmbedding", "CLIPModel"])
+except ImportError:
+    pass
+
+# Optional: MobileCLIP support
+try:
+    from dimos.models.embedding.mobileclip import MobileCLIPEmbedding, MobileCLIPModel
+
+    __all__.extend(["MobileCLIPEmbedding", "MobileCLIPModel"])
+except ImportError:
+    pass
+
+# Optional: TorchReID support
 try:
     from dimos.models.embedding.treid import TorchReIDEmbedding, TorchReIDModel
 
diff --git a/dimos/models/embedding/mobileclip.py b/dimos/models/embedding/mobileclip.py
index 755010d5a7..c0295a78ef 100644
--- a/dimos/models/embedding/mobileclip.py
+++ b/dimos/models/embedding/mobileclip.py
@@ -14,13 +14,7 @@
 
 from pathlib import Path
 
-try:
-    import open_clip
-
-    OPEN_CLIP_AVAILABLE = True
-except ImportError:
-    OPEN_CLIP_AVAILABLE = False
-
+import open_clip
 import torch
 import torch.nn.functional as F
 from PIL import Image as PILImage
diff --git a/dimos/models/embedding/treid.py b/dimos/models/embedding/treid.py
index b56aeab714..bdd00627a0 100644
--- a/dimos/models/embedding/treid.py
+++ b/dimos/models/embedding/treid.py
@@ -46,6 +46,11 @@ def __init__(
             device: Device to run on (cuda/cpu), auto-detects if None
             normalize: Whether to L2 normalize embeddings
         """
+        if not TORCHREID_AVAILABLE:
+            raise ImportError(
+                "torchreid is required for TorchReIDModel. Install it with: pip install torchreid"
+            )
+
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.normalize = normalize
 
diff --git a/dimos/perception/detection/reid/test_embedding_id_system.py b/dimos/perception/detection/reid/test_embedding_id_system.py
index 2aa54ee2ee..6a7df7d575 100644
--- a/dimos/perception/detection/reid/test_embedding_id_system.py
+++ b/dimos/perception/detection/reid/test_embedding_id_system.py
@@ -15,7 +15,6 @@
 import pytest
 import torch
 
-from dimos.models.embedding.mobileclip import MobileCLIPModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.reid.embedding_id_system import EmbeddingIDSystem
 from dimos.utils.data import get_data
@@ -24,6 +23,8 @@
 @pytest.fixture(scope="session")
 def mobileclip_model():
     """Load MobileCLIP model once for all tests."""
+    from dimos.models.embedding.mobileclip import MobileCLIPModel
+
     model_path = get_data("models_mobileclip") / "mobileclip2_s0.pt"
     model = MobileCLIPModel(model_name="MobileCLIP2-S0", model_path=model_path)
     model.warmup()

From 0aa462c391d1a4785c73af40c565a27dd431ad3e Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Thu, 16 Oct 2025 18:01:15 -0700
Subject: [PATCH 46/47] disabled embedding tests for now

---
 ...est_embedding_models.py => embedding_models_disabled_tests.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename dimos/models/embedding/{test_embedding_models.py => embedding_models_disabled_tests.py} (100%)

diff --git a/dimos/models/embedding/test_embedding_models.py b/dimos/models/embedding/embedding_models_disabled_tests.py
similarity index 100%
rename from dimos/models/embedding/test_embedding_models.py
rename to dimos/models/embedding/embedding_models_disabled_tests.py

From 4ca85ecf5e3402fd99e42aaac727246e6fdc0c3c Mon Sep 17 00:00:00 2001
From: lesh <lesh@sysphere.org>
Date: Thu, 16 Oct 2025 19:45:58 -0700
Subject: [PATCH 47/47] marking tests as gpu, not heavy

---
 dimos/models/vl/test_models.py                |  2 +-
 .../reid/test_embedding_id_system.py          | 24 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/dimos/models/vl/test_models.py b/dimos/models/vl/test_models.py
index 66c6a2326a..3871626ae1 100644
--- a/dimos/models/vl/test_models.py
+++ b/dimos/models/vl/test_models.py
@@ -21,7 +21,7 @@
     ],
     ids=["moondream", "qwen"],
 )
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_vlm(model_class, model_name):
     image = Image.from_file(get_data("cafe.jpg")).to_rgb()
 
diff --git a/dimos/perception/detection/reid/test_embedding_id_system.py b/dimos/perception/detection/reid/test_embedding_id_system.py
index 6a7df7d575..b2bc84bc55 100644
--- a/dimos/perception/detection/reid/test_embedding_id_system.py
+++ b/dimos/perception/detection/reid/test_embedding_id_system.py
@@ -43,7 +43,7 @@ def test_image():
     return Image.from_file(get_data("cafe.jpg")).to_rgb()
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_update_embedding_single(track_associator, mobileclip_model, test_image):
     """Test updating embedding for a single track."""
     embedding = mobileclip_model.embed(test_image)
@@ -62,7 +62,7 @@ def test_update_embedding_single(track_associator, mobileclip_model, test_image)
     assert abs(norm - 1.0) < 0.01, "Embedding should be normalized"
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_update_embedding_running_average(track_associator, mobileclip_model, test_image):
     """Test running average of embeddings."""
     embedding1 = mobileclip_model.embed(test_image)
@@ -87,7 +87,7 @@ def test_update_embedding_running_average(track_associator, mobileclip_model, te
     assert similarity1 > 0.99, "Average should be very similar to original"
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_negative_constraints(track_associator):
     """Test negative constraint recording."""
     # Simulate frame with 3 tracks
@@ -103,7 +103,7 @@ def test_negative_constraints(track_associator):
     assert 2 in track_associator.negative_pairs[3]
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_associate_new_track(track_associator, mobileclip_model, test_image):
     """Test associating a new track creates new long_term_id."""
     embedding = mobileclip_model.embed(test_image)
@@ -117,7 +117,7 @@ def test_associate_new_track(track_associator, mobileclip_model, test_image):
     assert track_associator.long_term_counter == 1
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_associate_similar_tracks(track_associator, mobileclip_model, test_image):
     """Test associating similar tracks to same long_term_id."""
     # Create embeddings from same image (should be very similar)
@@ -137,7 +137,7 @@ def test_associate_similar_tracks(track_associator, mobileclip_model, test_image
     assert track_associator.long_term_counter == 1, "Only one long_term_id should be created"
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_associate_with_negative_constraint(track_associator, mobileclip_model, test_image):
     """Test that negative constraints prevent association."""
     # Create similar embeddings
@@ -162,7 +162,7 @@ def test_associate_with_negative_constraint(track_associator, mobileclip_model,
     assert track_associator.long_term_counter == 2, "Two long_term_ids should be created"
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_associate_different_objects(track_associator, mobileclip_model, test_image):
     """Test that dissimilar embeddings get different long_term_ids."""
     # Create embeddings for image and text (very different)
@@ -182,7 +182,7 @@ def test_associate_different_objects(track_associator, mobileclip_model, test_im
     assert track_associator.long_term_counter == 2
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_associate_returns_cached(track_associator, mobileclip_model, test_image):
     """Test that repeated calls return same long_term_id."""
     embedding = mobileclip_model.embed(test_image)
@@ -198,14 +198,14 @@ def test_associate_returns_cached(track_associator, mobileclip_model, test_image
     assert track_associator.long_term_counter == 1, "Should not create new ID"
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_associate_not_ready(track_associator):
     """Test that associate returns -1 for track without embedding."""
     long_term_id = track_associator.associate(track_id=999)
     assert long_term_id == -1, "Should return -1 for track without embedding"
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_gpu_performance(track_associator, mobileclip_model, test_image):
     """Test that embeddings stay on GPU for performance."""
     embedding = mobileclip_model.embed(test_image)
@@ -226,7 +226,7 @@ def test_gpu_performance(track_associator, mobileclip_model, test_image):
     assert avg_vec.device.type == torch.device(expected_device).type
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_similarity_threshold_configurable(mobileclip_model):
     """Test that similarity threshold is configurable."""
     associator_strict = EmbeddingIDSystem(model=lambda: mobileclip_model, similarity_threshold=0.95)
@@ -236,7 +236,7 @@ def test_similarity_threshold_configurable(mobileclip_model):
     assert associator_loose.similarity_threshold == 0.50
 
 
-@pytest.mark.heavy
+@pytest.mark.gpu
 def test_multi_track_scenario(track_associator, mobileclip_model, test_image):
     """Test realistic scenario with multiple tracks across frames."""
     # Frame 1: Track 1 appears