dimensionalOS · spomichter · Jul 9, 2025 · Jul 1, 2025 · Jul 3, 2025 · Jul 4, 2025
diff --git a/bin/cuda/fix_ort.sh b/bin/cuda/fix_ort.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Resolve the onnxruntime <--> onnxruntime-gpu package clash that occurs when
+# chromadb and other dependencies require the CPU-only onnxruntime package:
+# remove onnxruntime (if installed) and force-reinstall onnxruntime-gpu.
+# Set GPU_VER to choose the onnxruntime-gpu version (default: 1.18.1).
+set -euo pipefail
+
+# Exported so the Python below reads it from the environment; the heredoc
+# delimiter is quoted so the shell never interpolates into the Python source.
+export GPU_VER="${GPU_VER:-1.18.1}"
+
+python - <<'PY'
+import importlib.metadata as md
+import os
+import subprocess
+import sys
+
+gpu_ver = os.environ["GPU_VER"]
+
+def has_dist(name):
+    """Return True if a distribution named *name* is installed."""
+    try:
+        md.version(name)
+    except md.PackageNotFoundError:
+        return False
+    return True
+
+if has_dist("onnxruntime"):
+    print("Removing CPU-only onnxruntime wheel …")
+    subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "onnxruntime"])
+
+print(f"Reinstalling onnxruntime-gpu=={gpu_ver} …")
+subprocess.check_call([
+    sys.executable, "-m", "pip", "install",
+    "--no-deps", "--force-reinstall", f"onnxruntime-gpu=={gpu_ver}"
+])
+PY
diff --git a/dimos/perception/detection2d/yolo_2d_det.py b/dimos/perception/detection2d/yolo_2d_det.py
@@ -47,7 +47,8 @@ def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device="
         module_dir = os.path.dirname(__file__)
         self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml")
         if is_cuda_available():
-            onnxruntime.preload_dlls(cuda=True, cudnn=True)
+            if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
+                onnxruntime.preload_dlls(cuda=True, cudnn=True)
             self.device = "cuda"
             logger.info("Using CUDA for YOLO 2d detector")
         else:

diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py
@@ -19,7 +19,14 @@
 from reactivex import operators as ops
 
 from dimos.perception.detection2d.yolo_2d_det import Yolo2DDetector
-from dimos.perception.detection2d.detic_2d_det import Detic2DDetector
+
+try:
+    from dimos.perception.detection2d.detic_2d_det import Detic2DDetector
+
+    DETIC_AVAILABLE = True
+except ImportError:  # Detic is optional; ModuleNotFoundError is an ImportError subclass
+    DETIC_AVAILABLE = False
+    Detic2DDetector = None
 from dimos.models.depth.metric3d import Metric3D
 from dimos.perception.detection2d.utils import (
     calculate_depth_from_bbox,
@@ -83,25 +90,47 @@ def __init__(
         self.disable_depth = disable_depth
         self.draw_masks = draw_masks
         # Initialize object detector
-        self.detector = detector or Detic2DDetector(vocabulary=None, threshold=min_confidence)
+        if detector is not None:
+            self.detector = detector
+        else:
+            if DETIC_AVAILABLE:
+                try:
+                    self.detector = Detic2DDetector(vocabulary=None, threshold=min_confidence)
+                    logger.info("Using Detic2DDetector")
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to initialize Detic2DDetector: {e}. Falling back to Yolo2DDetector."
+                    )
+                    self.detector = Yolo2DDetector()
+            else:
+                logger.info("Detic not available. Using Yolo2DDetector.")
+                self.detector = Yolo2DDetector()
         # Set up camera intrinsics
         self.camera_intrinsics = camera_intrinsics
 
         # Initialize depth estimation model
         self.depth_model = None
         if not disable_depth:
-            self.depth_model = Metric3D(gt_depth_scale)
+            try:
+                self.depth_model = Metric3D(gt_depth_scale)
 
-            if camera_intrinsics is not None:
-                self.depth_model.update_intrinsic(camera_intrinsics)
+                if camera_intrinsics is not None:
+                    self.depth_model.update_intrinsic(camera_intrinsics)
 
-                # Create 3x3 camera matrix for calculations
-                fx, fy, cx, cy = camera_intrinsics
-                self.camera_matrix = np.array(
-                    [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32
-                )
-            else:
-                raise ValueError("camera_intrinsics must be provided")
+                    # Create 3x3 camera matrix for calculations
+                    fx, fy, cx, cy = camera_intrinsics
+                    self.camera_matrix = np.array(
+                        [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32
+                    )
+                else:
+                    raise ValueError("camera_intrinsics must be provided")
+
+                logger.info("Depth estimation enabled with Metric3D")
+            except Exception as e:
+                logger.warning(f"Failed to initialize Metric3D depth model: {e}")
+                logger.warning("Falling back to disable_depth=True mode")
+                self.disable_depth = True
+                self.depth_model = None
         else:
             logger.info("Depth estimation disabled")
 
@@ -123,9 +152,15 @@ def create_stream(self, video_stream: Observable) -> Observable:
         """
 
         def process_frame(frame):
-            # Detect objects
-            bboxes, track_ids, class_ids, confidences, names, masks = self.detector.process_image(
-                frame
+            # TODO: More modular detector output interface
+            bboxes, track_ids, class_ids, confidences, names, *mask_data = (
+                self.detector.process_image(frame) + ([],)
+            )
+
+            masks = (
+                mask_data[0]
+                if mask_data and len(mask_data[0]) == len(bboxes)
+                else [None] * len(bboxes)
             )
 
             # Create visualization

diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py
@@ -51,7 +51,8 @@ def __init__(
         self.device = device
         if is_cuda_available():
             logger.info("Using CUDA for SAM 2d segmenter")
-            onnxruntime.preload_dlls(cuda=True, cudnn=True)
+            if hasattr(onnxruntime, "preload_dlls"):  # Handles CUDA 11 / onnxruntime-gpu<=1.18
+                onnxruntime.preload_dlls(cuda=True, cudnn=True)
             self.device = "cuda"
         else:
             logger.info("Using CPU for SAM 2d segmenter")

diff --git a/dimos/robot/unitree_webrtc/type/lidar.py b/dimos/robot/unitree_webrtc/type/lidar.py
@@ -53,7 +53,7 @@ class LidarMessage(PointCloud2):
     resolution: float  # we lose resolution when encoding PointCloud2
     origin: Vector3
     raw_msg: Optional[RawLidarMsg]
-    _costmap: Optional[Costmap]
+    _costmap: Optional[Costmap] = None
 
     def __init__(self, **kwargs):
         super().__init__(

diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile
@@ -23,8 +23,9 @@ RUN apt-get update && apt-get install -y \
 # Configure git to trust any directory (resolves dubious ownership issues in containers)
 RUN git config --global --add safe.directory '*'
 
-COPY docker/dev/dev-requirements.txt /app/
-RUN --mount=type=cache,target=/root/.cache/pip pip install -r dev-requirements.txt
+COPY . /app/
+WORKDIR /app
+RUN --mount=type=cache,target=/root/.cache/pip pip install '.[dev]'
 
 # Copy files and add version to motd
 COPY /assets/dimensionalascii.txt /etc/motd

diff --git a/docker/dev/dev-requirements.txt b/docker/dev/dev-requirements.txt
diff --git a/docker/python/Dockerfile b/docker/python/Dockerfile
@@ -31,13 +31,13 @@ RUN apt-get install -y \
     qtbase5-dev-tools \
     supervisor
 
+# Fix distutils-installed packages that block pip upgrades
+RUN apt-get purge -y python3-blinker python3-sympy python3-oauthlib || true
+
 RUN mkdir -p /app/dimos
 
-COPY requirements.txt /app/
-COPY base-requirements.txt /app/
+COPY . /app/
 
 WORKDIR /app
 
-RUN --mount=type=cache,target=/root/.cache/pip pip install -r base-requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip bash -c "pip install --upgrade 'pip>=24' 'setuptools>=70' 'wheel' 'packaging>=24' && pip install '.[cpu]'"
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=42", "wheel"]
+requires = ["setuptools>=70", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools]
@@ -30,15 +30,13 @@ dependencies = [
     "lark",
     "plum-dispatch==2.5.7",
     "ffmpeg-python",
-    "pytest",
     "tiktoken>=0.8.0",
     "Flask>=2.2",
     "python-multipart==0.0.20",
     "reactivex",
     "rxpy-backpressure @ git+https://github.com/dimensionalOS/rxpy-backpressure.git",
-    "pytest-asyncio==0.26.0",
     "asyncio==3.4.3",
-    "go2-webrtc-connect @ git+https://github.com/legion1581/go2_webrtc_connect.git@fe64abb5987594e8c048427a98445799f6f6a9cc",
+    "go2-webrtc-connect @ git+https://github.com/dimensionalOS/go2_webrtc_connect.git",
 
     # Web Extensions
     "fastapi>=0.115.6",
@@ -73,34 +71,27 @@ dependencies = [
     # Vector Embedding
     "sentence_transformers",
 
-    # CTransforms GGUF
-    "ctransformers[cuda]==0.2.27",
 
     # Perception Dependencies
     "ultralytics>=8.3.70",
     "filterpy>=1.4.5",
     "scipy>=1.15.1",
     "scikit-learn",
     "Pillow",
-    "mmengine>=0.10.3",
-    "mmcv>=2.1.0",
+    "clip @ git+https://github.com/openai/CLIP.git",
     "timm>=1.0.15",
     "lap>=0.5.12",
-    "xformers==0.0.20",
-
-    # Detic
-    "mss",
-    "dataclasses",
-    "ftfy",
-    "regex",
-    "fasttext",
-    "lvis",
-    "nltk",
-    "clip @ git+https://github.com/openai/CLIP.git",
-    "detectron2 @ git+https://github.com/facebookresearch/detectron2.git@v0.6",
-
+
     # Mapping
     "open3d",
+
+    # Inference
+
+    "onnx",
+
+    # Multiprocessing
+    "dask[complete]==2025.5.1",
+    "lcm_msgs @ git+https://github.com/dimensionalOS/python_lcm_msgs.git@main#egg=lcm_msgs"
 ]
 
 [project.optional-dependencies]
@@ -124,9 +115,38 @@ manipulation = [
     "plotly>=5.9.0",
 ]
 
+cpu = [
+    # CPU inference backends
+    "onnxruntime",
+    "ctransformers==0.2.27",
+]
+
 cuda = [
     "pycuda",
-    "onnxruntime-gpu[cuda,cudnn]"
+    "onnxruntime-gpu>=1.17.1,<=1.18.1", # Only versions supporting both cuda11 and cuda12
+    "ctransformers[cuda]==0.2.27",
+    "mmengine>=0.10.3",
+    "mmcv>=2.1.0",
+    "xformers==0.0.20",
+
+    # Detic GPU stack
+    "mss",
+    "dataclasses",
+    "ftfy",
+    "regex",
+    "fasttext",
+    "lvis",
+    "nltk",
+    "clip @ git+https://github.com/openai/CLIP.git",
+    "detectron2 @ git+https://github.com/facebookresearch/detectron2.git@v0.6",
+]
+
+dev = [
+    "ruff==0.11.10",
+    "mypy==1.15.0",
+    "pre_commit==4.2.0",
+    "pytest",
+    "pytest-asyncio==0.26.0"
 ]
 
 [tool.ruff]
@@ -160,7 +180,6 @@ files = [
 
 [tool.pytest.ini_options]
 testpaths = ["dimos"]
-norecursedirs = ["dimos/robot/unitree/external"]
 markers = [
     "vis: marks tests that run visuals and require a visual check by dev",
     "benchmark: benchmark, executes something multiple times, calculates avg, prints to console",

diff --git a/requirements.txt b/requirements.txt
@@ -24,9 +24,7 @@ reactivex
 git+https://github.com/dimensionalOS/rxpy-backpressure.git
 pytest-asyncio==0.26.0
 asyncio==3.4.3
--e git+https://github.com/legion1581/go2_webrtc_connect.git@fe64abb5987594e8c048427a98445799f6f6a9cc#egg=go2_webrtc_connect
-#-e git+https://github.com/legion1581/aioice.git@ff5755a1e37127411b5fc797c105804db8437445#egg=aioice
-
+-e git+https://github.com/dimensionalOS/go2_webrtc_connect.git#egg=go2_webrtc_connect
 # Web Extensions
 fastapi>=0.115.6
 sse-starlette>=2.2.1
@@ -42,15 +40,6 @@ pydantic
 # Developer Specific
 ipykernel
 
-# Unitree webrtc streaming
-aiortc==1.9.0
-pycryptodome
-opencv-python
-sounddevice
-pyaudio
-requests
-wasmtime
-
 # Audio
 openai-whisper
 soundfile
@@ -97,6 +86,7 @@ open3d
 
 # Inference (CPU)
 onnxruntime
+onnx
 
 # Terminal colors
 rich==14.0.0

diff --git a/tests/test_object_detection_stream.py b/tests/test_object_detection_stream.py
@@ -27,8 +27,6 @@
 from dimos.perception.object_detection_stream import ObjectDetectionStream
 from dimos.types.vector import Vector
 from dimos.utils.reactive import backpressure
-from dimos.perception.detection2d.detic_2d_det import Detic2DDetector
-
 from dotenv import load_dotenv
 
 
@@ -103,9 +101,6 @@ def main():
     class_filter = None  # No class filtering
     web_port = 5555
 
-    # Initialize detector
-    detector = Detic2DDetector(vocabulary=None, threshold=min_confidence)
-
     # Initialize based on mode
     if args.mode == "robot":
         print("Initializing in robot mode...")
@@ -166,9 +161,9 @@ def main():
             camera_intrinsics=camera_intrinsics,
             min_confidence=min_confidence,
             class_filter=class_filter,
-            detector=detector,
             video_stream=video_stream,
             disable_depth=False,
+            draw_masks=True,
         )
 
         # Set placeholder robot for cleanup