diff --git a/.gitattributes b/.gitattributes
index feb56d98ed..e808d54903 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,17 +1,16 @@
 # Handle line endings automatically for files Git considers text,
 # converting them to LF on checkout.
 * text=auto eol=lf
-
 # Ensure Python files always use LF for line endings.
 *.py text eol=lf
-
 # Treat designated file types as binary and do not alter their contents or line endings.
 *.png binary
 *.jpg binary
-*.gif binary
 *.ico binary
 *.pdf binary
-*.mp4 binary
-
 # Explicit LFS tracking for test files
 tests/data/.lfs/*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text binary
+*.mp4 filter=lfs diff=lfs merge=lfs -text binary
+*.mov filter=lfs diff=lfs merge=lfs -text binary
+*.gif filter=lfs diff=lfs merge=lfs -text binary
diff --git a/assets/dimos_interface.gif b/assets/dimos_interface.gif
index c42fe4e903..e610a2b390 100644
Binary files a/assets/dimos_interface.gif and b/assets/dimos_interface.gif differ
diff --git a/assets/framecount.mp4 b/assets/framecount.mp4
index 74b0b2322b..759ee6ab27 100644
Binary files a/assets/framecount.mp4 and b/assets/framecount.mp4 differ
diff --git a/assets/simple_demo.mp4 b/assets/simple_demo.mp4
index 086a636383..cb8a635e78 100644
Binary files a/assets/simple_demo.mp4 and b/assets/simple_demo.mp4 differ
diff --git a/assets/simple_demo_small.gif b/assets/simple_demo_small.gif
index 294a2a0879..3c2cf54ef4 100644
Binary files a/assets/simple_demo_small.gif and b/assets/simple_demo_small.gif differ
diff --git a/assets/trimmed_video_office.mov b/assets/trimmed_video_office.mov
index 278582f74e..a3072be8fc 100644
Binary files a/assets/trimmed_video_office.mov and b/assets/trimmed_video_office.mov differ
diff --git a/dimos/agents/memory/image_embedding.py b/dimos/agents/memory/image_embedding.py
index 8bcd225d85..265d15892f 100644
--- a/dimos/agents/memory/image_embedding.py
+++ b/dimos/agents/memory/image_embedding.py
@@ -27,6 +27,7 @@
 import cv2
 import base64
 from dimos.utils.logging_config import setup_logger
+from dimos.utils.testing import testData
 
 logger = setup_logger("dimos.agents.memory.image_embedding")
 
@@ -60,12 +61,14 @@ def _initialize_model(self):
         """Initialize the specified embedding model."""
         try:
             import torch
-            from transformers import CLIPProcessor, CLIPModel, AutoFeatureExtractor, AutoModel
+            from transformers import CLIPProcessor, AutoFeatureExtractor, AutoModel
+            import onnxruntime as ort
 
             if self.model_name == "clip":
-                model_id = "openai/clip-vit-base-patch32"
-                self.model = CLIPModel.from_pretrained(model_id)
-                self.processor = CLIPProcessor.from_pretrained(model_id)
+                model_id = testData("models_clip") / "model.onnx"
+                processor_id = "openai/clip-vit-base-patch32"
+                self.model = ort.InferenceSession(model_id)
+                self.processor = CLIPProcessor.from_pretrained(processor_id)
                 logger.info(f"Loaded CLIP model: {model_id}")
             elif self.model_name == "resnet":
                 model_id = "microsoft/resnet-50"
@@ -103,13 +106,35 @@ def get_embedding(self, image: Union[np.ndarray, str, bytes]) -> np.ndarray:
             import torch
 
             if self.model_name == "clip":
-                inputs = self.processor(images=pil_image, return_tensors="pt")
+                inputs = self.processor(images=pil_image, return_tensors="np")
 
                 with torch.no_grad():
-                    image_features = self.model.get_image_features(**inputs)
-
-                image_embedding = image_features / image_features.norm(dim=1, keepdim=True)
-                embedding = image_embedding.numpy()[0]
+                    ort_inputs = {
+                        inp.name: inputs[inp.name]
+                        for inp in self.model.get_inputs()
+                        if inp.name in inputs
+                    }
+
+                    # If required, add dummy text inputs
+                    input_names = [i.name for i in self.model.get_inputs()]
+                    batch_size = inputs["pixel_values"].shape[0]
+                    if "input_ids" in input_names:
+                        ort_inputs["input_ids"] = np.zeros((batch_size, 1), dtype=np.int64)
+                    if "attention_mask" in input_names:
+                        ort_inputs["attention_mask"] = np.ones((batch_size, 1), dtype=np.int64)
+
+                    # Run inference
+                    ort_outputs = self.model.run(None, ort_inputs)
+
+                    # Look up correct output name
+                    output_names = [o.name for o in self.model.get_outputs()]
+                    if "image_embeds" in output_names:
+                        image_embedding = ort_outputs[output_names.index("image_embeds")]
+                    else:
+                        raise RuntimeError(f"No 'image_embeds' found in outputs: {output_names}")
+
+                embedding = image_embedding / np.linalg.norm(image_embedding, axis=1, keepdims=True)
+                embedding = embedding[0]
 
             elif self.model_name == "resnet":
                 inputs = self.processor(images=pil_image, return_tensors="pt")
@@ -156,19 +181,45 @@ def get_text_embedding(self, text: str) -> np.ndarray:
         try:
             import torch
 
-            inputs = self.processor(text=[text], return_tensors="pt", padding=True)
+            inputs = self.processor(text=[text], return_tensors="np", padding=True)
 
             with torch.no_grad():
-                text_features = self.model.get_text_features(**inputs)
-
-            # Normalize the features
-            text_embedding = text_features / text_features.norm(dim=1, keepdim=True)
-            embedding = text_embedding.numpy()[0]
+                # Prepare ONNX input dict (handle only what's needed)
+                ort_inputs = {
+                    inp.name: inputs[inp.name]
+                    for inp in self.model.get_inputs()
+                    if inp.name in inputs
+                }
+                # Determine which inputs are expected by the ONNX model
+                input_names = [i.name for i in self.model.get_inputs()]
+                batch_size = inputs["input_ids"].shape[0]  # pulled from text input
+
+                # If the model expects pixel_values (i.e., fused model), add dummy vision input
+                if "pixel_values" in input_names:
+                    ort_inputs["pixel_values"] = np.zeros(
+                        (batch_size, 3, 224, 224), dtype=np.float32
+                    )
+
+                # Run inference
+                ort_outputs = self.model.run(None, ort_inputs)
+
+                # Determine correct output (usually 'last_hidden_state' or 'text_embeds')
+                output_names = [o.name for o in self.model.get_outputs()]
+                if "text_embeds" in output_names:
+                    text_embedding = ort_outputs[output_names.index("text_embeds")]
+                else:
+                    text_embedding = ort_outputs[0]  # fallback to first output
+
+                # Normalize
+                text_embedding = text_embedding / np.linalg.norm(
+                    text_embedding, axis=1, keepdims=True
+                )
+                text_embedding = text_embedding[0]  # shape: (512,)
 
             logger.debug(
-                f"Generated text embedding with shape {embedding.shape} for text: '{text}'"
+                f"Generated text embedding with shape {text_embedding.shape} for text: '{text}'"
             )
-            return embedding
+            return text_embedding
 
         except Exception as e:
             logger.error(f"Error generating text embedding: {e}")
diff --git a/dimos/agents/memory/test_image_embedding.py b/dimos/agents/memory/test_image_embedding.py
new file mode 100644
index 0000000000..38877b1461
--- /dev/null
+++ b/dimos/agents/memory/test_image_embedding.py
@@ -0,0 +1,209 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Test module for the CLIP image embedding functionality in dimos.
+"""
+
+import os
+import time
+import numpy as np
+import pytest
+import reactivex as rx
+from reactivex import operators as ops
+from dimos.stream.video_provider import VideoProvider
+from dimos.agents.memory.image_embedding import ImageEmbeddingProvider
+
+
+class TestImageEmbedding:
+    """Test class for CLIP image embedding functionality."""
+
+    def test_clip_embedding_initialization(self):
+        """Test CLIP embedding provider initializes correctly."""
+        try:
+            # Initialize the embedding provider with CLIP model
+            embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)
+            assert embedding_provider.model is not None, "CLIP model failed to initialize"
+            assert embedding_provider.processor is not None, "CLIP processor failed to initialize"
+            assert embedding_provider.model_name == "clip", "Model name should be 'clip'"
+            assert embedding_provider.dimensions == 512, "Embedding dimensions should be 512"
+        except Exception as e:
+            pytest.skip(f"Skipping test due to model initialization error: {e}")
+
+    def test_clip_embedding_process_video(self):
+        """Test CLIP embedding provider can process video frames and return embeddings."""
+        try:
+            from dimos.utils.testing import testData
+
+            video_path = testData("assets") / "trimmed_video_office.mov"
+
+            embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)
+
+            assert os.path.exists(video_path), f"Test video not found: {video_path}"
+            video_provider = VideoProvider(dev_name="test_video", video_source=video_path)
+
+            video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15)
+
+            # Use ReactiveX operators to process the stream
+            def process_frame(frame):
+                try:
+                    # Process frame with CLIP
+                    embedding = embedding_provider.get_embedding(frame)
+                    print(
+                        f"Generated CLIP embedding with shape: {embedding.shape}, norm: {np.linalg.norm(embedding):.4f}"
+                    )
+
+                    return {"frame": frame, "embedding": embedding}
+                except Exception as e:
+                    print(f"Error in process_frame: {e}")
+                    return None
+
+            embedding_stream = video_stream.pipe(ops.map(process_frame))
+
+            results = []
+            frames_processed = 0
+            target_frames = 10
+
+            def on_next(result):
+                nonlocal frames_processed, results
+                if not result:  # Skip None results
+                    return
+
+                results.append(result)
+                frames_processed += 1
+
+                # Stop processing after target frames
+                if frames_processed >= target_frames:
+                    subscription.dispose()
+
+            def on_error(error):
+                pytest.fail(f"Error in embedding stream: {error}")
+
+            def on_completed():
+                pass
+
+            # Subscribe and wait for results
+            subscription = embedding_stream.subscribe(
+                on_next=on_next, on_error=on_error, on_completed=on_completed
+            )
+
+            timeout = 60.0
+            start_time = time.time()
+            while frames_processed < target_frames and time.time() - start_time < timeout:
+                time.sleep(0.5)
+                print(f"Processed {frames_processed}/{target_frames} frames")
+
+            # Clean up subscription
+            subscription.dispose()
+            video_provider.dispose_all()
+
+            # Check if we have results
+            if len(results) == 0:
+                pytest.skip("No embeddings generated, but test connection established correctly")
+                return
+
+            print(f"Processed {len(results)} frames with CLIP embeddings")
+
+            # Analyze the results
+            assert len(results) > 0, "No embeddings generated"
+
+            # Check properties of first embedding
+            first_result = results[0]
+            assert "embedding" in first_result, "Result doesn't contain embedding"
+            assert "frame" in first_result, "Result doesn't contain frame"
+
+            # Check embedding shape and normalization
+            embedding = first_result["embedding"]
+            assert isinstance(embedding, np.ndarray), "Embedding is not a numpy array"
+            assert embedding.shape == (512,), (
+                f"Embedding has wrong shape: {embedding.shape}, expected (512,)"
+            )
+            assert abs(np.linalg.norm(embedding) - 1.0) < 1e-5, "Embedding is not normalized"
+
+            # Save the first embedding for similarity tests
+            if len(results) > 1 and "embedding" in results[0]:
+                # Create a class variable to store embeddings for the similarity test
+                TestImageEmbedding.test_embeddings = {
+                    "embedding1": results[0]["embedding"],
+                    "embedding2": results[1]["embedding"] if len(results) > 1 else None,
+                }
+                print(f"Saved embeddings for similarity testing")
+
+            print("CLIP embedding test passed successfully!")
+
+        except Exception as e:
+            pytest.fail(f"Test failed with error: {e}")
+
+    def test_clip_embedding_similarity(self):
+        """Test CLIP embedding similarity search and text-to-image queries."""
+        try:
+            # Skip if previous test didn't generate embeddings
+            if not hasattr(TestImageEmbedding, "test_embeddings"):
+                pytest.skip("No embeddings available from previous test")
+                return
+
+            # Get embeddings from previous test
+            embedding1 = TestImageEmbedding.test_embeddings["embedding1"]
+            embedding2 = TestImageEmbedding.test_embeddings["embedding2"]
+
+            # Initialize embedding provider for text embeddings
+            embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)
+
+            # Test frame-to-frame similarity
+            if embedding1 is not None and embedding2 is not None:
+                # Compute cosine similarity
+                similarity = np.dot(embedding1, embedding2)
+                print(f"Similarity between first two frames: {similarity:.4f}")
+
+                # Should be in range [-1, 1]
+                assert -1.0 <= similarity <= 1.0, f"Similarity out of valid range: {similarity}"
+
+            # Test text-to-image similarity
+            if embedding1 is not None:
+                # Generate a list of text queries to test
+                text_queries = ["a video frame", "a person", "an outdoor scene", "a kitchen"]
+
+                # Test each text query
+                for text_query in text_queries:
+                    # Get text embedding
+                    text_embedding = embedding_provider.get_text_embedding(text_query)
+
+                    # Check text embedding properties
+                    assert isinstance(text_embedding, np.ndarray), (
+                        "Text embedding is not a numpy array"
+                    )
+                    assert text_embedding.shape == (512,), (
+                        f"Text embedding has wrong shape: {text_embedding.shape}"
+                    )
+                    assert abs(np.linalg.norm(text_embedding) - 1.0) < 1e-5, (
+                        "Text embedding is not normalized"
+                    )
+
+                    # Compute similarity between frame and text
+                    text_similarity = np.dot(embedding1, text_embedding)
+                    print(f"Similarity between frame and '{text_query}': {text_similarity:.4f}")
+
+                    # Should be in range [-1, 1]
+                    assert -1.0 <= text_similarity <= 1.0, (
+                        f"Text-image similarity out of range: {text_similarity}"
+                    )
+
+            print("CLIP embedding similarity tests passed successfully!")
+
+        except Exception as e:
+            pytest.fail(f"Similarity test failed with error: {e}")
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", "--disable-warnings", __file__])
diff --git a/dimos/perception/detection2d/test_yolo_2d_det.py b/dimos/perception/detection2d/test_yolo_2d_det.py
new file mode 100644
index 0000000000..5316bfee90
--- /dev/null
+++ b/dimos/perception/detection2d/test_yolo_2d_det.py
@@ -0,0 +1,175 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import pytest
+import cv2
+import numpy as np
+import reactivex as rx
+from reactivex import operators as ops
+from dimos.perception.detection2d.yolo_2d_det import Yolo2DDetector
+from dimos.stream.video_provider import VideoProvider
+
+
+class TestYolo2DDetector:
+    def test_yolo_detector_initialization(self):
+        """Test YOLO detector initializes correctly with default model path."""
+        try:
+            detector = Yolo2DDetector()
+            assert detector is not None
+            assert detector.model is not None
+        except Exception as e:
+            # If the model file doesn't exist, the test should still pass with a warning
+            pytest.skip(f"Skipping test due to model initialization error: {e}")
+
+    def test_yolo_detector_process_image(self):
+        """Test YOLO detector can process video frames and return detection results."""
+        try:
+            # Import testData inside method to avoid pytest fixture confusion
+            from dimos.utils.testing import testData
+
+            detector = Yolo2DDetector()
+
+            video_path = testData("assets") / "trimmed_video_office.mov"
+
+            # Create video provider and directly get a video stream observable
+            assert os.path.exists(video_path), f"Test video not found: {video_path}"
+            video_provider = VideoProvider(dev_name="test_video", video_source=video_path)
+            # Process more frames for thorough testing
+            video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15)
+
+            # Use ReactiveX operators to process the stream
+            def process_frame(frame):
+                try:
+                    # Process frame with YOLO
+                    bboxes, track_ids, class_ids, confidences, names = detector.process_image(frame)
+                    print(
+                        f"YOLO results - boxes: {(bboxes)}, tracks: {len(track_ids)}, classes: {(class_ids)}, confidences: {(confidences)}, names: {(names)}"
+                    )
+
+                    return {
+                        "frame": frame,
+                        "bboxes": bboxes,
+                        "track_ids": track_ids,
+                        "class_ids": class_ids,
+                        "confidences": confidences,
+                        "names": names,
+                    }
+                except Exception as e:
+                    print(f"Exception in process_frame: {e}")
+                    return {}
+
+            # Create the detection stream using pipe and map operator
+            detection_stream = video_stream.pipe(ops.map(process_frame))
+
+            # Collect results from the stream
+            results = []
+
+            frames_processed = 0
+            target_frames = 10
+
+            def on_next(result):
+                nonlocal frames_processed
+                if not result:
+                    return
+
+                results.append(result)
+                frames_processed += 1
+
+                # Stop after processing target number of frames
+                if frames_processed >= target_frames:
+                    subscription.dispose()
+
+            def on_error(error):
+                pytest.fail(f"Error in detection stream: {error}")
+
+            def on_completed():
+                pass
+
+            # Subscribe and wait for results
+            subscription = detection_stream.subscribe(
+                on_next=on_next, on_error=on_error, on_completed=on_completed
+            )
+
+            timeout = 10.0
+            start_time = time.time()
+            while frames_processed < target_frames and time.time() - start_time < timeout:
+                time.sleep(0.5)
+
+            # Clean up subscription
+            subscription.dispose()
+            video_provider.dispose_all()
+            # Check that we got detection results
+            if len(results) == 0:
+                pytest.skip("Skipping test due to error: Failed to get any detection results")
+
+            # Verify we have detection results with expected properties
+            assert len(results) > 0, "No detection results were received"
+
+            # Print statistics about detections
+            total_detections = sum(len(r["bboxes"]) for r in results if r.get("bboxes"))
+            avg_detections = total_detections / len(results) if results else 0
+            print(f"Total detections: {total_detections}, Average per frame: {avg_detections:.2f}")
+
+            # Print most common detected objects
+            object_counts = {}
+            for r in results:
+                if r.get("names"):
+                    for name in r["names"]:
+                        if name:
+                            object_counts[name] = object_counts.get(name, 0) + 1
+
+            if object_counts:
+                print("Detected objects:")
+                for obj, count in sorted(object_counts.items(), key=lambda x: x[1], reverse=True)[
+                    :5
+                ]:
+                    print(f"  - {obj}: {count} times")
+
+            # Analyze the first result
+            result = results[0]
+
+            # Check that we have a frame
+            assert "frame" in result, "Result doesn't contain a frame"
+            assert isinstance(result["frame"], np.ndarray), "Frame is not a numpy array"
+
+            # Check that detection results are valid
+            assert isinstance(result["bboxes"], list)
+            assert isinstance(result["track_ids"], list)
+            assert isinstance(result["class_ids"], list)
+            assert isinstance(result["confidences"], list)
+            assert isinstance(result["names"], list)
+
+            # All result lists should be the same length
+            assert (
+                len(result["bboxes"])
+                == len(result["track_ids"])
+                == len(result["class_ids"])
+                == len(result["confidences"])
+                == len(result["names"])
+            )
+
+            # If we have detections, check that bbox format is valid
+            if result["bboxes"]:
+                assert len(result["bboxes"][0]) == 4, (
+                    "Bounding boxes should be in [x1, y1, x2, y2] format"
+                )
+
+        except Exception as e:
+            pytest.skip(f"Skipping test due to error: {e}")
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/dimos/perception/detection2d/yolo_2d_det.py b/dimos/perception/detection2d/yolo_2d_det.py
index f058a97147..34e094b425 100644
--- a/dimos/perception/detection2d/yolo_2d_det.py
+++ b/dimos/perception/detection2d/yolo_2d_det.py
@@ -20,22 +20,37 @@
     filter_detections,
 )
 import os
+import onnxruntime
+from dimos.utils.logging_config import setup_logger
+from dimos.utils.path_utils import get_project_root
+from dimos.utils.testing import testData
+from dimos.utils.gpu_utils import is_cuda_available
+
+logger = setup_logger("dimos.perception.detection2d.yolo_2d_det")
 
 
 class Yolo2DDetector:
-    def __init__(self, model_path="models/yolo11n.engine", device="cuda"):
+    def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device="cpu"):
         """
         Initialize the YOLO detector.
 
         Args:
-            model_path (str): Path to the YOLO model weights
+            model_path (str): Path to the YOLO model weights in tests/data LFS directory
+            model_name (str): Name of the YOLO model weights file
             device (str): Device to run inference on ('cuda' or 'cpu')
         """
         self.device = device
-        self.model = YOLO(model_path)
+        self.model = YOLO(testData(model_path) / model_name)
 
         module_dir = os.path.dirname(__file__)
         self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml")
+        if is_cuda_available():
+            onnxruntime.preload_dlls(cuda=True, cudnn=True)
+            self.device = "cuda"
+            logger.info("Using CUDA for YOLO 2d detector")
+        else:
+            self.device = "cpu"
+            logger.info("Using CPU for YOLO 2d detector")
 
     def process_image(self, image):
         """
diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py
index 2640408414..568d09e1e4 100644
--- a/dimos/perception/segmentation/sam_2d_seg.py
+++ b/dimos/perception/segmentation/sam_2d_seg.py
@@ -21,26 +21,41 @@
     plot_results,
     crop_images_from_bboxes,
 )
+from dimos.utils.gpu_utils import is_cuda_available
 from dimos.perception.common.detection2d_tracker import target2dTracker, get_tracked_results
 from dimos.perception.segmentation.image_analyzer import ImageAnalyzer
 import os
 from collections import deque
 from concurrent.futures import ThreadPoolExecutor
+from dimos.utils.logging_config import setup_logger
+from dimos.utils.path_utils import get_project_root
+import onnxruntime
+from dimos.utils.testing import testData
+
+logger = setup_logger("dimos.perception.segmentation.sam_2d_seg")
 
 
 class Sam2DSegmenter:
     def __init__(
         self,
-        model_path="FastSAM-s.pt",
-        device="cuda",
+        model_path="models_fastsam",
+        model_name="FastSAM-s.onnx",
+        device="cpu",
         min_analysis_interval=5.0,
         use_tracker=True,
         use_analyzer=True,
         use_rich_labeling=False,
     ):
-        # Core components
         self.device = device
-        self.model = FastSAM(model_path)
+        if is_cuda_available():
+            logger.info("Using CUDA for SAM 2d segmenter")
+            onnxruntime.preload_dlls(cuda=True, cudnn=True)
+            self.device = "cuda"
+        else:
+            logger.info("Using CPU for SAM 2d segmenter")
+            self.device = "cpu"
+        # Core components
+        self.model = FastSAM(testData(model_path) / model_name)
         self.use_tracker = use_tracker
         self.use_analyzer = use_analyzer
         self.use_rich_labeling = use_rich_labeling
diff --git a/dimos/perception/segmentation/test_sam_2d_seg.py b/dimos/perception/segmentation/test_sam_2d_seg.py
new file mode 100644
index 0000000000..fc7e488e51
--- /dev/null
+++ b/dimos/perception/segmentation/test_sam_2d_seg.py
@@ -0,0 +1,211 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+from dimos.stream import video_provider
+import pytest
+import cv2
+import numpy as np
+import reactivex as rx
+from reactivex import operators as ops
+from dimos.stream.video_provider import VideoProvider
+from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter
+from dimos.perception.segmentation.utils import extract_masks_bboxes_probs_names
+
+
+class TestSam2DSegmenter:
+    def test_sam_segmenter_initialization(self):
+        """Test FastSAM segmenter initializes correctly with default model path."""
+        try:
+            # Try to initialize with the default model path and existing device setting
+            segmenter = Sam2DSegmenter(use_analyzer=False)
+            assert segmenter is not None
+            assert segmenter.model is not None
+        except Exception as e:
+            # If the model file doesn't exist, the test should still pass with a warning
+            pytest.skip(f"Skipping test due to model initialization error: {e}")
+
+    def test_sam_segmenter_process_image(self):
+        """Test FastSAM segmenter can process video frames and return segmentation masks."""
+        # Import testData inside method to avoid pytest fixture confusion
+        from dimos.utils.testing import testData
+
+        # Get test video path directly
+        video_path = testData("assets") / "trimmed_video_office.mov"
+        try:
+            # Initialize segmenter without analyzer for faster testing
+            segmenter = Sam2DSegmenter(use_analyzer=False)
+
+            # Note: conf and iou are parameters for process_image, not constructor
+            # We'll monkey patch the process_image method to use lower thresholds
+            original_process_image = segmenter.process_image
+
+            def patched_process_image(image):
+                results = segmenter.model.track(
+                    source=image,
+                    device=segmenter.device,
+                    retina_masks=True,
+                    conf=0.1,  # Lower confidence threshold for testing
+                    iou=0.5,  # Lower IoU threshold
+                    persist=True,
+                    verbose=False,
+                    tracker=segmenter.tracker_config
+                    if hasattr(segmenter, "tracker_config")
+                    else None,
+                )
+
+                if len(results) > 0:
+                    masks, bboxes, track_ids, probs, names, areas = (
+                        extract_masks_bboxes_probs_names(results[0])
+                    )
+                    return masks, bboxes, track_ids, probs, names
+                return [], [], [], [], []
+
+            # Replace the method
+            segmenter.process_image = patched_process_image
+
+            # Create video provider and directly get a video stream observable
+            assert os.path.exists(video_path), f"Test video not found: {video_path}"
+            video_provider = VideoProvider(dev_name="test_video", video_source=video_path)
+
+            video_stream = video_provider.capture_video_as_observable(realtime=False, fps=1)
+
+            # Use ReactiveX operators to process the stream
+            def process_frame(frame):
+                try:
+                    # Process frame with FastSAM
+                    masks, bboxes, track_ids, probs, names = segmenter.process_image(frame)
+                    print(
+                        f"SAM results - masks: {len(masks)}, bboxes: {len(bboxes)}, track_ids: {len(track_ids)}, names: {len(names)}"
+                    )
+
+                    return {
+                        "frame": frame,
+                        "masks": masks,
+                        "bboxes": bboxes,
+                        "track_ids": track_ids,
+                        "probs": probs,
+                        "names": names,
+                    }
+                except Exception as e:
+                    print(f"Error in process_frame: {e}")
+                    return {}
+
+            # Create the segmentation stream using pipe and map operator
+            segmentation_stream = video_stream.pipe(ops.map(process_frame))
+
+            # Collect results from the stream
+            results = []
+            frames_processed = 0
+            target_frames = 5
+
+            def on_next(result):
+                nonlocal frames_processed, results
+                if not result:
+                    return
+
+                results.append(result)
+                frames_processed += 1
+
+                # Stop processing after target frames
+                if frames_processed >= target_frames:
+                    subscription.dispose()
+
+            def on_error(error):
+                pytest.fail(f"Error in segmentation stream: {error}")
+
+            def on_completed():
+                pass
+
+            # Subscribe and wait for results
+            subscription = segmentation_stream.subscribe(
+                on_next=on_next, on_error=on_error, on_completed=on_completed
+            )
+
+            # Wait for frames to be processed
+            timeout = 30.0  # seconds
+            start_time = time.time()
+            while frames_processed < target_frames and time.time() - start_time < timeout:
+                time.sleep(0.5)
+
+            # Clean up subscription
+            subscription.dispose()
+            video_provider.dispose_all()
+
+            # Check if we have results
+            if len(results) == 0:
+                pytest.skip(
+                    "No segmentation results found, but test connection established correctly"
+                )
+                return
+
+            print(f"Processed {len(results)} frames with segmentation results")
+
+            # Analyze the first result
+            result = results[0]
+
+            # Check that we have a frame
+            assert "frame" in result, "Result doesn't contain a frame"
+            assert isinstance(result["frame"], np.ndarray), "Frame is not a numpy array"
+
+            # Check that segmentation results are valid
+            assert isinstance(result["masks"], list)
+            assert isinstance(result["bboxes"], list)
+            assert isinstance(result["track_ids"], list)
+            assert isinstance(result["probs"], list)
+            assert isinstance(result["names"], list)
+
+            # All result lists should be the same length
+            assert (
+                len(result["masks"])
+                == len(result["bboxes"])
+                == len(result["track_ids"])
+                == len(result["probs"])
+                == len(result["names"])
+            )
+
+            # If we have masks, check that they have valid shape
+            if result.get("masks") and len(result["masks"]) > 0:
+                assert result["masks"][0].shape == (
+                    result["frame"].shape[0],
+                    result["frame"].shape[1],
+                ), "Mask shape should match image dimensions"
+                print(f"Found {len(result['masks'])} masks in first frame")
+            else:
+                print("No masks found in first frame, but test connection established correctly")
+
+            # Test visualization function
+            if result["masks"]:
+                vis_frame = segmenter.visualize_results(
+                    result["frame"],
+                    result["masks"],
+                    result["bboxes"],
+                    result["track_ids"],
+                    result["probs"],
+                    result["names"],
+                )
+                assert isinstance(vis_frame, np.ndarray), "Visualization output should be an image"
+                assert vis_frame.shape == result["frame"].shape, (
+                    "Visualization should have same dimensions as input frame"
+                )
+
+            # We've already tested visualization above, so no need for a duplicate test
+
+        except Exception as e:
+            pytest.skip(f"Skipping test due to error: {e}")
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/dimos/perception/semantic_seg.py b/dimos/perception/semantic_seg.py
index 3ef2eb7399..a07e69c279 100644
--- a/dimos/perception/semantic_seg.py
+++ b/dimos/perception/semantic_seg.py
@@ -25,7 +25,6 @@
 class SemanticSegmentationStream:
     def __init__(
         self,
-        model_path: str = "FastSAM-s.pt",
         device: str = "cuda",
         enable_mono_depth: bool = True,
         enable_rich_labeling: bool = True,
@@ -36,7 +35,6 @@ def __init__(
         Initialize a semantic segmentation stream using Sam2DSegmenter.
 
         Args:
-            model_path: Path to the FastSAM model file
             device: Computation device ("cuda" or "cpu")
             enable_mono_depth: Whether to enable monocular depth processing
             enable_rich_labeling: Whether to enable rich labeling
@@ -45,7 +43,6 @@ def __init__(
                 - Physical parameters: resolution, focal_length, sensor_size
         """
         self.segmenter = Sam2DSegmenter(
-            model_path=model_path,
             device=device,
             min_analysis_interval=5.0,
             use_tracker=True,
diff --git a/dimos/perception/test_spatial_memory.py b/dimos/perception/test_spatial_memory.py
new file mode 100644
index 0000000000..a6f4fcfa69
--- /dev/null
+++ b/dimos/perception/test_spatial_memory.py
@@ -0,0 +1,213 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import tempfile
+import pytest
+import numpy as np
+import cv2
+import shutil
+import reactivex as rx
+from reactivex import operators as ops
+from reactivex.subject import Subject
+from reactivex import Observable
+
+from dimos.perception.spatial_perception import SpatialMemory
+from dimos.types.position import Position
+from dimos.stream.video_provider import VideoProvider
+from dimos.types.position import Position
+from dimos.types.vector import Vector
+
+
+class TestSpatialMemory:
+    @pytest.fixture(scope="function")
+    def temp_dir(self):
+        # Create a temporary directory for storing spatial memory data
+        temp_dir = tempfile.mkdtemp()
+        yield temp_dir
+        # Clean up
+        shutil.rmtree(temp_dir)
+
+    def test_spatial_memory_initialization(self):
+        """Test SpatialMemory initializes correctly with CLIP model."""
+        try:
+            # Initialize spatial memory with default CLIP model
+            memory = SpatialMemory(
+                collection_name="test_collection", embedding_model="clip", new_memory=True
+            )
+            assert memory is not None
+            assert memory.embedding_model == "clip"
+            assert memory.embedding_provider is not None
+        except Exception as e:
+            # If the model doesn't initialize, skip the test
+            pytest.fail(f"Failed to initialize model: {e}")
+
+    def test_image_embedding(self):
+        """Test generating image embeddings using CLIP."""
+        try:
+            # Initialize spatial memory with CLIP model
+            memory = SpatialMemory(
+                collection_name="test_collection", embedding_model="clip", new_memory=True
+            )
+
+            # Create a test image - use a simple colored square
+            test_image = np.zeros((224, 224, 3), dtype=np.uint8)
+            test_image[50:150, 50:150] = [0, 0, 255]  # Blue square
+
+            # Generate embedding
+            embedding = memory.embedding_provider.get_embedding(test_image)
+
+            # Check embedding shape and characteristics
+            assert embedding is not None
+            assert isinstance(embedding, np.ndarray)
+            assert embedding.shape[0] == memory.embedding_dimensions
+
+            # Check that embedding is normalized (unit vector)
+            assert np.isclose(np.linalg.norm(embedding), 1.0, atol=1e-5)
+
+            # Test text embedding
+            text_embedding = memory.embedding_provider.get_text_embedding("a blue square")
+            assert text_embedding is not None
+            assert isinstance(text_embedding, np.ndarray)
+            assert text_embedding.shape[0] == memory.embedding_dimensions
+            assert np.isclose(np.linalg.norm(text_embedding), 1.0, atol=1e-5)
+        except Exception as e:
+            pytest.fail(f"Error in test: {e}")
+
+    def test_spatial_memory_processing(self, temp_dir):
+        """Test processing video frames and building spatial memory with CLIP embeddings."""
+        try:
+            # Initialize spatial memory with temporary storage
+            memory = SpatialMemory(
+                collection_name="test_collection",
+                embedding_model="clip",
+                new_memory=True,
+                db_path=os.path.join(temp_dir, "chroma_db"),
+                visual_memory_path=os.path.join(temp_dir, "visual_memory.pkl"),
+                output_dir=os.path.join(temp_dir, "images"),
+                min_distance_threshold=0.01,
+                min_time_threshold=0.01,
+            )
+
+            from dimos.utils.testing import testData
+
+            video_path = testData("assets") / "trimmed_video_office.mov"
+            assert os.path.exists(video_path), f"Test video not found: {video_path}"
+            video_provider = VideoProvider(dev_name="test_video", video_source=video_path)
+            video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15)
+
+            # Create a frame counter for position generation
+            frame_counter = 0
+
+            # Process each video frame directly
+            def process_frame(frame):
+                nonlocal frame_counter
+
+                # Generate a unique position for this frame to ensure minimum distance threshold is met
+                pos = Position(frame_counter * 0.5, frame_counter * 0.5, 0)
+                transform = {"position": pos, "timestamp": time.time()}
+                frame_counter += 1
+
+                # Create a dictionary with frame, position and rotation for SpatialMemory.process_stream
+                return {
+                    "frame": frame,
+                    "position": transform["position"],
+                    "rotation": transform["position"],  # Using position as rotation for testing
+                }
+
+            # Create a stream that processes each frame
+            formatted_stream = video_stream.pipe(ops.map(process_frame))
+
+            # Process the stream using SpatialMemory's built-in processing
+            print("Creating spatial memory stream...")
+            spatial_stream = memory.process_stream(formatted_stream)
+
+            # Stream is now created above using memory.process_stream()
+
+            # Collect results from the stream
+            results = []
+
+            frames_processed = 0
+            target_frames = 100  # Process more frames for thorough testing
+
+            def on_next(result):
+                nonlocal results, frames_processed
+                if not result:  # Skip None results
+                    return
+
+                results.append(result)
+                frames_processed += 1
+
+                # Stop processing after target frames
+                if frames_processed >= target_frames:
+                    subscription.dispose()
+
+            def on_error(error):
+                pytest.fail(f"Error in spatial stream: {error}")
+
+            def on_completed():
+                pass
+
+            # Subscribe and wait for results
+            subscription = spatial_stream.subscribe(
+                on_next=on_next, on_error=on_error, on_completed=on_completed
+            )
+
+            # Wait for frames to be processed
+            timeout = 30.0  # seconds
+            start_time = time.time()
+            while frames_processed < target_frames and time.time() - start_time < timeout:
+                time.sleep(0.5)
+
+            subscription.dispose()
+
+            assert len(results) > 0, "Failed to process any frames with spatial memory"
+
+            relevant_queries = ["office", "room with furniture"]
+            irrelevant_query = "star wars"
+
+            for query in relevant_queries:
+                results = memory.query_by_text(query, limit=2)
+                print(f"\nResults for query: '{query}'")
+
+                assert len(results) > 0, f"No results found for relevant query: {query}"
+
+                similarities = [1 - r.get("distance") for r in results]
+                print(f"Similarities: {similarities}")
+
+                assert any(d > 0.24 for d in similarities), (
+                    f"Expected at least one result with similarity > 0.24 for query '{query}'"
+                )
+
+            results = memory.query_by_text(irrelevant_query, limit=2)
+            print(f"\nResults for query: '{irrelevant_query}'")
+
+            if results:
+                similarities = [1 - r.get("distance") for r in results]
+                print(f"Similarities: {similarities}")
+
+                assert all(d < 0.25 for d in similarities), (
+                    f"Expected all results to have similarity < 0.25 for irrelevant query '{irrelevant_query}'"
+                )
+
+        except Exception as e:
+            pytest.fail(f"Error in test: {e}")
+        finally:
+            memory.cleanup()
+            video_provider.dispose_all()
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/dimos/utils/gpu_utils.py b/dimos/utils/gpu_utils.py
new file mode 100644
index 0000000000..e40516deec
--- /dev/null
+++ b/dimos/utils/gpu_utils.py
@@ -0,0 +1,24 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def is_cuda_available():
+    try:
+        import pycuda.driver as cuda
+        import pycuda.autoinit  # implicitly initializes the CUDA driver
+
+        cuda.init()
+        return cuda.Device.count() > 0
+    except Exception:
+        return False
diff --git a/dimos/utils/path_utils.py b/dimos/utils/path_utils.py
new file mode 100644
index 0000000000..d60014d068
--- /dev/null
+++ b/dimos/utils/path_utils.py
@@ -0,0 +1,22 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+
+def get_project_root() -> Path:
+    """
+    Returns the absolute path to the project root directory.
+    """
+    return Path(__file__).resolve().parent.parent.parent
diff --git a/pyproject.toml b/pyproject.toml
index 2f81fd62b1..7a9a0a23dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,7 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
 [project]
 name = "dimos"
 authors = [
@@ -6,6 +10,15 @@ authors = [
 version = "0.0.2"
 description = "Powering agentive generalist robotics"
 
+[project.optional-dependencies]
+cuda = [
+    "pycuda",
+    "onnxruntime-gpu[cuda,cudnn]"
+]
+
+[tool.setuptools]
+packages = ["dimos"]
+
 [tool.ruff]
 line-length = 100
 exclude = [
diff --git a/requirements.txt b/requirements.txt
index 7f71ec17cd..10dc835c57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -95,4 +95,5 @@ git+https://github.com/facebookresearch/detectron2.git@v0.6
 # Mapping
 open3d
 
-# Touch for rebuild
+# Inference (CPU)
+onnxruntime
\ No newline at end of file
diff --git a/tests/data/.lfs/assets.tar.gz b/tests/data/.lfs/assets.tar.gz
new file mode 100644
index 0000000000..b7a2fcbd1c
--- /dev/null
+++ b/tests/data/.lfs/assets.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b14b01f5c907f117331213abfce9ef5d0c41d0524e14327b5cc706520fb2035
+size 2306191
diff --git a/tests/data/.lfs/models_clip.tar.gz b/tests/data/.lfs/models_clip.tar.gz
new file mode 100644
index 0000000000..a4ab2b5f88
--- /dev/null
+++ b/tests/data/.lfs/models_clip.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:102f11bb0aa952b3cebc4491c5ed3f2122e8c38c76002e22400da4f1e5ca90c5
+size 392327708
diff --git a/tests/data/.lfs/models_fastsam.tar.gz b/tests/data/.lfs/models_fastsam.tar.gz
new file mode 100644
index 0000000000..77278f4323
--- /dev/null
+++ b/tests/data/.lfs/models_fastsam.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:682cb3816451bd73722cc430fdfce15bbe72a07e50ef2ea81ddaed61d1f22a25
+size 39971209
diff --git a/tests/data/.lfs/models_yolo.tar.gz b/tests/data/.lfs/models_yolo.tar.gz
new file mode 100644
index 0000000000..aca0915dfd
--- /dev/null
+++ b/tests/data/.lfs/models_yolo.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed4a5160d4edfda145b6752b5c49ad22bc2887b66b9b9c38bd8c35fb5ffaf8f
+size 9315806