diff --git a/.gitattributes b/.gitattributes index feb56d98ed..e808d54903 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,17 +1,16 @@ # Handle line endings automatically for files Git considers text, # converting them to LF on checkout. * text=auto eol=lf - # Ensure Python files always use LF for line endings. *.py text eol=lf - # Treat designated file types as binary and do not alter their contents or line endings. *.png binary *.jpg binary -*.gif binary *.ico binary *.pdf binary -*.mp4 binary - # Explicit LFS tracking for test files tests/data/.lfs/*.tar.gz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text binary +*.mp4 filter=lfs diff=lfs merge=lfs -text binary +*.mov filter=lfs diff=lfs merge=lfs -text binary +*.gif filter=lfs diff=lfs merge=lfs -text binary diff --git a/assets/dimos_interface.gif b/assets/dimos_interface.gif index c42fe4e903..e610a2b390 100644 Binary files a/assets/dimos_interface.gif and b/assets/dimos_interface.gif differ diff --git a/assets/framecount.mp4 b/assets/framecount.mp4 index 74b0b2322b..759ee6ab27 100644 Binary files a/assets/framecount.mp4 and b/assets/framecount.mp4 differ diff --git a/assets/simple_demo.mp4 b/assets/simple_demo.mp4 index 086a636383..cb8a635e78 100644 Binary files a/assets/simple_demo.mp4 and b/assets/simple_demo.mp4 differ diff --git a/assets/simple_demo_small.gif b/assets/simple_demo_small.gif index 294a2a0879..3c2cf54ef4 100644 Binary files a/assets/simple_demo_small.gif and b/assets/simple_demo_small.gif differ diff --git a/assets/trimmed_video_office.mov b/assets/trimmed_video_office.mov index 278582f74e..a3072be8fc 100644 Binary files a/assets/trimmed_video_office.mov and b/assets/trimmed_video_office.mov differ diff --git a/dimos/agents/memory/image_embedding.py b/dimos/agents/memory/image_embedding.py index 8bcd225d85..265d15892f 100644 --- a/dimos/agents/memory/image_embedding.py +++ b/dimos/agents/memory/image_embedding.py @@ -27,6 +27,7 @@ import cv2 import base64 from dimos.utils.logging_config import setup_logger +from dimos.utils.testing import testData logger = setup_logger("dimos.agents.memory.image_embedding") @@ -60,12 +61,14 @@ def _initialize_model(self): """Initialize the specified embedding model.""" try: import torch - from transformers import CLIPProcessor, CLIPModel, AutoFeatureExtractor, AutoModel + from transformers import CLIPProcessor, AutoFeatureExtractor, AutoModel + import onnxruntime as ort if self.model_name == "clip": - model_id = "openai/clip-vit-base-patch32" - self.model = CLIPModel.from_pretrained(model_id) - self.processor = CLIPProcessor.from_pretrained(model_id) + model_id = testData("models_clip") / "model.onnx" + processor_id = "openai/clip-vit-base-patch32" + self.model = ort.InferenceSession(model_id) + self.processor = CLIPProcessor.from_pretrained(processor_id) logger.info(f"Loaded CLIP model: {model_id}") elif self.model_name == "resnet": model_id = "microsoft/resnet-50" @@ -103,13 +106,35 @@ def get_embedding(self, image: Union[np.ndarray, str, bytes]) -> np.ndarray: import torch if self.model_name == "clip": - inputs = self.processor(images=pil_image, return_tensors="pt") + inputs = self.processor(images=pil_image, return_tensors="np") with torch.no_grad(): - image_features = self.model.get_image_features(**inputs) - - image_embedding = image_features / image_features.norm(dim=1, keepdim=True) - embedding = image_embedding.numpy()[0] + ort_inputs = { + inp.name: inputs[inp.name] + for inp in self.model.get_inputs() + if inp.name in inputs + } + + # If required, add dummy text inputs + input_names = [i.name for i in self.model.get_inputs()] + batch_size = inputs["pixel_values"].shape[0] + if "input_ids" in input_names: + ort_inputs["input_ids"] = np.zeros((batch_size, 1), dtype=np.int64) + if "attention_mask" in input_names: + ort_inputs["attention_mask"] = np.ones((batch_size, 1), dtype=np.int64) + + # Run inference + ort_outputs = self.model.run(None, ort_inputs) + + # Look up correct output name + output_names = [o.name for o in self.model.get_outputs()] + if "image_embeds" in output_names: + image_embedding = ort_outputs[output_names.index("image_embeds")] + else: + raise RuntimeError(f"No 'image_embeds' found in outputs: {output_names}") + + embedding = image_embedding / np.linalg.norm(image_embedding, axis=1, keepdims=True) + embedding = embedding[0] elif self.model_name == "resnet": inputs = self.processor(images=pil_image, return_tensors="pt") @@ -156,19 +181,45 @@ def get_text_embedding(self, text: str) -> np.ndarray: try: import torch - inputs = self.processor(text=[text], return_tensors="pt", padding=True) + inputs = self.processor(text=[text], return_tensors="np", padding=True) with torch.no_grad(): - text_features = self.model.get_text_features(**inputs) - - # Normalize the features - text_embedding = text_features / text_features.norm(dim=1, keepdim=True) - embedding = text_embedding.numpy()[0] + # Prepare ONNX input dict (handle only what's needed) + ort_inputs = { + inp.name: inputs[inp.name] + for inp in self.model.get_inputs() + if inp.name in inputs + } + # Determine which inputs are expected by the ONNX model + input_names = [i.name for i in self.model.get_inputs()] + batch_size = inputs["input_ids"].shape[0] # pulled from text input + + # If the model expects pixel_values (i.e., fused model), add dummy vision input + if "pixel_values" in input_names: + ort_inputs["pixel_values"] = np.zeros( + (batch_size, 3, 224, 224), dtype=np.float32 + ) + + # Run inference + ort_outputs = self.model.run(None, ort_inputs) + + # Determine correct output (usually 'last_hidden_state' or 'text_embeds') + output_names = [o.name for o in self.model.get_outputs()] + if "text_embeds" in output_names: + text_embedding = ort_outputs[output_names.index("text_embeds")] + else: + text_embedding = ort_outputs[0] # fallback to first output + + # Normalize + text_embedding = text_embedding / np.linalg.norm( + text_embedding, axis=1, keepdims=True + ) + text_embedding = text_embedding[0] # shape: (512,) logger.debug( - f"Generated text embedding with shape {embedding.shape} for text: '{text}'" + f"Generated text embedding with shape {text_embedding.shape} for text: '{text}'" ) - return embedding + return text_embedding except Exception as e: logger.error(f"Error generating text embedding: {e}") diff --git a/dimos/agents/memory/test_image_embedding.py b/dimos/agents/memory/test_image_embedding.py new file mode 100644 index 0000000000..38877b1461 --- /dev/null +++ b/dimos/agents/memory/test_image_embedding.py @@ -0,0 +1,209 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test module for the CLIP image embedding functionality in dimos. +""" + +import os +import time +import numpy as np +import pytest +import reactivex as rx +from reactivex import operators as ops +from dimos.stream.video_provider import VideoProvider +from dimos.agents.memory.image_embedding import ImageEmbeddingProvider + + +class TestImageEmbedding: + """Test class for CLIP image embedding functionality.""" + + def test_clip_embedding_initialization(self): + """Test CLIP embedding provider initializes correctly.""" + try: + # Initialize the embedding provider with CLIP model + embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512) + assert embedding_provider.model is not None, "CLIP model failed to initialize" + assert embedding_provider.processor is not None, "CLIP processor failed to initialize" + assert embedding_provider.model_name == "clip", "Model name should be 'clip'" + assert embedding_provider.dimensions == 512, "Embedding dimensions should be 512" + except Exception as e: + pytest.skip(f"Skipping test due to model initialization error: {e}") + + def test_clip_embedding_process_video(self): + """Test CLIP embedding provider can process video frames and return embeddings.""" + try: + from dimos.utils.testing import testData + + video_path = testData("assets") / "trimmed_video_office.mov" + + embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512) + + assert os.path.exists(video_path), f"Test video not found: {video_path}" + video_provider = VideoProvider(dev_name="test_video", video_source=video_path) + + video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15) + + # Use ReactiveX operators to process the stream + def process_frame(frame): + try: + # Process frame with CLIP + embedding = embedding_provider.get_embedding(frame) + print( + f"Generated CLIP embedding with shape: {embedding.shape}, norm: {np.linalg.norm(embedding):.4f}" + ) + + return {"frame": frame, "embedding": embedding} + except Exception as e: + print(f"Error in process_frame: {e}") + return None + + embedding_stream = video_stream.pipe(ops.map(process_frame)) + + results = [] + frames_processed = 0 + target_frames = 10 + + def on_next(result): + nonlocal frames_processed, results + if not result: # Skip None results + return + + results.append(result) + frames_processed += 1 + + # Stop processing after target frames + if frames_processed >= target_frames: + subscription.dispose() + + def on_error(error): + pytest.fail(f"Error in embedding stream: {error}") + + def on_completed(): + pass + + # Subscribe and wait for results + subscription = embedding_stream.subscribe( + on_next=on_next, on_error=on_error, on_completed=on_completed + ) + + timeout = 60.0 + start_time = time.time() + while frames_processed < target_frames and time.time() - start_time < timeout: + time.sleep(0.5) + print(f"Processed {frames_processed}/{target_frames} frames") + + # Clean up subscription + subscription.dispose() + video_provider.dispose_all() + + # Check if we have results + if len(results) == 0: + pytest.skip("No embeddings generated, but test connection established correctly") + return + + print(f"Processed {len(results)} frames with CLIP embeddings") + + # Analyze the results + assert len(results) > 0, "No embeddings generated" + + # Check properties of first embedding + first_result = results[0] + assert "embedding" in first_result, "Result doesn't contain embedding" + assert "frame" in first_result, "Result doesn't contain frame" + + # Check embedding shape and normalization + embedding = first_result["embedding"] + assert isinstance(embedding, np.ndarray), "Embedding is not a numpy array" + assert embedding.shape == (512,), ( + f"Embedding has wrong shape: {embedding.shape}, expected (512,)" + ) + assert abs(np.linalg.norm(embedding) - 1.0) < 1e-5, "Embedding is not normalized" + + # Save the first embedding for similarity tests + if len(results) > 1 and "embedding" in results[0]: + # Create a class variable to store embeddings for the similarity test + TestImageEmbedding.test_embeddings = { + "embedding1": results[0]["embedding"], + "embedding2": results[1]["embedding"] if len(results) > 1 else None, + } + print(f"Saved embeddings for similarity testing") + + print("CLIP embedding test passed successfully!") + + except Exception as e: + pytest.fail(f"Test failed with error: {e}") + + def test_clip_embedding_similarity(self): + """Test CLIP embedding similarity search and text-to-image queries.""" + try: + # Skip if previous test didn't generate embeddings + if not hasattr(TestImageEmbedding, "test_embeddings"): + pytest.skip("No embeddings available from previous test") + return + + # Get embeddings from previous test + embedding1 = TestImageEmbedding.test_embeddings["embedding1"] + embedding2 = TestImageEmbedding.test_embeddings["embedding2"] + + # Initialize embedding provider for text embeddings + embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512) + + # Test frame-to-frame similarity + if embedding1 is not None and embedding2 is not None: + # Compute cosine similarity + similarity = np.dot(embedding1, embedding2) + print(f"Similarity between first two frames: {similarity:.4f}") + + # Should be in range [-1, 1] + assert -1.0 <= similarity <= 1.0, f"Similarity out of valid range: {similarity}" + + # Test text-to-image similarity + if embedding1 is not None: + # Generate a list of text queries to test + text_queries = ["a video frame", "a person", "an outdoor scene", "a kitchen"] + + # Test each text query + for text_query in text_queries: + # Get text embedding + text_embedding = embedding_provider.get_text_embedding(text_query) + + # Check text embedding properties + assert isinstance(text_embedding, np.ndarray), ( + "Text embedding is not a numpy array" + ) + assert text_embedding.shape == (512,), ( + f"Text embedding has wrong shape: {text_embedding.shape}" + ) + assert abs(np.linalg.norm(text_embedding) - 1.0) < 1e-5, ( + "Text embedding is not normalized" + ) + + # Compute similarity between frame and text + text_similarity = np.dot(embedding1, text_embedding) + print(f"Similarity between frame and '{text_query}': {text_similarity:.4f}") + + # Should be in range [-1, 1] + assert -1.0 <= text_similarity <= 1.0, ( + f"Text-image similarity out of range: {text_similarity}" + ) + + print("CLIP embedding similarity tests passed successfully!") + + except Exception as e: + pytest.fail(f"Similarity test failed with error: {e}") + + +if __name__ == "__main__": + pytest.main(["-v", "--disable-warnings", __file__]) diff --git a/dimos/perception/detection2d/test_yolo_2d_det.py b/dimos/perception/detection2d/test_yolo_2d_det.py new file mode 100644 index 0000000000..5316bfee90 --- /dev/null +++ b/dimos/perception/detection2d/test_yolo_2d_det.py @@ -0,0 +1,175 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import pytest +import cv2 +import numpy as np +import reactivex as rx +from reactivex import operators as ops +from dimos.perception.detection2d.yolo_2d_det import Yolo2DDetector +from dimos.stream.video_provider import VideoProvider + + +class TestYolo2DDetector: + def test_yolo_detector_initialization(self): + """Test YOLO detector initializes correctly with default model path.""" + try: + detector = Yolo2DDetector() + assert detector is not None + assert detector.model is not None + except Exception as e: + # If the model file doesn't exist, the test should still pass with a warning + pytest.skip(f"Skipping test due to model initialization error: {e}") + + def test_yolo_detector_process_image(self): + """Test YOLO detector can process video frames and return detection results.""" + try: + # Import testData inside method to avoid pytest fixture confusion + from dimos.utils.testing import testData + + detector = Yolo2DDetector() + + video_path = testData("assets") / "trimmed_video_office.mov" + + # Create video provider and directly get a video stream observable + assert os.path.exists(video_path), f"Test video not found: {video_path}" + video_provider = VideoProvider(dev_name="test_video", video_source=video_path) + # Process more frames for thorough testing + video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15) + + # Use ReactiveX operators to process the stream + def process_frame(frame): + try: + # Process frame with YOLO + bboxes, track_ids, class_ids, confidences, names = detector.process_image(frame) + print( + f"YOLO results - boxes: {(bboxes)}, tracks: {len(track_ids)}, classes: {(class_ids)}, confidences: {(confidences)}, names: {(names)}" + ) + + return { + "frame": frame, + "bboxes": bboxes, + "track_ids": track_ids, + "class_ids": class_ids, + "confidences": confidences, + "names": names, + } + except Exception as e: + print(f"Exception in process_frame: {e}") + return {} + + # Create the detection stream using pipe and map operator + detection_stream = video_stream.pipe(ops.map(process_frame)) + + # Collect results from the stream + results = [] + + frames_processed = 0 + target_frames = 10 + + def on_next(result): + nonlocal frames_processed + if not result: + return + + results.append(result) + frames_processed += 1 + + # Stop after processing target number of frames + if frames_processed >= target_frames: + subscription.dispose() + + def on_error(error): + pytest.fail(f"Error in detection stream: {error}") + + def on_completed(): + pass + + # Subscribe and wait for results + subscription = detection_stream.subscribe( + on_next=on_next, on_error=on_error, on_completed=on_completed + ) + + timeout = 10.0 + start_time = time.time() + while frames_processed < target_frames and time.time() - start_time < timeout: + time.sleep(0.5) + + # Clean up subscription + subscription.dispose() + video_provider.dispose_all() + # Check that we got detection results + if len(results) == 0: + pytest.skip("Skipping test due to error: Failed to get any detection results") + + # Verify we have detection results with expected properties + assert len(results) > 0, "No detection results were received" + + # Print statistics about detections + total_detections = sum(len(r["bboxes"]) for r in results if r.get("bboxes")) + avg_detections = total_detections / len(results) if results else 0 + print(f"Total detections: {total_detections}, Average per frame: {avg_detections:.2f}") + + # Print most common detected objects + object_counts = {} + for r in results: + if r.get("names"): + for name in r["names"]: + if name: + object_counts[name] = object_counts.get(name, 0) + 1 + + if object_counts: + print("Detected objects:") + for obj, count in sorted(object_counts.items(), key=lambda x: x[1], reverse=True)[ + :5 + ]: + print(f" - {obj}: {count} times") + + # Analyze the first result + result = results[0] + + # Check that we have a frame + assert "frame" in result, "Result doesn't contain a frame" + assert isinstance(result["frame"], np.ndarray), "Frame is not a numpy array" + + # Check that detection results are valid + assert isinstance(result["bboxes"], list) + assert isinstance(result["track_ids"], list) + assert isinstance(result["class_ids"], list) + assert isinstance(result["confidences"], list) + assert isinstance(result["names"], list) + + # All result lists should be the same length + assert ( + len(result["bboxes"]) + == len(result["track_ids"]) + == len(result["class_ids"]) + == len(result["confidences"]) + == len(result["names"]) + ) + + # If we have detections, check that bbox format is valid + if result["bboxes"]: + assert len(result["bboxes"][0]) == 4, ( + "Bounding boxes should be in [x1, y1, x2, y2] format" + ) + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/dimos/perception/detection2d/yolo_2d_det.py b/dimos/perception/detection2d/yolo_2d_det.py index f058a97147..34e094b425 100644 --- a/dimos/perception/detection2d/yolo_2d_det.py +++ b/dimos/perception/detection2d/yolo_2d_det.py @@ -20,22 +20,37 @@ filter_detections, ) import os +import onnxruntime +from dimos.utils.logging_config import setup_logger +from dimos.utils.path_utils import get_project_root +from dimos.utils.testing import testData +from dimos.utils.gpu_utils import is_cuda_available + +logger = setup_logger("dimos.perception.detection2d.yolo_2d_det") class Yolo2DDetector: - def __init__(self, model_path="models/yolo11n.engine", device="cuda"): + def __init__(self, model_path="models_yolo", model_name="yolo11n.onnx", device="cpu"): """ Initialize the YOLO detector. Args: - model_path (str): Path to the YOLO model weights + model_path (str): Path to the YOLO model weights in tests/data LFS directory + model_name (str): Name of the YOLO model weights file device (str): Device to run inference on ('cuda' or 'cpu') """ self.device = device - self.model = YOLO(model_path) + self.model = YOLO(testData(model_path) / model_name) module_dir = os.path.dirname(__file__) self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml") + if is_cuda_available(): + onnxruntime.preload_dlls(cuda=True, cudnn=True) + self.device = "cuda" + logger.info("Using CUDA for YOLO 2d detector") + else: + self.device = "cpu" + logger.info("Using CPU for YOLO 2d detector") def process_image(self, image): """ diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py index 2640408414..568d09e1e4 100644 --- a/dimos/perception/segmentation/sam_2d_seg.py +++ b/dimos/perception/segmentation/sam_2d_seg.py @@ -21,26 +21,41 @@ plot_results, crop_images_from_bboxes, ) +from dimos.utils.gpu_utils import is_cuda_available from dimos.perception.common.detection2d_tracker import target2dTracker, get_tracked_results from dimos.perception.segmentation.image_analyzer import ImageAnalyzer import os from collections import deque from concurrent.futures import ThreadPoolExecutor +from dimos.utils.logging_config import setup_logger +from dimos.utils.path_utils import get_project_root +import onnxruntime +from dimos.utils.testing import testData + +logger = setup_logger("dimos.perception.segmentation.sam_2d_seg") class Sam2DSegmenter: def __init__( self, - model_path="FastSAM-s.pt", - device="cuda", + model_path="models_fastsam", + model_name="FastSAM-s.onnx", + device="cpu", min_analysis_interval=5.0, use_tracker=True, use_analyzer=True, use_rich_labeling=False, ): - # Core components self.device = device - self.model = FastSAM(model_path) + if is_cuda_available(): + logger.info("Using CUDA for SAM 2d segmenter") + onnxruntime.preload_dlls(cuda=True, cudnn=True) + self.device = "cuda" + else: + logger.info("Using CPU for SAM 2d segmenter") + self.device = "cpu" + # Core components + self.model = FastSAM(testData(model_path) / model_name) self.use_tracker = use_tracker self.use_analyzer = use_analyzer self.use_rich_labeling = use_rich_labeling diff --git a/dimos/perception/segmentation/test_sam_2d_seg.py b/dimos/perception/segmentation/test_sam_2d_seg.py new file mode 100644 index 0000000000..fc7e488e51 --- /dev/null +++ b/dimos/perception/segmentation/test_sam_2d_seg.py @@ -0,0 +1,211 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +from dimos.stream import video_provider +import pytest +import cv2 +import numpy as np +import reactivex as rx +from reactivex import operators as ops +from dimos.stream.video_provider import VideoProvider +from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter +from dimos.perception.segmentation.utils import extract_masks_bboxes_probs_names + + +class TestSam2DSegmenter: + def test_sam_segmenter_initialization(self): + """Test FastSAM segmenter initializes correctly with default model path.""" + try: + # Try to initialize with the default model path and existing device setting + segmenter = Sam2DSegmenter(use_analyzer=False) + assert segmenter is not None + assert segmenter.model is not None + except Exception as e: + # If the model file doesn't exist, the test should still pass with a warning + pytest.skip(f"Skipping test due to model initialization error: {e}") + + def test_sam_segmenter_process_image(self): + """Test FastSAM segmenter can process video frames and return segmentation masks.""" + # Import testData inside method to avoid pytest fixture confusion + from dimos.utils.testing import testData + + # Get test video path directly + video_path = testData("assets") / "trimmed_video_office.mov" + try: + # Initialize segmenter without analyzer for faster testing + segmenter = Sam2DSegmenter(use_analyzer=False) + + # Note: conf and iou are parameters for process_image, not constructor + # We'll monkey patch the process_image method to use lower thresholds + original_process_image = segmenter.process_image + + def patched_process_image(image): + results = segmenter.model.track( + source=image, + device=segmenter.device, + retina_masks=True, + conf=0.1, # Lower confidence threshold for testing + iou=0.5, # Lower IoU threshold + persist=True, + verbose=False, + tracker=segmenter.tracker_config + if hasattr(segmenter, "tracker_config") + else None, + ) + + if len(results) > 0: + masks, bboxes, track_ids, probs, names, areas = ( + extract_masks_bboxes_probs_names(results[0]) + ) + return masks, bboxes, track_ids, probs, names + return [], [], [], [], [] + + # Replace the method + segmenter.process_image = patched_process_image + + # Create video provider and directly get a video stream observable + assert os.path.exists(video_path), f"Test video not found: {video_path}" + video_provider = VideoProvider(dev_name="test_video", video_source=video_path) + + video_stream = video_provider.capture_video_as_observable(realtime=False, fps=1) + + # Use ReactiveX operators to process the stream + def process_frame(frame): + try: + # Process frame with FastSAM + masks, bboxes, track_ids, probs, names = segmenter.process_image(frame) + print( + f"SAM results - masks: {len(masks)}, bboxes: {len(bboxes)}, track_ids: {len(track_ids)}, names: {len(names)}" + ) + + return { + "frame": frame, + "masks": masks, + "bboxes": bboxes, + "track_ids": track_ids, + "probs": probs, + "names": names, + } + except Exception as e: + print(f"Error in process_frame: {e}") + return {} + + # Create the segmentation stream using pipe and map operator + segmentation_stream = video_stream.pipe(ops.map(process_frame)) + + # Collect results from the stream + results = [] + frames_processed = 0 + target_frames = 5 + + def on_next(result): + nonlocal frames_processed, results + if not result: + return + + results.append(result) + frames_processed += 1 + + # Stop processing after target frames + if frames_processed >= target_frames: + subscription.dispose() + + def on_error(error): + pytest.fail(f"Error in segmentation stream: {error}") + + def on_completed(): + pass + + # Subscribe and wait for results + subscription = segmentation_stream.subscribe( + on_next=on_next, on_error=on_error, on_completed=on_completed + ) + + # Wait for frames to be processed + timeout = 30.0 # seconds + start_time = time.time() + while frames_processed < target_frames and time.time() - start_time < timeout: + time.sleep(0.5) + + # Clean up subscription + subscription.dispose() + video_provider.dispose_all() + + # Check if we have results + if len(results) == 0: + pytest.skip( + "No segmentation results found, but test connection established correctly" + ) + return + + print(f"Processed {len(results)} frames with segmentation results") + + # Analyze the first result + result = results[0] + + # Check that we have a frame + assert "frame" in result, "Result doesn't contain a frame" + assert isinstance(result["frame"], np.ndarray), "Frame is not a numpy array" + + # Check that segmentation results are valid + assert isinstance(result["masks"], list) + assert isinstance(result["bboxes"], list) + assert isinstance(result["track_ids"], list) + assert isinstance(result["probs"], list) + assert isinstance(result["names"], list) + + # All result lists should be the same length + assert ( + len(result["masks"]) + == len(result["bboxes"]) + == len(result["track_ids"]) + == len(result["probs"]) + == len(result["names"]) + ) + + # If we have masks, check that they have valid shape + if result.get("masks") and len(result["masks"]) > 0: + assert result["masks"][0].shape == ( + result["frame"].shape[0], + result["frame"].shape[1], + ), "Mask shape should match image dimensions" + print(f"Found {len(result['masks'])} masks in first frame") + else: + print("No masks found in first frame, but test connection established correctly") + + # Test visualization function + if result["masks"]: + vis_frame = segmenter.visualize_results( + result["frame"], + result["masks"], + result["bboxes"], + result["track_ids"], + result["probs"], + result["names"], + ) + assert isinstance(vis_frame, np.ndarray), "Visualization output should be an image" + assert vis_frame.shape == result["frame"].shape, ( + "Visualization should have same dimensions as input frame" + ) + + # We've already tested visualization above, so no need for a duplicate test + + except Exception as e: + pytest.skip(f"Skipping test due to error: {e}") + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/dimos/perception/semantic_seg.py b/dimos/perception/semantic_seg.py index 3ef2eb7399..a07e69c279 100644 --- a/dimos/perception/semantic_seg.py +++ b/dimos/perception/semantic_seg.py @@ -25,7 +25,6 @@ class SemanticSegmentationStream: def __init__( self, - model_path: str = "FastSAM-s.pt", device: str = "cuda", enable_mono_depth: bool = True, enable_rich_labeling: bool = True, @@ -36,7 +35,6 @@ def __init__( Initialize a semantic segmentation stream using Sam2DSegmenter. Args: - model_path: Path to the FastSAM model file device: Computation device ("cuda" or "cpu") enable_mono_depth: Whether to enable monocular depth processing enable_rich_labeling: Whether to enable rich labeling @@ -45,7 +43,6 @@ def __init__( - Physical parameters: resolution, focal_length, sensor_size """ self.segmenter = Sam2DSegmenter( - model_path=model_path, device=device, min_analysis_interval=5.0, use_tracker=True, diff --git a/dimos/perception/test_spatial_memory.py b/dimos/perception/test_spatial_memory.py new file mode 100644 index 0000000000..a6f4fcfa69 --- /dev/null +++ b/dimos/perception/test_spatial_memory.py @@ -0,0 +1,213 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import tempfile +import pytest +import numpy as np +import cv2 +import shutil +import reactivex as rx +from reactivex import operators as ops +from reactivex.subject import Subject +from reactivex import Observable + +from dimos.perception.spatial_perception import SpatialMemory +from dimos.types.position import Position +from dimos.stream.video_provider import VideoProvider +from dimos.types.position import Position +from dimos.types.vector import Vector + + +class TestSpatialMemory: + @pytest.fixture(scope="function") + def temp_dir(self): + # Create a temporary directory for storing spatial memory data + temp_dir = tempfile.mkdtemp() + yield temp_dir + # Clean up + shutil.rmtree(temp_dir) + + def test_spatial_memory_initialization(self): + """Test SpatialMemory initializes correctly with CLIP model.""" + try: + # Initialize spatial memory with default CLIP model + memory = SpatialMemory( + collection_name="test_collection", embedding_model="clip", new_memory=True + ) + assert memory is not None + assert memory.embedding_model == "clip" + assert memory.embedding_provider is not None + except Exception as e: + # If the model doesn't initialize, skip the test + pytest.fail(f"Failed to initialize model: {e}") + + def test_image_embedding(self): + """Test generating image embeddings using CLIP.""" + try: + # Initialize spatial memory with CLIP model + memory = SpatialMemory( + collection_name="test_collection", embedding_model="clip", new_memory=True + ) + + # Create a test image - use a simple colored square + test_image = np.zeros((224, 224, 3), dtype=np.uint8) + test_image[50:150, 50:150] = [0, 0, 255] # Blue square + + # Generate embedding + embedding = memory.embedding_provider.get_embedding(test_image) + + # Check embedding shape and characteristics + assert embedding is not None + assert isinstance(embedding, np.ndarray) + assert embedding.shape[0] == memory.embedding_dimensions + + # Check that embedding is normalized (unit vector) + assert np.isclose(np.linalg.norm(embedding), 1.0, atol=1e-5) + + # Test text embedding + text_embedding = memory.embedding_provider.get_text_embedding("a blue square") + assert text_embedding is not None + assert isinstance(text_embedding, np.ndarray) + assert text_embedding.shape[0] == memory.embedding_dimensions + assert np.isclose(np.linalg.norm(text_embedding), 1.0, atol=1e-5) + except Exception as e: + pytest.fail(f"Error in test: {e}") + + def test_spatial_memory_processing(self, temp_dir): + """Test processing video frames and building spatial memory with CLIP embeddings.""" + try: + # Initialize spatial memory with temporary storage + memory = SpatialMemory( + collection_name="test_collection", + embedding_model="clip", + new_memory=True, + db_path=os.path.join(temp_dir, "chroma_db"), + visual_memory_path=os.path.join(temp_dir, "visual_memory.pkl"), + output_dir=os.path.join(temp_dir, "images"), + min_distance_threshold=0.01, + min_time_threshold=0.01, + ) + + from dimos.utils.testing import testData + + video_path = testData("assets") / "trimmed_video_office.mov" + assert os.path.exists(video_path), f"Test video not found: {video_path}" + video_provider = VideoProvider(dev_name="test_video", video_source=video_path) + video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15) + + # Create a frame counter for position generation + frame_counter = 0 + + # Process each video frame directly + def process_frame(frame): + nonlocal frame_counter + + # Generate a unique position for this frame to ensure minimum distance threshold is met + pos = Position(frame_counter * 0.5, frame_counter * 0.5, 0) + transform = {"position": pos, "timestamp": time.time()} + frame_counter += 1 + + # Create a dictionary with frame, position and rotation for SpatialMemory.process_stream + return { + "frame": frame, + "position": transform["position"], + "rotation": transform["position"], # Using position as rotation for testing + } + + # Create a stream that processes each frame + formatted_stream = video_stream.pipe(ops.map(process_frame)) + + # Process the stream using SpatialMemory's built-in processing + print("Creating spatial memory stream...") + spatial_stream = memory.process_stream(formatted_stream) + + # Stream is now created above using memory.process_stream() + + # Collect results from the stream + results = [] + + frames_processed = 0 + target_frames = 100 # Process more frames for thorough testing + + def on_next(result): + nonlocal results, frames_processed + if not result: # Skip None results + return + + results.append(result) + frames_processed += 1 + + # Stop processing after target frames + if frames_processed >= target_frames: + subscription.dispose() + + def on_error(error): + pytest.fail(f"Error in spatial stream: {error}") + + def on_completed(): + pass + + # Subscribe and wait for results + subscription = spatial_stream.subscribe( + on_next=on_next, on_error=on_error, on_completed=on_completed + ) + + # Wait for frames to be processed + timeout = 30.0 # seconds + start_time = time.time() + while frames_processed < target_frames and time.time() - start_time < timeout: + time.sleep(0.5) + + subscription.dispose() + + assert len(results) > 0, "Failed to process any frames with spatial memory" + + relevant_queries = ["office", "room with furniture"] + irrelevant_query = "star wars" + + for query in relevant_queries: + results = memory.query_by_text(query, limit=2) + print(f"\nResults for query: '{query}'") + + assert len(results) > 0, f"No results found for relevant query: {query}" + + similarities = [1 - r.get("distance") for r in results] + print(f"Similarities: {similarities}") + + assert any(d > 0.24 for d in similarities), ( + f"Expected at least one result with similarity > 0.24 for query '{query}'" + ) + + results = memory.query_by_text(irrelevant_query, limit=2) + print(f"\nResults for query: '{irrelevant_query}'") + + if results: + similarities = [1 - r.get("distance") for r in results] + print(f"Similarities: {similarities}") + + assert all(d < 0.25 for d in similarities), ( + f"Expected all results to have similarity < 0.25 for irrelevant query '{irrelevant_query}'" + ) + + except Exception as e: + pytest.fail(f"Error in test: {e}") + finally: + memory.cleanup() + video_provider.dispose_all() + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/dimos/utils/gpu_utils.py b/dimos/utils/gpu_utils.py new file mode 100644 index 0000000000..e40516deec --- /dev/null +++ b/dimos/utils/gpu_utils.py @@ -0,0 +1,24 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def is_cuda_available(): + try: + import pycuda.driver as cuda + import pycuda.autoinit # implicitly initializes the CUDA driver + + cuda.init() + return cuda.Device.count() > 0 + except Exception: + return False diff --git a/dimos/utils/path_utils.py b/dimos/utils/path_utils.py new file mode 100644 index 0000000000..d60014d068 --- /dev/null +++ b/dimos/utils/path_utils.py @@ -0,0 +1,22 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + + +def get_project_root() -> Path: + """ + Returns the absolute path to the project root directory. + """ + return Path(__file__).resolve().parent.parent.parent diff --git a/pyproject.toml b/pyproject.toml index 2f81fd62b1..7a9a0a23dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,7 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + [project] name = "dimos" authors = [ @@ -6,6 +10,15 @@ authors = [ version = "0.0.2" description = "Powering agentive generalist robotics" +[project.optional-dependencies] +cuda = [ + "pycuda", + "onnxruntime-gpu[cuda,cudnn]" +] + +[tool.setuptools] +packages = ["dimos"] + [tool.ruff] line-length = 100 exclude = [ diff --git a/requirements.txt b/requirements.txt index 7f71ec17cd..10dc835c57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,4 +95,5 @@ git+https://github.com/facebookresearch/detectron2.git@v0.6 # Mapping open3d -# Touch for rebuild +# Inference (CPU) +onnxruntime \ No newline at end of file diff --git a/tests/data/.lfs/assets.tar.gz b/tests/data/.lfs/assets.tar.gz new file mode 100644 index 0000000000..b7a2fcbd1c --- /dev/null +++ b/tests/data/.lfs/assets.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b14b01f5c907f117331213abfce9ef5d0c41d0524e14327b5cc706520fb2035 +size 2306191 diff --git a/tests/data/.lfs/models_clip.tar.gz b/tests/data/.lfs/models_clip.tar.gz new file mode 100644 index 0000000000..a4ab2b5f88 --- /dev/null +++ b/tests/data/.lfs/models_clip.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:102f11bb0aa952b3cebc4491c5ed3f2122e8c38c76002e22400da4f1e5ca90c5 +size 392327708 diff --git a/tests/data/.lfs/models_fastsam.tar.gz b/tests/data/.lfs/models_fastsam.tar.gz new file mode 100644 index 0000000000..77278f4323 --- /dev/null +++ b/tests/data/.lfs/models_fastsam.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682cb3816451bd73722cc430fdfce15bbe72a07e50ef2ea81ddaed61d1f22a25 +size 39971209 diff --git a/tests/data/.lfs/models_yolo.tar.gz b/tests/data/.lfs/models_yolo.tar.gz new file mode 100644 index 0000000000..aca0915dfd --- /dev/null +++ b/tests/data/.lfs/models_yolo.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed4a5160d4edfda145b6752b5c49ad22bc2887b66b9b9c38bd8c35fb5ffaf8f +size 9315806