Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b0efd99
Unit tests for CLIP, YOLO, SAM2
spomichter Jun 11, 2025
912d00f
ONNX conversions for YOLOv11 and FastSAM
mdaiter Jun 18, 2025
32edfe7
Adding CUDA to requirements.txt and explicit check for non-Torch-base…
mdaiter Jun 18, 2025
4a4bb51
Move ONNX model files to dimos/models/onnx directory
spomichter Jun 20, 2025
02aee75
Cleaned up YOLO and SAM models, changed paths to use ONNX and added l…
spomichter Jun 20, 2025
9733495
Started building pyproject dependency management, added pip install .…
spomichter Jun 20, 2025
e823c02
Add GIF, MP4, and MOV files to Git LFS with binary designation
spomichter Jun 20, 2025
b94ee3e
LFS fix
spomichter Jun 20, 2025
0c10ab4
LFS fix
spomichter Jun 20, 2025
f4e130f
Merge pull request #350 - ONNX Conversions for YOLOv11 and FastSAM
spomichter Jun 20, 2025
c76055d
Add CLIP ONNX conversion and support, with passing vision and text tests
mdaiter Jun 20, 2025
7f96932
Clip ONNX var cleanup
spomichter Jun 26, 2025
b1112bb
Merge branch 'tests_clip_yolo_sam' into tests_clip_yolo_sam_fix_clip_…
spomichter Jun 26, 2025
20db26f
Merge pull request #353 from dimensionalOS/tests_clip_yolo_sam_fix_cl…
spomichter Jun 26, 2025
61425b0
Merged requirements.txt
spomichter Jun 26, 2025
2525502
Merge branch 'dev' into tests_clip_yolo_sam
spomichter Jun 26, 2025
8e9c851
CI code cleanup
spomichter Jun 26, 2025
763fe5b
Added get proj root path utility
spomichter Jun 26, 2025
5141a5b
Merge branch 'tests_clip_yolo_sam' of github.com:dimensionalOS/dimos …
spomichter Jun 26, 2025
92e291d
CI code cleanup
spomichter Jun 26, 2025
e3369f5
Added CLIP to LFS /models/onnx
spomichter Jun 27, 2025
f6bb8ad
Updated CLIP to LFS correct path
spomichter Jun 27, 2025
abfdfe0
YOLO and FastSAM added to LFS
spomichter Jun 27, 2025
2951893
Unit test for continuous spatial memory processing
spomichter Jun 27, 2025
def4182
Deleted deprecated model files
spomichter Jun 27, 2025
28cf5ec
Changed model paths to use LFS
spomichter Jun 27, 2025
56326b1
Added office video to tests/data
spomichter Jun 27, 2025
cb5afab
Fixed unit tests to use LFS video stream
spomichter Jun 27, 2025
c7a4e89
Added GPU utils
spomichter Jun 27, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
# Handle line endings automatically for files Git considers text,
# converting them to LF on checkout.
* text=auto eol=lf

# Ensure Python files always use LF for line endings.
*.py text eol=lf

# Treat designated file types as binary and do not alter their contents or line endings.
*.png binary
*.jpg binary
*.gif binary
*.ico binary
*.pdf binary
*.mp4 binary

# Explicit LFS tracking for test files
tests/data/.lfs/*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text binary
*.mp4 filter=lfs diff=lfs merge=lfs -text binary
*.mov filter=lfs diff=lfs merge=lfs -text binary
*.gif filter=lfs diff=lfs merge=lfs -text binary
Binary file modified assets/dimos_interface.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/framecount.mp4
Binary file not shown.
Binary file modified assets/simple_demo.mp4
Binary file not shown.
Binary file modified assets/simple_demo_small.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/trimmed_video_office.mov
Binary file not shown.
85 changes: 68 additions & 17 deletions dimos/agents/memory/image_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import cv2
import base64
from dimos.utils.logging_config import setup_logger
from dimos.utils.testing import testData

logger = setup_logger("dimos.agents.memory.image_embedding")

Expand Down Expand Up @@ -60,12 +61,14 @@ def _initialize_model(self):
"""Initialize the specified embedding model."""
try:
import torch
from transformers import CLIPProcessor, CLIPModel, AutoFeatureExtractor, AutoModel
from transformers import CLIPProcessor, AutoFeatureExtractor, AutoModel
import onnxruntime as ort

if self.model_name == "clip":
model_id = "openai/clip-vit-base-patch32"
self.model = CLIPModel.from_pretrained(model_id)
self.processor = CLIPProcessor.from_pretrained(model_id)
model_id = testData("models_clip") / "model.onnx"
processor_id = "openai/clip-vit-base-patch32"
self.model = ort.InferenceSession(model_id)
self.processor = CLIPProcessor.from_pretrained(processor_id)
logger.info(f"Loaded CLIP model: {model_id}")
elif self.model_name == "resnet":
model_id = "microsoft/resnet-50"
Expand Down Expand Up @@ -103,13 +106,35 @@ def get_embedding(self, image: Union[np.ndarray, str, bytes]) -> np.ndarray:
import torch

if self.model_name == "clip":
inputs = self.processor(images=pil_image, return_tensors="pt")
inputs = self.processor(images=pil_image, return_tensors="np")

with torch.no_grad():
image_features = self.model.get_image_features(**inputs)

image_embedding = image_features / image_features.norm(dim=1, keepdim=True)
embedding = image_embedding.numpy()[0]
ort_inputs = {
inp.name: inputs[inp.name]
for inp in self.model.get_inputs()
if inp.name in inputs
}

# If required, add dummy text inputs
input_names = [i.name for i in self.model.get_inputs()]
batch_size = inputs["pixel_values"].shape[0]
if "input_ids" in input_names:
ort_inputs["input_ids"] = np.zeros((batch_size, 1), dtype=np.int64)
if "attention_mask" in input_names:
ort_inputs["attention_mask"] = np.ones((batch_size, 1), dtype=np.int64)

# Run inference
ort_outputs = self.model.run(None, ort_inputs)

# Look up correct output name
output_names = [o.name for o in self.model.get_outputs()]
if "image_embeds" in output_names:
image_embedding = ort_outputs[output_names.index("image_embeds")]
else:
raise RuntimeError(f"No 'image_embeds' found in outputs: {output_names}")

embedding = image_embedding / np.linalg.norm(image_embedding, axis=1, keepdims=True)
embedding = embedding[0]

elif self.model_name == "resnet":
inputs = self.processor(images=pil_image, return_tensors="pt")
Expand Down Expand Up @@ -156,19 +181,45 @@ def get_text_embedding(self, text: str) -> np.ndarray:
try:
import torch

inputs = self.processor(text=[text], return_tensors="pt", padding=True)
inputs = self.processor(text=[text], return_tensors="np", padding=True)

with torch.no_grad():
text_features = self.model.get_text_features(**inputs)

# Normalize the features
text_embedding = text_features / text_features.norm(dim=1, keepdim=True)
embedding = text_embedding.numpy()[0]
# Prepare ONNX input dict (handle only what's needed)
ort_inputs = {
inp.name: inputs[inp.name]
for inp in self.model.get_inputs()
if inp.name in inputs
}
# Determine which inputs are expected by the ONNX model
input_names = [i.name for i in self.model.get_inputs()]
batch_size = inputs["input_ids"].shape[0] # pulled from text input

# If the model expects pixel_values (i.e., fused model), add dummy vision input
if "pixel_values" in input_names:
ort_inputs["pixel_values"] = np.zeros(
(batch_size, 3, 224, 224), dtype=np.float32
)

# Run inference
ort_outputs = self.model.run(None, ort_inputs)

# Determine correct output (usually 'last_hidden_state' or 'text_embeds')
output_names = [o.name for o in self.model.get_outputs()]
if "text_embeds" in output_names:
text_embedding = ort_outputs[output_names.index("text_embeds")]
else:
text_embedding = ort_outputs[0] # fallback to first output

# Normalize
text_embedding = text_embedding / np.linalg.norm(
text_embedding, axis=1, keepdims=True
)
text_embedding = text_embedding[0] # shape: (512,)

logger.debug(
f"Generated text embedding with shape {embedding.shape} for text: '{text}'"
f"Generated text embedding with shape {text_embedding.shape} for text: '{text}'"
)
return embedding
return text_embedding

except Exception as e:
logger.error(f"Error generating text embedding: {e}")
Expand Down
209 changes: 209 additions & 0 deletions dimos/agents/memory/test_image_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Test module for the CLIP image embedding functionality in dimos.
"""

import os
import time
import numpy as np
import pytest
import reactivex as rx
from reactivex import operators as ops
from dimos.stream.video_provider import VideoProvider
from dimos.agents.memory.image_embedding import ImageEmbeddingProvider


class TestImageEmbedding:
Copy link
Contributor

@leshy leshy Jun 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all of this works as a non-class, not sure why we are embedding tests in class methods? not very important, might care about it for dimos-wide uniformity

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will fix

"""Test class for CLIP image embedding functionality."""

def test_clip_embedding_initialization(self):
    """Test CLIP embedding provider initializes correctly.

    Skips (rather than fails) only when the ONNX model itself cannot be
    loaded — e.g. the LFS artifact is absent on this machine.
    """
    try:
        # Only the model load may skip the test. Keeping the assertions
        # OUTSIDE this try is deliberate: the original wrapped them too,
        # so an AssertionError was caught and silently turned into a skip.
        embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)
    except Exception as e:
        pytest.skip(f"Skipping test due to model initialization error: {e}")

    # Real assertions — these must surface as failures, never skips.
    assert embedding_provider.model is not None, "CLIP model failed to initialize"
    assert embedding_provider.processor is not None, "CLIP processor failed to initialize"
    assert embedding_provider.model_name == "clip", "Model name should be 'clip'"
    assert embedding_provider.dimensions == 512, "Embedding dimensions should be 512"

def test_clip_embedding_process_video(self):
    """Test CLIP embedding provider can process video frames and return embeddings."""
    try:
        from dimos.utils.testing import testData

        # Test video is an LFS-tracked asset (see .gitattributes *.mov rule).
        video_path = testData("assets") / "trimmed_video_office.mov"

        embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)

        assert os.path.exists(video_path), f"Test video not found: {video_path}"
        video_provider = VideoProvider(dev_name="test_video", video_source=video_path)

        # realtime=False: decode as fast as possible rather than pacing to fps.
        video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15)

        # Use ReactiveX operators to process the stream
        def process_frame(frame):
            # Map one raw frame to {"frame", "embedding"}. Returns None on
            # error so a single bad frame does not terminate the stream.
            try:
                # Process frame with CLIP
                embedding = embedding_provider.get_embedding(frame)
                print(
                    f"Generated CLIP embedding with shape: {embedding.shape}, norm: {np.linalg.norm(embedding):.4f}"
                )

                return {"frame": frame, "embedding": embedding}
            except Exception as e:
                print(f"Error in process_frame: {e}")
                return None

        embedding_stream = video_stream.pipe(ops.map(process_frame))

        results = []
        frames_processed = 0
        target_frames = 10  # stop after this many successfully embedded frames

        def on_next(result):
            nonlocal frames_processed, results
            if not result:  # Skip None results
                return

            results.append(result)
            frames_processed += 1

            # Stop processing after target frames
            # `subscription` is bound by the subscribe() call below before
            # any emission can be delivered here.
            if frames_processed >= target_frames:
                subscription.dispose()

        def on_error(error):
            pytest.fail(f"Error in embedding stream: {error}")

        def on_completed():
            pass

        # Subscribe and wait for results
        subscription = embedding_stream.subscribe(
            on_next=on_next, on_error=on_error, on_completed=on_completed
        )

        # Poll until enough frames are embedded or the timeout elapses.
        timeout = 60.0
        start_time = time.time()
        while frames_processed < target_frames and time.time() - start_time < timeout:
            time.sleep(0.5)
            print(f"Processed {frames_processed}/{target_frames} frames")

        # Clean up subscription (second dispose is a no-op if on_next
        # already disposed it).
        subscription.dispose()
        video_provider.dispose_all()

        # Check if we have results
        if len(results) == 0:
            pytest.skip("No embeddings generated, but test connection established correctly")
            return  # NOTE(review): unreachable — pytest.skip raises

        print(f"Processed {len(results)} frames with CLIP embeddings")

        # Analyze the results
        assert len(results) > 0, "No embeddings generated"

        # Check properties of first embedding
        first_result = results[0]
        assert "embedding" in first_result, "Result doesn't contain embedding"
        assert "frame" in first_result, "Result doesn't contain frame"

        # Check embedding shape and normalization
        embedding = first_result["embedding"]
        assert isinstance(embedding, np.ndarray), "Embedding is not a numpy array"
        assert embedding.shape == (512,), (
            f"Embedding has wrong shape: {embedding.shape}, expected (512,)"
        )
        assert abs(np.linalg.norm(embedding) - 1.0) < 1e-5, "Embedding is not normalized"

        # Save the first embedding for similarity tests
        # NOTE(review): this couples test ordering — test_clip_embedding_similarity
        # skips itself when this class attribute is absent.
        if len(results) > 1 and "embedding" in results[0]:
            # Create a class variable to store embeddings for the similarity test
            TestImageEmbedding.test_embeddings = {
                "embedding1": results[0]["embedding"],
                "embedding2": results[1]["embedding"] if len(results) > 1 else None,
            }
            print(f"Saved embeddings for similarity testing")

        print("CLIP embedding test passed successfully!")

    except Exception as e:
        # NOTE(review): this also catches AssertionError, re-raising it via
        # pytest.fail and losing the original assertion traceback.
        pytest.fail(f"Test failed with error: {e}")

def test_clip_embedding_similarity(self):
    """Test CLIP embedding similarity search and text-to-image queries."""
    try:
        # Skip if previous test didn't generate embeddings
        # NOTE(review): depends on test_clip_embedding_process_video having
        # run first in the same session and stashed `test_embeddings`.
        if not hasattr(TestImageEmbedding, "test_embeddings"):
            pytest.skip("No embeddings available from previous test")
            return  # NOTE(review): unreachable — pytest.skip raises

        # Get embeddings from previous test
        embedding1 = TestImageEmbedding.test_embeddings["embedding1"]
        embedding2 = TestImageEmbedding.test_embeddings["embedding2"]

        # Initialize embedding provider for text embeddings
        embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)

        # Test frame-to-frame similarity
        if embedding1 is not None and embedding2 is not None:
            # Compute cosine similarity
            # (plain dot product is valid here because the previous test
            # asserted both embeddings are unit-normalized)
            similarity = np.dot(embedding1, embedding2)
            print(f"Similarity between first two frames: {similarity:.4f}")

            # Should be in range [-1, 1]
            assert -1.0 <= similarity <= 1.0, f"Similarity out of valid range: {similarity}"

        # Test text-to-image similarity
        if embedding1 is not None:
            # Generate a list of text queries to test
            text_queries = ["a video frame", "a person", "an outdoor scene", "a kitchen"]

            # Test each text query
            for text_query in text_queries:
                # Get text embedding
                text_embedding = embedding_provider.get_text_embedding(text_query)

                # Check text embedding properties
                assert isinstance(text_embedding, np.ndarray), (
                    "Text embedding is not a numpy array"
                )
                assert text_embedding.shape == (512,), (
                    f"Text embedding has wrong shape: {text_embedding.shape}"
                )
                assert abs(np.linalg.norm(text_embedding) - 1.0) < 1e-5, (
                    "Text embedding is not normalized"
                )

                # Compute similarity between frame and text
                text_similarity = np.dot(embedding1, text_embedding)
                print(f"Similarity between frame and '{text_query}': {text_similarity:.4f}")

                # Should be in range [-1, 1]
                assert -1.0 <= text_similarity <= 1.0, (
                    f"Text-image similarity out of range: {text_similarity}"
                )

        print("CLIP embedding similarity tests passed successfully!")

    except Exception as e:
        # NOTE(review): also converts AssertionError into a generic fail message.
        pytest.fail(f"Similarity test failed with error: {e}")


if __name__ == "__main__":
    # Allow running this test module directly, outside a pytest invocation.
    pytest_args = ["-v", "--disable-warnings", __file__]
    pytest.main(pytest_args)
Loading