Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b0efd99
Unit tests for CLIP, YOLO, SAM2
spomichter Jun 11, 2025
912d00f
ONNX conversions for YOLOv11 and FastSAM
mdaiter Jun 18, 2025
32edfe7
Adding CUDA to requirements.txt and explicit check for non-Torch-base…
mdaiter Jun 18, 2025
4a4bb51
Move ONNX model files to dimos/models/onnx directory
spomichter Jun 20, 2025
02aee75
Cleaned up YOLO and SAM models, changed paths to use ONNX and added l…
spomichter Jun 20, 2025
9733495
Started building pyproject dependency management, added pip install .…
spomichter Jun 20, 2025
e823c02
Add GIF, MP4, and MOV files to Git LFS with binary designation
spomichter Jun 20, 2025
b94ee3e
LFS fix
spomichter Jun 20, 2025
0c10ab4
LFS fix
spomichter Jun 20, 2025
f4e130f
Merge pull request #350 - ONNX Conversions for YOLOv11 and FastSAM
spomichter Jun 20, 2025
c76055d
Add CLIP ONNX conversion and support, with passing vision and text tests
mdaiter Jun 20, 2025
7f96932
Clip ONNX var cleanup
spomichter Jun 26, 2025
b1112bb
Merge branch 'tests_clip_yolo_sam' into tests_clip_yolo_sam_fix_clip_…
spomichter Jun 26, 2025
20db26f
Merge pull request #353 from dimensionalOS/tests_clip_yolo_sam_fix_cl…
spomichter Jun 26, 2025
61425b0
Merged requirements.txt
spomichter Jun 26, 2025
2525502
Merge branch 'dev' into tests_clip_yolo_sam
spomichter Jun 26, 2025
8e9c851
CI code cleanup
spomichter Jun 26, 2025
763fe5b
Added get proj root path utility
spomichter Jun 26, 2025
5141a5b
Merge branch 'tests_clip_yolo_sam' of github.com:dimensionalOS/dimos …
spomichter Jun 26, 2025
92e291d
CI code cleanup
spomichter Jun 26, 2025
e3369f5
Added CLIP to LFS /models/onnx
spomichter Jun 27, 2025
f6bb8ad
Updated CLIP to LFS correct path
spomichter Jun 27, 2025
abfdfe0
YOLO and FastSAM added to LFS
spomichter Jun 27, 2025
2951893
Unit test for continuous spatial memory processing
spomichter Jun 27, 2025
def4182
Deleted deprecated model files
spomichter Jun 27, 2025
28cf5ec
Changed model paths to use LFS
spomichter Jun 27, 2025
56326b1
Added office video to tests/data
spomichter Jun 27, 2025
cb5afab
Fixed unit tests to use LFS video stream
spomichter Jun 27, 2025
c7a4e89
Added GPU utils
spomichter Jun 27, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
# Handle line endings automatically for files Git considers text,
# converting them to LF on checkout.
* text=auto eol=lf

# Ensure Python files always use LF for line endings.
*.py text eol=lf

# Treat designated file types as binary and do not alter their contents or line endings.
*.png binary
*.jpg binary
*.gif binary
*.ico binary
*.pdf binary
*.mp4 binary

# Explicit LFS tracking for test files
tests/data/.lfs/*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text binary
*.mp4 filter=lfs diff=lfs merge=lfs -text binary
*.mov filter=lfs diff=lfs merge=lfs -text binary
*.gif filter=lfs diff=lfs merge=lfs -text binary
Binary file modified assets/dimos_interface.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/framecount.mp4
Binary file not shown.
Binary file modified assets/simple_demo.mp4
Binary file not shown.
Binary file modified assets/simple_demo_small.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/trimmed_video_office.mov
Binary file not shown.
85 changes: 68 additions & 17 deletions dimos/agents/memory/image_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import cv2
import base64
from dimos.utils.logging_config import setup_logger
from dimos.utils.testing import testData

logger = setup_logger("dimos.agents.memory.image_embedding")

Expand Down Expand Up @@ -60,12 +61,14 @@ def _initialize_model(self):
"""Initialize the specified embedding model."""
try:
import torch
from transformers import CLIPProcessor, CLIPModel, AutoFeatureExtractor, AutoModel
from transformers import CLIPProcessor, AutoFeatureExtractor, AutoModel
import onnxruntime as ort

if self.model_name == "clip":
model_id = "openai/clip-vit-base-patch32"
self.model = CLIPModel.from_pretrained(model_id)
self.processor = CLIPProcessor.from_pretrained(model_id)
model_id = testData("models_clip") / "model.onnx"
processor_id = "openai/clip-vit-base-patch32"
self.model = ort.InferenceSession(model_id)
self.processor = CLIPProcessor.from_pretrained(processor_id)
logger.info(f"Loaded CLIP model: {model_id}")
elif self.model_name == "resnet":
model_id = "microsoft/resnet-50"
Expand Down Expand Up @@ -103,13 +106,35 @@ def get_embedding(self, image: Union[np.ndarray, str, bytes]) -> np.ndarray:
import torch

if self.model_name == "clip":
inputs = self.processor(images=pil_image, return_tensors="pt")
inputs = self.processor(images=pil_image, return_tensors="np")

with torch.no_grad():
image_features = self.model.get_image_features(**inputs)

image_embedding = image_features / image_features.norm(dim=1, keepdim=True)
embedding = image_embedding.numpy()[0]
ort_inputs = {
inp.name: inputs[inp.name]
for inp in self.model.get_inputs()
if inp.name in inputs
}

# If required, add dummy text inputs
input_names = [i.name for i in self.model.get_inputs()]
batch_size = inputs["pixel_values"].shape[0]
if "input_ids" in input_names:
ort_inputs["input_ids"] = np.zeros((batch_size, 1), dtype=np.int64)
if "attention_mask" in input_names:
ort_inputs["attention_mask"] = np.ones((batch_size, 1), dtype=np.int64)

# Run inference
ort_outputs = self.model.run(None, ort_inputs)

# Look up correct output name
output_names = [o.name for o in self.model.get_outputs()]
if "image_embeds" in output_names:
image_embedding = ort_outputs[output_names.index("image_embeds")]
else:
raise RuntimeError(f"No 'image_embeds' found in outputs: {output_names}")

embedding = image_embedding / np.linalg.norm(image_embedding, axis=1, keepdims=True)
embedding = embedding[0]

elif self.model_name == "resnet":
inputs = self.processor(images=pil_image, return_tensors="pt")
Expand Down Expand Up @@ -156,19 +181,45 @@ def get_text_embedding(self, text: str) -> np.ndarray:
try:
import torch

inputs = self.processor(text=[text], return_tensors="pt", padding=True)
inputs = self.processor(text=[text], return_tensors="np", padding=True)

with torch.no_grad():
text_features = self.model.get_text_features(**inputs)

# Normalize the features
text_embedding = text_features / text_features.norm(dim=1, keepdim=True)
embedding = text_embedding.numpy()[0]
# Prepare ONNX input dict (handle only what's needed)
ort_inputs = {
inp.name: inputs[inp.name]
for inp in self.model.get_inputs()
if inp.name in inputs
}
# Determine which inputs are expected by the ONNX model
input_names = [i.name for i in self.model.get_inputs()]
batch_size = inputs["input_ids"].shape[0] # pulled from text input

# If the model expects pixel_values (i.e., fused model), add dummy vision input
if "pixel_values" in input_names:
ort_inputs["pixel_values"] = np.zeros(
(batch_size, 3, 224, 224), dtype=np.float32
)

# Run inference
ort_outputs = self.model.run(None, ort_inputs)

# Determine correct output (usually 'last_hidden_state' or 'text_embeds')
output_names = [o.name for o in self.model.get_outputs()]
if "text_embeds" in output_names:
text_embedding = ort_outputs[output_names.index("text_embeds")]
else:
text_embedding = ort_outputs[0] # fallback to first output

# Normalize
text_embedding = text_embedding / np.linalg.norm(
text_embedding, axis=1, keepdims=True
)
text_embedding = text_embedding[0] # shape: (512,)

logger.debug(
f"Generated text embedding with shape {embedding.shape} for text: '{text}'"
f"Generated text embedding with shape {text_embedding.shape} for text: '{text}'"
)
return embedding
return text_embedding

except Exception as e:
logger.error(f"Error generating text embedding: {e}")
Expand Down
209 changes: 209 additions & 0 deletions dimos/agents/memory/test_image_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Test module for the CLIP image embedding functionality in dimos.
"""

import os
import time
import numpy as np
import pytest
import reactivex as rx
from reactivex import operators as ops
from dimos.stream.video_provider import VideoProvider
from dimos.agents.memory.image_embedding import ImageEmbeddingProvider


class TestImageEmbedding:
Copy link
Contributor

@leshy leshy Jun 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all of this works as a non-class, not sure why we are embedding tests in class methods? not very important, might care about it for dimos-wide uniformity

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will fix

"""Test class for CLIP image embedding functionality."""

def test_clip_embedding_initialization(self):
    """Test CLIP embedding provider initializes correctly.

    Skips (rather than fails) only when the ONNX model itself cannot be
    loaded — e.g. the LFS artifact is absent on this machine.
    """
    try:
        # Only the model load may skip the test. Keeping the assertions
        # OUTSIDE this try is deliberate: the original wrapped them too,
        # so an AssertionError was caught and silently turned into a skip.
        embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)
    except Exception as e:
        pytest.skip(f"Skipping test due to model initialization error: {e}")

    # Real assertions — these must surface as failures, never skips.
    assert embedding_provider.model is not None, "CLIP model failed to initialize"
    assert embedding_provider.processor is not None, "CLIP processor failed to initialize"
    assert embedding_provider.model_name == "clip", "Model name should be 'clip'"
    assert embedding_provider.dimensions == 512, "Embedding dimensions should be 512"

def test_clip_embedding_process_video(self):
    """Test CLIP embedding provider can process video frames and return embeddings."""
    try:
        from dimos.utils.testing import testData

        # Test video is an LFS-tracked asset (see .gitattributes *.mov rule).
        video_path = testData("assets") / "trimmed_video_office.mov"

        embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)

        assert os.path.exists(video_path), f"Test video not found: {video_path}"
        video_provider = VideoProvider(dev_name="test_video", video_source=video_path)

        # realtime=False: decode as fast as possible rather than pacing to fps.
        video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15)

        # Use ReactiveX operators to process the stream
        def process_frame(frame):
            # Map one raw frame to {"frame", "embedding"}. Returns None on
            # error so a single bad frame does not terminate the stream.
            try:
                # Process frame with CLIP
                embedding = embedding_provider.get_embedding(frame)
                print(
                    f"Generated CLIP embedding with shape: {embedding.shape}, norm: {np.linalg.norm(embedding):.4f}"
                )

                return {"frame": frame, "embedding": embedding}
            except Exception as e:
                print(f"Error in process_frame: {e}")
                return None

        embedding_stream = video_stream.pipe(ops.map(process_frame))

        results = []
        frames_processed = 0
        target_frames = 10  # stop after this many successfully embedded frames

        def on_next(result):
            nonlocal frames_processed, results
            if not result:  # Skip None results
                return

            results.append(result)
            frames_processed += 1

            # Stop processing after target frames
            # `subscription` is bound by the subscribe() call below before
            # any emission can be delivered here.
            if frames_processed >= target_frames:
                subscription.dispose()

        def on_error(error):
            pytest.fail(f"Error in embedding stream: {error}")

        def on_completed():
            pass

        # Subscribe and wait for results
        subscription = embedding_stream.subscribe(
            on_next=on_next, on_error=on_error, on_completed=on_completed
        )

        # Poll until enough frames are embedded or the timeout elapses.
        timeout = 60.0
        start_time = time.time()
        while frames_processed < target_frames and time.time() - start_time < timeout:
            time.sleep(0.5)
            print(f"Processed {frames_processed}/{target_frames} frames")

        # Clean up subscription (second dispose is a no-op if on_next
        # already disposed it).
        subscription.dispose()
        video_provider.dispose_all()

        # Check if we have results
        if len(results) == 0:
            pytest.skip("No embeddings generated, but test connection established correctly")
            return  # NOTE(review): unreachable — pytest.skip raises

        print(f"Processed {len(results)} frames with CLIP embeddings")

        # Analyze the results
        assert len(results) > 0, "No embeddings generated"

        # Check properties of first embedding
        first_result = results[0]
        assert "embedding" in first_result, "Result doesn't contain embedding"
        assert "frame" in first_result, "Result doesn't contain frame"

        # Check embedding shape and normalization
        embedding = first_result["embedding"]
        assert isinstance(embedding, np.ndarray), "Embedding is not a numpy array"
        assert embedding.shape == (512,), (
            f"Embedding has wrong shape: {embedding.shape}, expected (512,)"
        )
        assert abs(np.linalg.norm(embedding) - 1.0) < 1e-5, "Embedding is not normalized"

        # Save the first embedding for similarity tests
        # NOTE(review): this couples test ordering — test_clip_embedding_similarity
        # skips itself when this class attribute is absent.
        if len(results) > 1 and "embedding" in results[0]:
            # Create a class variable to store embeddings for the similarity test
            TestImageEmbedding.test_embeddings = {
                "embedding1": results[0]["embedding"],
                "embedding2": results[1]["embedding"] if len(results) > 1 else None,
            }
            print(f"Saved embeddings for similarity testing")

        print("CLIP embedding test passed successfully!")

    except Exception as e:
        # NOTE(review): this also catches AssertionError, re-raising it via
        # pytest.fail and losing the original assertion traceback.
        pytest.fail(f"Test failed with error: {e}")

def test_clip_embedding_similarity(self):
    """Test CLIP embedding similarity search and text-to-image queries."""
    try:
        # Skip if previous test didn't generate embeddings
        # NOTE(review): depends on test_clip_embedding_process_video having
        # run first in the same session and stashed `test_embeddings`.
        if not hasattr(TestImageEmbedding, "test_embeddings"):
            pytest.skip("No embeddings available from previous test")
            return  # NOTE(review): unreachable — pytest.skip raises

        # Get embeddings from previous test
        embedding1 = TestImageEmbedding.test_embeddings["embedding1"]
        embedding2 = TestImageEmbedding.test_embeddings["embedding2"]

        # Initialize embedding provider for text embeddings
        embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)

        # Test frame-to-frame similarity
        if embedding1 is not None and embedding2 is not None:
            # Compute cosine similarity
            # (plain dot product is valid here because the previous test
            # asserted both embeddings are unit-normalized)
            similarity = np.dot(embedding1, embedding2)
            print(f"Similarity between first two frames: {similarity:.4f}")

            # Should be in range [-1, 1]
            assert -1.0 <= similarity <= 1.0, f"Similarity out of valid range: {similarity}"

        # Test text-to-image similarity
        if embedding1 is not None:
            # Generate a list of text queries to test
            text_queries = ["a video frame", "a person", "an outdoor scene", "a kitchen"]

            # Test each text query
            for text_query in text_queries:
                # Get text embedding
                text_embedding = embedding_provider.get_text_embedding(text_query)

                # Check text embedding properties
                assert isinstance(text_embedding, np.ndarray), (
                    "Text embedding is not a numpy array"
                )
                assert text_embedding.shape == (512,), (
                    f"Text embedding has wrong shape: {text_embedding.shape}"
                )
                assert abs(np.linalg.norm(text_embedding) - 1.0) < 1e-5, (
                    "Text embedding is not normalized"
                )

                # Compute similarity between frame and text
                text_similarity = np.dot(embedding1, text_embedding)
                print(f"Similarity between frame and '{text_query}': {text_similarity:.4f}")

                # Should be in range [-1, 1]
                assert -1.0 <= text_similarity <= 1.0, (
                    f"Text-image similarity out of range: {text_similarity}"
                )

        print("CLIP embedding similarity tests passed successfully!")

    except Exception as e:
        # NOTE(review): also converts AssertionError into a generic fail message.
        pytest.fail(f"Similarity test failed with error: {e}")


if __name__ == "__main__":
    # Allow running this test module directly, outside a pytest invocation.
    pytest_args = ["-v", "--disable-warnings", __file__]
    pytest.main(pytest_args)
Loading