Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,28 @@ permissions:
packages: read

jobs:
run-tests:
run-tests-ubuntu:
runs-on: dimos-runner-ubuntu-2204

container:
image: ghcr.io/dimensionalos/dev:${{ inputs.branch-tag }}
steps:
- uses: actions/checkout@v4
- name: Run tests
run: |
git config --global --add safe.directory '*'
/entrypoint.sh bash -c "pytest"

run-tests-macos:
runs-on: macos-latest
steps:
- uses: actions/checkout@v4

- name: Install Docker (macOS)
run: |
brew install docker
colima start

- name: Run tests
run: |
git config --global --add safe.directory '*'
/entrypoint.sh bash -c "pytest"
docker run --rm -v $PWD:/workspace -w /workspace ghcr.io/dimensionalos/dev:${{ inputs.branch-tag }} bash -c "pytest"
224 changes: 224 additions & 0 deletions dimos/agents/memory/test_image_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Test module for the CLIP image embedding functionality in dimos.
"""

import os
import time
import numpy as np
import pytest
import reactivex as rx
from reactivex import operators as ops
from dimos.stream.video_provider import VideoProvider
from dimos.agents.memory.image_embedding import ImageEmbeddingProvider


class TestImageEmbedding:
"""Test class for CLIP image embedding functionality."""

@pytest.fixture(scope="class")
def video_path(self):
"""Return the path to the test video."""
# Use a video file from assets directory
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../assets"))
video_file = "trimmed_video_office.mov" # Use the same test video as YOLO test
video_path = os.path.join(base_dir, video_file)

# Fallback to any video file in assets directory if the specific one isn't found
if not os.path.exists(video_path):
for filename in os.listdir(base_dir):
if filename.endswith((".mp4", ".avi", ".mov")):
video_path = os.path.join(base_dir, filename)
break

return video_path

def test_clip_embedding_initialization(self):
"""Test CLIP embedding provider initializes correctly."""
try:
# Initialize the embedding provider with CLIP model
embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)
assert embedding_provider.model is not None, "CLIP model failed to initialize"
assert embedding_provider.processor is not None, "CLIP processor failed to initialize"
assert embedding_provider.model_name == "clip", "Model name should be 'clip'"
assert embedding_provider.dimensions == 512, "Embedding dimensions should be 512"
except Exception as e:
pytest.skip(f"Skipping test due to model initialization error: {e}")

def test_clip_embedding_process_video(self, video_path):
"""Test CLIP embedding provider can process video frames and return embeddings."""
try:
# Initialize the embedding provider
embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)

# Create video provider and get video stream observable
assert os.path.exists(video_path), f"Test video not found: {video_path}"
video_provider = VideoProvider(dev_name="test_video", video_source=video_path)

video_stream = video_provider.capture_video_as_observable(realtime=False, fps=15)

# Use ReactiveX operators to process the stream
def process_frame(frame):
try:
# Process frame with CLIP
embedding = embedding_provider.get_embedding(frame)
print(
f"Generated CLIP embedding with shape: {embedding.shape}, norm: {np.linalg.norm(embedding):.4f}"
)

return {"frame": frame, "embedding": embedding}
except Exception as e:
print(f"Error in process_frame: {e}")
return None

embedding_stream = video_stream.pipe(ops.map(process_frame))

results = []
frames_processed = 0
target_frames = 10

def on_next(result):
nonlocal frames_processed, results
if not result: # Skip None results
return

results.append(result)
frames_processed += 1

# Stop processing after target frames
if frames_processed >= target_frames:
subscription.dispose()

def on_error(error):
pytest.fail(f"Error in embedding stream: {error}")

def on_completed():
pass

# Subscribe and wait for results
subscription = embedding_stream.subscribe(
on_next=on_next, on_error=on_error, on_completed=on_completed
)

timeout = 60.0
start_time = time.time()
while frames_processed < target_frames and time.time() - start_time < timeout:
time.sleep(0.5)
print(f"Processed {frames_processed}/{target_frames} frames")

# Clean up subscription
subscription.dispose()
video_provider.dispose_all()

# Check if we have results
if len(results) == 0:
pytest.skip("No embeddings generated, but test connection established correctly")
return

print(f"Processed {len(results)} frames with CLIP embeddings")

# Analyze the results
assert len(results) > 0, "No embeddings generated"

# Check properties of first embedding
first_result = results[0]
assert "embedding" in first_result, "Result doesn't contain embedding"
assert "frame" in first_result, "Result doesn't contain frame"

# Check embedding shape and normalization
embedding = first_result["embedding"]
assert isinstance(embedding, np.ndarray), "Embedding is not a numpy array"
assert embedding.shape == (512,), (
f"Embedding has wrong shape: {embedding.shape}, expected (512,)"
)
assert abs(np.linalg.norm(embedding) - 1.0) < 1e-5, "Embedding is not normalized"

# Save the first embedding for similarity tests
if len(results) > 1 and "embedding" in results[0]:
# Create a class variable to store embeddings for the similarity test
TestImageEmbedding.test_embeddings = {
"embedding1": results[0]["embedding"],
"embedding2": results[1]["embedding"] if len(results) > 1 else None,
}
print(f"Saved embeddings for similarity testing")

print("CLIP embedding test passed successfully!")

except Exception as e:
pytest.fail(f"Test failed with error: {e}")

def test_clip_embedding_similarity(self):
"""Test CLIP embedding similarity search and text-to-image queries."""
try:
# Skip if previous test didn't generate embeddings
if not hasattr(TestImageEmbedding, "test_embeddings"):
pytest.skip("No embeddings available from previous test")
return

# Get embeddings from previous test
embedding1 = TestImageEmbedding.test_embeddings["embedding1"]
embedding2 = TestImageEmbedding.test_embeddings["embedding2"]

# Initialize embedding provider for text embeddings
embedding_provider = ImageEmbeddingProvider(model_name="clip", dimensions=512)

# Test frame-to-frame similarity
if embedding1 is not None and embedding2 is not None:
# Compute cosine similarity
similarity = np.dot(embedding1, embedding2)
print(f"Similarity between first two frames: {similarity:.4f}")

# Should be in range [-1, 1]
assert -1.0 <= similarity <= 1.0, f"Similarity out of valid range: {similarity}"

# Test text-to-image similarity
if embedding1 is not None:
# Generate a list of text queries to test
text_queries = ["a video frame", "a person", "an outdoor scene", "a kitchen"]

# Test each text query
for text_query in text_queries:
# Get text embedding
text_embedding = embedding_provider.get_text_embedding(text_query)

# Check text embedding properties
assert isinstance(text_embedding, np.ndarray), (
"Text embedding is not a numpy array"
)
assert text_embedding.shape == (512,), (
f"Text embedding has wrong shape: {text_embedding.shape}"
)
assert abs(np.linalg.norm(text_embedding) - 1.0) < 1e-5, (
"Text embedding is not normalized"
)

# Compute similarity between frame and text
text_similarity = np.dot(embedding1, text_embedding)
print(f"Similarity between frame and '{text_query}': {text_similarity:.4f}")

# Should be in range [-1, 1]
assert -1.0 <= text_similarity <= 1.0, (
f"Text-image similarity out of range: {text_similarity}"
)

print("CLIP embedding similarity tests passed successfully!")

except Exception as e:
pytest.fail(f"Similarity test failed with error: {e}")


if __name__ == "__main__":
pytest.main(["-v", "--disable-warnings", __file__])
Loading
Loading