14 changes: 11 additions & 3 deletions CHANGELOG.md
@@ -5,7 +5,17 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## ## [0.3.5] - 2024-12-13
## [0.3.6] - 2025-01-10

### Added

- Add expected scores in ColPali E2E test

### Changed

- Loosen package dependencies

## [0.3.5] - 2024-12-13

### Added

@@ -22,7 +32,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

- General `CorpusQueryCollator` for BEIR-style dataset training or hard-negative training. This deprecates `HardNegCollator`, but all changes to the training loop are made for a seamless update.


### Changed

- Updates BiPali config files
@@ -31,7 +40,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Removed `add_suffix` in the VisualRetrieverCollator and let the `suffix` be added in the individual processors.
- Changed the incorrect `<pad>` token to `<|endoftext|>` for query augmentation in `ColQwen2Processor`. Note that previous models were trained with `<|endoftext|>`, so this is simply a non-breaking inference upgrade patch.


## [0.3.3] - 2024-10-29

### Added
18 changes: 12 additions & 6 deletions pyproject.toml
@@ -34,9 +34,9 @@ classifiers = [

dependencies = [
"GPUtil",
"numpy<2.0.0",
"peft>=0.11.0,<0.12.0",
"pillow>=9.2.0,<11.0.0",
"numpy",
"peft>=0.11.0",
**Collaborator:** Bump to 12, but in this case still bind to 13 (or latest). And test!!

**Contributor Author:** Is bumping to 12 really necessary?
"pillow>=9.2.0",
*tonywu71 marked this conversation as resolved.*
"requests",
"torch>=2.2.0",
"transformers>=4.46.1,<4.47.0",
@@ -49,7 +49,9 @@ train = [
"configue>=5.0.0",
"datasets>=2.19.1",
"mteb>=1.16.3,<1.17.0",
"typer>=0.12.3, <1.0.0",
"peft>=0.11.0,<0.12.0",
**Collaborator:** What's the idea of having double requirements here and above? Is there something I don't know about?

**Contributor Author:** The point, which was raised by @galleon, is that our users might want to work with loosened dependencies, e.g. what if someone wants to use colpali-engine with the latest version of `transformers`?

My proposition is:

- when users want to train their models, they should install `colpali-engine[train]`, where we can make sure that training behaves as planned, i.e. with a strict dependency config;
- when users want to use our models, they should install `colpali-engine`, where we have loosened the deps.

While there might be some minor discrepancies in the results, I think it's inevitable and that my proposition achieves a good happy medium. But happy to hear your opinion and/or your replacement solutions on this!

**Collaborator:** Agree that it would be nice to have less strict restrictions on inference-only usage than on training. Idk how the override behavior works in pyproject depending on the optional dependencies.

I still think we need to upper-bound the transformers version. People who want to use it with newer versions can always install colpali-engine with --no-deps, but otherwise, over time, this will just lead to bugs, I fear...

**Contributor Author:** Sounds like a good compromise, thanks!

So following your propositions, I have added the upper bound for transformers in the default package configuration and restored the correct upper bounds in the train config. Does the PR look ready to be merged now?

FYI, I've tested the pyproject override behavior and it works just as planned: each dep takes the intersection of the lower/upper bounds defined in the different groups. So installing `colpali-engine[train]` takes the intersection of the default deps and the ones defined in the train group.
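To illustrate the intersection semantics, a small sketch using the `packaging` library (which pip builds on); the pillow bounds are taken from this PR, and the exact printed order of specifiers may differ:

```python
from packaging.specifiers import SpecifierSet

base = SpecifierSet(">=9.2.0")           # default dependency: pillow>=9.2.0
train = SpecifierSet(">=9.2.0,<11.0.0")  # train extra: pillow>=9.2.0,<11.0.0

# Installing colpali-engine[train] must satisfy both groups at once,
# i.e. the intersection of the two specifier sets.
combined = base & train

print(combined)               # >=9.2.0,<11.0.0
print("10.4.0" in combined)   # True: satisfies both bounds
print("11.0.0" in combined)   # False: excluded by the train upper bound
```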

"pillow>=9.2.0,<11.0.0",
"typer>=0.15.1",
]

interpretability = [
@@ -58,9 +60,13 @@ interpretability = [
"seaborn>=0.13.2,<1.0.0",
]

dev = ["pytest>=8.0.0", "ruff>=0.4.0"]
dev = ["datasets>=2.19.1", "pytest>=8.0.0", "ruff>=0.4.0"]

all = ["colpali-engine[dev]", "colpali-engine[train]"]
all = [
"colpali-engine[dev]",
"colpali-engine[interpretability]",
"colpali-engine[train]",
]

[project.urls]
homepage = "https://github.com/illuin-tech/colpali"
67 changes: 38 additions & 29 deletions tests/models/paligemma/colpali/test_colpali_e2e.py
@@ -2,7 +2,7 @@

import pytest
import torch
from PIL import Image
from datasets import load_dataset

from colpali_engine.models import ColPali, ColPaliProcessor
from colpali_engine.utils.torch_utils import get_torch_device
@@ -15,6 +15,7 @@ def model_name() -> str:

@pytest.mark.slow
def test_e2e_retrieval_and_scoring(model_name: str):
# Load the model and processor
model = cast(
ColPali,
ColPali.from_pretrained(
@@ -23,31 +24,39 @@ def test_e2e_retrieval_and_scoring(model_name: str):
device_map=get_torch_device("auto"),
),
).eval()

try:
processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))

# Your inputs
images = [
Image.new("RGB", (480, 480), color="white"),
Image.new("RGB", (250, 250), color="black"),
]
queries = [
"Is attention really all you need?",
"Are Benjamin, Antoine, Merve, and Jo best friends?",
]

# Process the inputs
batch_images = processor.process_images(images).to(model.device)
batch_queries = processor.process_queries(queries).to(model.device)

# Forward pass
with torch.no_grad():
image_embeddings = model(**batch_images)
query_embeddings = model(**batch_queries)

scores = processor.score_multi_vector(query_embeddings, image_embeddings)
assert isinstance(scores, torch.Tensor)

except Exception as e:
pytest.fail(f"Code raised an exception: {e}")
processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))

# Load the test dataset
ds = load_dataset("hf-internal-testing/document-visual-retrieval-test", split="test")

# Preprocess the examples
batch_images = processor.process_images(images=ds["image"]).to(model.device)
batch_queries = processor.process_queries(queries=ds["query"]).to(model.device)

# Run inference
with torch.inference_mode():
image_embeddings = model(**batch_images)
query_embeddings = model(**batch_queries)

# Compute retrieval scores
scores = processor.score_multi_vector(
qs=query_embeddings,
ps=image_embeddings,
) # (len(qs), len(ps))

assert scores.ndim == 2, f"Expected 2D tensor, got {scores.ndim}"
assert scores.shape == (len(ds), len(ds)), f"Expected shape {(len(ds), len(ds))}, got {scores.shape}"

# Check that the maximum score in each row lies on the diagonal of the score matrix
assert (scores.argmax(dim=1) == torch.arange(len(ds), device=scores.device)).all()

# Further validation: fine-grained check, with a hardcoded score from the original implementation
expected_scores = torch.tensor(
[
[16.5000, 7.5938, 15.6875],
[12.0625, 16.2500, 11.1250],
[15.2500, 12.6250, 21.0000],
],
dtype=scores.dtype,
)
assert torch.allclose(scores, expected_scores, atol=1), f"Expected scores {expected_scores}, got {scores}"
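
For context on what the assertions above exercise: `score_multi_vector` computes late-interaction (MaxSim) scores between multi-vector embeddings. A minimal standalone sketch of that computation, assuming padded same-length token sequences (not the library's exact implementation):

```python
import torch

def maxsim_scores(qs: torch.Tensor, ps: torch.Tensor) -> torch.Tensor:
    """Late-interaction (MaxSim) scoring.

    qs: query embeddings, shape (num_queries, query_tokens, dim)
    ps: passage embeddings, shape (num_passages, passage_tokens, dim)
    Returns a (num_queries, num_passages) score matrix.
    """
    # Token-level dot products for every query/passage pair:
    # shape (num_queries, num_passages, query_tokens, passage_tokens).
    sim = torch.einsum("qnd,pmd->qpnm", qs, ps)
    # Each query token keeps its best-matching passage token,
    # then scores are summed over query tokens.
    return sim.max(dim=3).values.sum(dim=2)
```

Under this scoring, each query should score highest against its own document, which is what the diagonal `argmax` assertion checks.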