diff --git a/CHANGELOG.md b/CHANGELOG.md index d99058f05..a714ad733 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). -## ## [0.3.5] - 2024-12-13 +## [0.3.6] - 2025-01-10 + +### Added + +- Add expected scores in ColPali E2E test + +### Changed + +- Loosen package dependencies + +## [0.3.5] - 2024-12-13 ### Added @@ -22,7 +32,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/). - General `CorpusQueryCollator` for BEIR style dataset training or hard negative training. This deprecates `HardNegCollator` but all changes to the training loop are made for a seamless update. - ### Changed - Updates BiPali config files @@ -31,7 +40,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/). - Removed `add_suffix` in the VisualRetrieverCollator and let the `suffix` be added in the individual processors. - Changed the incorrect `` token to `<|endoftext|>` for query augmentation `ColQwen2Processor`. Note that previous models were trained with `<|endoftext|>` so this is simply a non-breaking inference upgrade patch. 
- ## [0.3.3] - 2024-10-29 ### Added diff --git a/pyproject.toml b/pyproject.toml index bf6737126..d620e007b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,9 +34,9 @@ classifiers = [ dependencies = [ "GPUtil", - "numpy<2.0.0", - "peft>=0.11.0,<0.12.0", - "pillow>=9.2.0,<11.0.0", + "numpy", + "peft>=0.11.0", + "pillow>=9.2.0", "requests", "torch>=2.2.0", "transformers>=4.46.1,<4.47.0", @@ -49,7 +49,9 @@ train = [ "configue>=5.0.0", "datasets>=2.19.1", "mteb>=1.16.3,<1.17.0", - "typer>=0.12.3, <1.0.0", + "peft>=0.11.0,<0.12.0", + "pillow>=9.2.0,<11.0.0", + "typer>=0.15.1", ] interpretability = [ @@ -58,9 +60,13 @@ interpretability = [ "seaborn>=0.13.2,<1.0.0", ] -dev = ["pytest>=8.0.0", "ruff>=0.4.0"] +dev = ["datasets>=2.19.1", "pytest>=8.0.0", "ruff>=0.4.0"] -all = ["colpali-engine[dev]", "colpali-engine[train]"] +all = [ + "colpali-engine[dev]", + "colpali-engine[interpretability]", + "colpali-engine[train]", +] [project.urls] homepage = "https://github.com/illuin-tech/colpali" diff --git a/tests/models/paligemma/colpali/test_colpali_e2e.py b/tests/models/paligemma/colpali/test_colpali_e2e.py index 7adb62b43..05e9d9f0a 100644 --- a/tests/models/paligemma/colpali/test_colpali_e2e.py +++ b/tests/models/paligemma/colpali/test_colpali_e2e.py @@ -2,7 +2,7 @@ import pytest import torch -from PIL import Image +from datasets import load_dataset from colpali_engine.models import ColPali, ColPaliProcessor from colpali_engine.utils.torch_utils import get_torch_device @@ -15,6 +15,7 @@ def model_name() -> str: @pytest.mark.slow def test_e2e_retrieval_and_scoring(model_name: str): + # Load the model and processor model = cast( ColPali, ColPali.from_pretrained( @@ -23,31 +24,39 @@ def test_e2e_retrieval_and_scoring(model_name: str): device_map=get_torch_device("auto"), ), ).eval() - - try: - processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name)) - - # Your inputs - images = [ - Image.new("RGB", (480, 480), color="white"), - Image.new("RGB", 
(250, 250), color="black"), - ] - queries = [ - "Is attention really all you need?", - "Are Benjamin, Antoine, Merve, and Jo best friends?", - ] - - # Process the inputs - batch_images = processor.process_images(images).to(model.device) - batch_queries = processor.process_queries(queries).to(model.device) - - # Forward pass - with torch.no_grad(): - image_embeddings = model(**batch_images) - query_embeddings = model(**batch_queries) - - scores = processor.score_multi_vector(query_embeddings, image_embeddings) - assert isinstance(scores, torch.Tensor) - - except Exception as e: - pytest.fail(f"Code raised an exception: {e}") + processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name)) + + # Load the test dataset + ds = load_dataset("hf-internal-testing/document-visual-retrieval-test", split="test") + + # Preprocess the examples + batch_images = processor.process_images(images=ds["image"]).to(model.device) + batch_queries = processor.process_queries(queries=ds["query"]).to(model.device) + + # Run inference + with torch.inference_mode(): + image_embeddings = model(**batch_images) + query_embeddings = model(**batch_queries) + + # Compute retrieval scores + scores = processor.score_multi_vector( + qs=query_embeddings, + ps=image_embeddings, + ) # (len(qs), len(ps)) + + assert scores.ndim == 2, f"Expected 2D tensor, got {scores.ndim}" + assert scores.shape == (len(ds), len(ds)), f"Expected shape {(len(ds), len(ds))}, got {scores.shape}" + + # Check if the maximum scores per row are in the diagonal of the matrix score + assert (scores.argmax(dim=1) == torch.arange(len(ds), device=scores.device)).all() + + # Further validation: fine-grained check, with a hardcoded score from the original implementation + expected_scores = torch.tensor( + [ + [16.5000, 7.5938, 15.6875], + [12.0625, 16.2500, 11.1250], + [15.2500, 12.6250, 21.0000], + ], + dtype=scores.dtype, + ) + assert torch.allclose(scores, expected_scores, atol=1), f"Expected scores 
{expected_scores}, got {scores}"