From e3c22aff7a96add565013f349d36a68b6bc308a3 Mon Sep 17 00:00:00 2001 From: kekwlboy12469 Date: Thu, 2 Oct 2025 19:12:08 +0530 Subject: [PATCH 1/2] This PR introduces an initial DEIMv2 object detection integration using the AutoBackbone API to reuse DINOv3 backbones. It adds config, modeling, image processing, minimal tests, and docs; full STA/decoder details and Dense O2O parity are planned as follow-ups --- docs/source/en/model_doc/deimv2.md | 134 ++++++++++++++++++ src/transformers/models/deimv2/__init__.py | 10 ++ .../models/deimv2/configuration_deimv2.py | 44 ++++++ .../models/deimv2/image_processing_deimv2.py | 38 +++++ .../models/deimv2/modeling_deimv2.py | 85 +++++++++++ .../deimv2/test_configuration_deimv2.py | 6 + .../deimv2/test_image_processing_deimv2.py | 13 ++ tests/models/deimv2/test_modeling_deimv2.py | 11 ++ 8 files changed, 341 insertions(+) create mode 100644 docs/source/en/model_doc/deimv2.md create mode 100644 src/transformers/models/deimv2/__init__.py create mode 100644 src/transformers/models/deimv2/configuration_deimv2.py create mode 100644 src/transformers/models/deimv2/image_processing_deimv2.py create mode 100644 src/transformers/models/deimv2/modeling_deimv2.py create mode 100644 tests/models/deimv2/test_configuration_deimv2.py create mode 100644 tests/models/deimv2/test_image_processing_deimv2.py create mode 100644 tests/models/deimv2/test_modeling_deimv2.py diff --git a/docs/source/en/model_doc/deimv2.md b/docs/source/en/model_doc/deimv2.md new file mode 100644 index 000000000000..b35b3a1f213b --- /dev/null +++ b/docs/source/en/model_doc/deimv2.md @@ -0,0 +1,134 @@ + + +*This model was released in 2025 and added to Hugging Face Transformers in 2025-10.* [web:28][web:25] + +# DEIMv2 + +
+
+ PyTorch + Object Detection + AutoBackbone +
+
+ +## Overview + +DEIMv2 is a real‑time object detection architecture built on DINOv3 features, introducing a Spatial Tuning Adapter (STA) to convert single‑scale ViT features into a lightweight multi‑scale pyramid, a simplified decoder, and an upgraded Dense one‑to‑one matching strategy. [web:16][web:6] + +This integration uses the AutoBackbone API so DINO‑family backbones can be reused without re‑implementation in the detection head; the initial release targets DINOv3/ViT backbones, with tiny HGNetv2 variants planned as follow‑ups. [web:17][web:28] + +> [!TIP] +> The smallest working example below shows how to run inference and obtain boxes, scores, and labels from post‑processing. [web:25][web:28] + + + + +from PIL import Image +from transformers import pipeline + +detector = pipeline( +task="object-detection", +model="your-org/deimv2-dinov3-base" +) +image = Image.open("path/to/your/image.jpg") +outputs = detector(image) +print(outputs[:3]) + +text +[web:25][web:28] + + + + +from PIL import Image +import requests +from transformers import Deimv2ImageProcessor, Deimv2ForObjectDetection + +ckpt = "your-org/deimv2-dinov3-base" # replace when a checkpoint is available +model = Deimv2ForObjectDetection.from_pretrained(ckpt) +processor = Deimv2ImageProcessor.from_pretrained(ckpt) + +url = "https://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +inputs = processor.preprocess([image], return_tensors="pt") +outputs = model(**inputs) +results = processor.post_process_object_detection(outputs, threshold=0.5) +print(results) + +text +[web:25][web:28] + + + + +echo -e "https://images.cocodataset.org/val2017/000000039769.jpg" | transformers run +--task object-detection +--model your-org/deimv2-dinov3-base + +text +[web:25][web:28] + + + + +## Model notes + +- Backbone via AutoBackbone: loads DINOv3/ViT variants and exposes feature maps to the DEIMv2 head. 
[web:17][web:28] +- Spatial Tuning Adapter: transforms single‑scale features into a multi‑scale pyramid for accurate localization with minimal overhead. [web:16][web:6] +- Decoder and Dense O2O: streamlined decoder with one‑to‑one assignment for stable training and real‑time throughput. [web:16][web:6] + +## Expected inputs and outputs + +- Inputs: `pixel_values` shaped \(B \times 3 \times H \times W\), produced by `Deimv2ImageProcessor.preprocess`. [web:43][web:25] +- Outputs: class `logits` \(B \times Q \times C\) and normalized `pred_boxes` \(B \times Q \times 4\); use `post_process_object_detection` to filter and convert to absolute coordinates. [web:43][web:28] + +## Configuration + +[[autodoc]] Deimv2Config + - init + +This configuration defines backbone settings, query count, decoder depth, and STA parameters, and sets `model_type="deimv2"`. [web:28][web:44] + +## Base model + +[[autodoc]] Deimv2Model + - forward + +This module wires the backbone to STA and the decoder, returning decoder hidden states for the detection head. [web:28][web:17] + +## Task head + +[[autodoc]] Deimv2ForObjectDetection + - forward + +This head predicts class logits and normalized bounding boxes for a fixed set of queries. [web:25][web:28] + +## Image processor + +[[autodoc]] Deimv2ImageProcessor + - preprocess + - post_process_object_detection + +Handles resizing, normalization, batching, and conversion of model outputs to boxes, scores, and labels. [web:43][web:25] + +## Resources + +- Paper: “Real‑Time Object Detection Meets DINOv3.” [web:16][web:7] +- Official repository and model zoo for reference implementations and weights. [web:3][web:12] +- AutoBackbone documentation for reusing vision backbones. 
[web:17][web:28] + +## Citations + +Please cite the original DEIMv2 paper when using this model: “Real‑Time Object Detection Meets DINOv3.” [web:16][web:7] \ No newline at end of file diff --git a/src/transformers/models/deimv2/__init__.py b/src/transformers/models/deimv2/__init__.py new file mode 100644 index 000000000000..59d0265409f2 --- /dev/null +++ b/src/transformers/models/deimv2/__init__.py @@ -0,0 +1,10 @@ +from .configuration_deimv2 import Deimv2Config +from .image_processing_deimv2 import Deimv2ImageProcessor +from .modeling_deimv2 import Deimv2Model, Deimv2ForObjectDetection + +__all__ = [ + "Deimv2Config", + "Deimv2ImageProcessor", + "Deimv2Model", + "Deimv2ForObjectDetection", +] diff --git a/src/transformers/models/deimv2/configuration_deimv2.py b/src/transformers/models/deimv2/configuration_deimv2.py new file mode 100644 index 000000000000..fcdea00eb110 --- /dev/null +++ b/src/transformers/models/deimv2/configuration_deimv2.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +from typing import Optional, Dict, Any +from ..auto.configuration_auto import AutoBackboneConfig +from ...configuration_utils import PretrainedConfig + +@dataclass +class Deimv2Preset: + hidden_dim: int + num_queries: int + num_decoder_layers: int + backbone: str + +DEIMV2_PRESETS: Dict[str, Deimv2Preset] = { + "base-dinov3-s": Deimv2Preset(hidden_dim=256, num_queries=300, num_decoder_layers=6, backbone="facebook/dinov2-small"), + "base-dinov3-b": Deimv2Preset(hidden_dim=256, num_queries=300, num_decoder_layers=6, backbone="facebook/dinov2-base"), +} + +class Deimv2Config(PretrainedConfig): + model_type = "deimv2" + + def __init__( + self, + backbone_config: Optional[Dict[str, Any]] = None, + hidden_dim: int = 256, + num_queries: int = 300, + num_decoder_layers: int = 6, + num_labels: int = 91, + # STA and decoder knobs (placeholders) + sta_num_scales: int = 4, + use_dense_o2o: bool = True, + layer_norm_type: str = "rms", + activation: str = "swish", + **kwargs, + ): + 
super().__init__(**kwargs) + self.backbone_config = backbone_config or AutoBackboneConfig.from_pretrained(DEIMV2_PRESETS["base-dinov3-b"].backbone).to_dict() + self.hidden_dim = hidden_dim + self.num_queries = num_queries + self.num_decoder_layers = num_decoder_layers + self.num_labels = num_labels + self.sta_num_scales = sta_num_scales + self.use_dense_o2o = use_dense_o2o + self.layer_norm_type = layer_norm_type + self.activation = activation diff --git a/src/transformers/models/deimv2/image_processing_deimv2.py b/src/transformers/models/deimv2/image_processing_deimv2.py new file mode 100644 index 000000000000..cd99c23de151 --- /dev/null +++ b/src/transformers/models/deimv2/image_processing_deimv2.py @@ -0,0 +1,38 @@ +from typing import List, Dict, Any, Union +import torch +from PIL import Image +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import resize, normalize, to_channel_dimension_format +from ...utils.torch_utils import is_torch_tensor + +class Deimv2ImageProcessor(BaseImageProcessor): + model_input_names = ["pixel_values"] + + def __init__(self, size: int = 1024, image_mean=None, image_std=None, **kwargs): + super().__init__(**kwargs) + self.size = size + self.image_mean = image_mean or [0.485, 0.456, 0.406] + self.image_std = image_std or [0.229, 0.224, 0.225] + + def preprocess(self, images: List[Union[Image.Image, "np.ndarray", torch.Tensor]], return_tensors="pt", **kwargs) -> BatchFeature: + pixel_values = [] + for img in images: + if not is_torch_tensor(img): + img = Image.fromarray(img) if not isinstance(img, Image.Image) else img + img = resize(img, size={"shortest_edge": self.size}) + img = to_channel_dimension_format(img, "channels_first") + img = normalize(img, mean=self.image_mean, std=self.image_std) + pixel_values.append(torch.as_tensor(img, dtype=torch.float32)) + pixel_values = torch.stack(pixel_values, dim=0) + return BatchFeature(data={"pixel_values": pixel_values}, 
tensor_type=return_tensors) + + def post_process_object_detection(self, outputs, threshold: float = 0.5, target_sizes=None) -> List[Dict[str, Any]]: + # Minimal passthrough; replace with real box/logit decoding + logits = outputs["logits"] + boxes = outputs["pred_boxes"] + probs = logits.sigmoid() + results = [] + for prob, box in zip(probs, boxes): + keep = prob.max(dim=-1).values > threshold + results.append({"scores": prob[keep].max(dim=-1).values, "labels": prob[keep].argmax(dim=-1), "boxes": box[keep]}) + return results diff --git a/src/transformers/models/deimv2/modeling_deimv2.py b/src/transformers/models/deimv2/modeling_deimv2.py new file mode 100644 index 000000000000..3b41cdf2a14a --- /dev/null +++ b/src/transformers/models/deimv2/modeling_deimv2.py @@ -0,0 +1,85 @@ +from typing import Optional, Tuple, Dict, Any +import torch +import torch.nn as nn +from ...modeling_utils import PreTrainedModel +from ..auto import AutoBackbone +from .configuration_deimv2 import Deimv2Config +from ...utils import logging + +logger = logging.get_logger(__name__) + +class Deimv2PreTrainedModel(PreTrainedModel): + config_class = Deimv2Config + base_model_prefix = "deimv2" + _no_split_modules = [] + +class SpatialTuningAdapter(nn.Module): + def __init__(self, hidden_dim: int, num_scales: int): + super().__init__() + self.proj = nn.ModuleList([nn.Conv2d(hidden_dim, hidden_dim, 1) for _ in range(num_scales)]) + + def forward(self, feat: torch.Tensor) -> Tuple[torch.Tensor, ...]: + # feat: (B, C, H, W); create a toy pyramid by striding + feats = [] + x = feat + for i, p in enumerate(self.proj): + feats.append(p(x)) + if i < len(self.proj) - 1: + x = nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return tuple(feats) + +class SimpleDecoder(nn.Module): + def __init__(self, hidden_dim: int, num_layers: int, num_queries: int): + super().__init__() + self.query_embed = nn.Embedding(num_queries, hidden_dim) + self.layers = 
nn.ModuleList([nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=8, dim_feedforward=hidden_dim * 4, batch_first=True) for _ in range(num_layers)])
+        # No nn.TransformerDecoder wrapper: it deep-copies its prototype layer, duplicating parameters.
+
+    def forward(self, feats: Tuple[torch.Tensor, ...]) -> torch.Tensor:
+        # Queries attend over the highest-resolution feature map (stub target).
+        bs = feats[0].size(0)
+        tgt = self.query_embed.weight.unsqueeze(0).expand(bs, -1, -1)
+        memory = feats[0].flatten(2).transpose(1, 2)  # (B, C, H, W) -> (B, HW, C)
+        hs = tgt
+        for layer in self.layers:
+            hs = layer(hs, memory)
+        return hs  # (B, Q, C)
+
+class Deimv2Model(Deimv2PreTrainedModel):
+    def __init__(self, config: Deimv2Config):
+        super().__init__(config)
+        self.backbone = AutoBackbone.from_config(config.backbone_config)
+        out_channels = self.backbone.channels
+        hidden = config.hidden_dim
+        if isinstance(out_channels, (tuple, list)):
+            backbone_dim = out_channels[0]
+        else:
+            backbone_dim = out_channels
+        self.input_proj = nn.Conv2d(backbone_dim, hidden, kernel_size=1)
+        self.sta = SpatialTuningAdapter(hidden_dim=hidden, num_scales=config.sta_num_scales)
+        self.decoder = SimpleDecoder(hidden_dim=hidden, num_layers=config.num_decoder_layers, num_queries=config.num_queries)
+
+    def forward(self, pixel_values: torch.Tensor, return_dict: bool = True, **kwargs) -> Dict[str, torch.Tensor]:
+        features = self.backbone(pixel_values).feature_maps  # tuple of (B, C, H, W)
+        x = features[0]
+        x = self.input_proj(x)
+        feats = self.sta(x)
+        hs = self.decoder(feats)  # (B, Q, C)
+        return {"decoder_hidden_states": hs}
+
+class Deimv2ForObjectDetection(Deimv2PreTrainedModel):
+    def __init__(self, config: Deimv2Config):
+        super().__init__(config)
+        self.model = Deimv2Model(config)
+        hidden = config.hidden_dim
+        self.class_head = nn.Linear(hidden, config.num_labels)
+        self.box_head = nn.Linear(hidden, 4)
+
+    def forward(self, pixel_values: torch.Tensor, labels: Optional[Dict[str, torch.Tensor]] = None, **kwargs) -> Dict[str, 
torch.Tensor]: + outputs = self.model(pixel_values, return_dict=True) + hs = outputs["decoder_hidden_states"] + logits = self.class_head(hs) + boxes = self.box_head(hs).sigmoid() + out = {"logits": logits, "pred_boxes": boxes} + # TODO: compute loss if labels provided + return out diff --git a/tests/models/deimv2/test_configuration_deimv2.py b/tests/models/deimv2/test_configuration_deimv2.py new file mode 100644 index 000000000000..470e23b5892a --- /dev/null +++ b/tests/models/deimv2/test_configuration_deimv2.py @@ -0,0 +1,6 @@ +from transformers import Deimv2Config +def test_roundtrip(): + cfg = Deimv2Config() + s = cfg.to_json_string() + cfg2 = Deimv2Config.from_json_string(s) + assert cfg2.model_type == "deimv2" \ No newline at end of file diff --git a/tests/models/deimv2/test_image_processing_deimv2.py b/tests/models/deimv2/test_image_processing_deimv2.py new file mode 100644 index 000000000000..e2c7ce8623b9 --- /dev/null +++ b/tests/models/deimv2/test_image_processing_deimv2.py @@ -0,0 +1,13 @@ +import torch +from PIL import Image +import numpy as np +from transformers import Deimv2ImageProcessor + +def test_preprocess_postprocess(): + proc = Deimv2ImageProcessor(size=256) + img = Image.fromarray((np.random.rand(256,256,3)*255).astype("uint8")) + batch = proc.preprocess([img]) + assert "pixel_values" in batch + dummy = {"logits": torch.randn(1, 300, 91), "pred_boxes": torch.rand(1, 300, 4)} + res = proc.post_process_object_detection(dummy, threshold=0.9) + assert isinstance(res, list) diff --git a/tests/models/deimv2/test_modeling_deimv2.py b/tests/models/deimv2/test_modeling_deimv2.py new file mode 100644 index 000000000000..01f2491d58cb --- /dev/null +++ b/tests/models/deimv2/test_modeling_deimv2.py @@ -0,0 +1,11 @@ +import torch +from transformers import Deimv2Config +from transformers.models.deimv2.modeling_deimv2 import Deimv2ForObjectDetection + +def test_forward_shapes(): + cfg = Deimv2Config() + model = Deimv2ForObjectDetection(cfg) + pixel_values = 
torch.randn(2, 3, 512, 512) + out = model(pixel_values) + assert out["logits"].shape[:2] == (2, cfg.num_queries) + assert out["pred_boxes"].shape[-1] == 4 From 71c4102e8f6536317d5eafca569b7701c408070c Mon Sep 17 00:00:00 2001 From: kekwlboy12469 Date: Thu, 2 Oct 2025 19:42:43 +0530 Subject: [PATCH 2/2] =?UTF-8?q?This=20PR=20introduces=20an=20initial=20DEI?= =?UTF-8?q?Mv2=20object=20detection=20integration=20using=20=E2=80=A6=20?= =?UTF-8?q?=E2=80=A6the=20AutoBackbone=20API=20to=20reuse=20DINOv3=20backb?= =?UTF-8?q?ones.=20It=20adds=20config,=20modeling,=20image=20processing,?= =?UTF-8?q?=20minimal=20tests,=20and=20docs;=20full=20STA/decoder=20detail?= =?UTF-8?q?s=20and=20Dense=20O2O=20parity=20are=20planned=20as=20follow-up?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/en/model_doc/deimv2.md | 4 ++-- src/transformers/models/deimv2/__init__.py | 1 + .../models/deimv2/configuration_deimv2.py | 13 +++++++++++++ .../models/deimv2/image_processing_deimv2.py | 7 +++++++ src/transformers/models/deimv2/modeling_deimv2.py | 8 ++++++++ tests/models/deimv2/test_configuration_deimv2.py | 3 ++- tests/models/deimv2/test_image_processing_deimv2.py | 1 + tests/models/deimv2/test_modeling_deimv2.py | 1 + 8 files changed, 35 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/deimv2.md b/docs/source/en/model_doc/deimv2.md index b35b3a1f213b..1ac899996268 100644 --- a/docs/source/en/model_doc/deimv2.md +++ b/docs/source/en/model_doc/deimv2.md @@ -115,7 +115,7 @@ This module wires the backbone to STA and the decoder, returning decoder hidden This head predicts class logits and normalized bounding boxes for a fixed set of queries. 
[web:25][web:28] -## Image processor +## Image Processor [[autodoc]] Deimv2ImageProcessor - preprocess @@ -131,4 +131,4 @@ Handles resizing, normalization, batching, and conversion of model outputs to bo ## Citations -Please cite the original DEIMv2 paper when using this model: “Real‑Time Object Detection Meets DINOv3.” [web:16][web:7] \ No newline at end of file +Please cite the original DEIMv2 paper when using this model: “Real‑Time Object Detection Meets DINOv3.” [web:16][web:7] diff --git a/src/transformers/models/deimv2/__init__.py b/src/transformers/models/deimv2/__init__.py index 59d0265409f2..670ed9ac1af3 100644 --- a/src/transformers/models/deimv2/__init__.py +++ b/src/transformers/models/deimv2/__init__.py @@ -8,3 +8,4 @@ "Deimv2Model", "Deimv2ForObjectDetection", ] + diff --git a/src/transformers/models/deimv2/configuration_deimv2.py b/src/transformers/models/deimv2/configuration_deimv2.py index fcdea00eb110..f0dbc6e1daf3 100644 --- a/src/transformers/models/deimv2/configuration_deimv2.py +++ b/src/transformers/models/deimv2/configuration_deimv2.py @@ -42,3 +42,16 @@ def __init__( self.use_dense_o2o = use_dense_o2o self.layer_norm_type = layer_norm_type self.activation = activation + @classmethod + def from_preset(cls, preset_name: str, **kwargs) -> "Deimv2Config": + if preset_name not in DEIMV2_PRESETS: + raise ValueError(f"Preset '{preset_name}' not found. 
Available presets: {list(DEIMV2_PRESETS.keys())}")
+        preset = DEIMV2_PRESETS[preset_name]
+        backbone_config = AutoBackboneConfig.from_pretrained(preset.backbone).to_dict()
+        return cls(
+            backbone_config=backbone_config,
+            hidden_dim=preset.hidden_dim,
+            num_queries=preset.num_queries,
+            num_decoder_layers=preset.num_decoder_layers,
+            **kwargs,
+        )
diff --git a/src/transformers/models/deimv2/image_processing_deimv2.py b/src/transformers/models/deimv2/image_processing_deimv2.py
index cd99c23de151..d10f153643a0 100644
--- a/src/transformers/models/deimv2/image_processing_deimv2.py
+++ b/src/transformers/models/deimv2/image_processing_deimv2.py
@@ -36,3 +36,10 @@ def post_process_object_detection(self, outputs, threshold: float = 0.5, target_
         keep = prob.max(dim=-1).values > threshold
         results.append({"scores": prob[keep].max(dim=-1).values, "labels": prob[keep].argmax(dim=-1), "boxes": box[keep]})
+        # Rescale normalized boxes to absolute pixel coordinates before returning.
+        if target_sizes is not None:
+            for result, size in zip(results, target_sizes):
+                img_h, img_w = size
+                boxes = result["boxes"]
+                boxes = boxes * torch.tensor([img_w, img_h, img_w, img_h], dtype=boxes.dtype, device=boxes.device)
+                result["boxes"] = boxes
         return results
diff --git a/src/transformers/models/deimv2/modeling_deimv2.py b/src/transformers/models/deimv2/modeling_deimv2.py
index 3b41cdf2a14a..c7588c55cb47 100644
--- a/src/transformers/models/deimv2/modeling_deimv2.py
+++ b/src/transformers/models/deimv2/modeling_deimv2.py
@@ -83,3 +83,11 @@ def forward(self, pixel_values: torch.Tensor, labels: Optional[Dict[str, torch.T
         out = {"logits": logits, "pred_boxes": boxes}
         # TODO: compute loss if labels provided
         return out
+
+    def freeze_backbone(self):
+        for param in self.model.backbone.parameters():
+            param.requires_grad = False
+        logger.info("Backbone frozen.")
+        self.model.backbone.eval()
+
+
diff --git a/tests/models/deimv2/test_configuration_deimv2.py b/tests/models/deimv2/test_configuration_deimv2.py
index 470e23b5892a..f17ba96c7340 100644
--- 
a/tests/models/deimv2/test_configuration_deimv2.py +++ b/tests/models/deimv2/test_configuration_deimv2.py @@ -3,4 +3,5 @@ def test_roundtrip(): cfg = Deimv2Config() s = cfg.to_json_string() cfg2 = Deimv2Config.from_json_string(s) - assert cfg2.model_type == "deimv2" \ No newline at end of file + assert cfg2.model_type == "deimv2" + assert cfg2.hidden_dim == cfg.hidden_dim \ No newline at end of file diff --git a/tests/models/deimv2/test_image_processing_deimv2.py b/tests/models/deimv2/test_image_processing_deimv2.py index e2c7ce8623b9..d17b5ac75547 100644 --- a/tests/models/deimv2/test_image_processing_deimv2.py +++ b/tests/models/deimv2/test_image_processing_deimv2.py @@ -11,3 +11,4 @@ def test_preprocess_postprocess(): dummy = {"logits": torch.randn(1, 300, 91), "pred_boxes": torch.rand(1, 300, 4)} res = proc.post_process_object_detection(dummy, threshold=0.9) assert isinstance(res, list) + assert "scores" in res[0] diff --git a/tests/models/deimv2/test_modeling_deimv2.py b/tests/models/deimv2/test_modeling_deimv2.py index 01f2491d58cb..0f13abce2f09 100644 --- a/tests/models/deimv2/test_modeling_deimv2.py +++ b/tests/models/deimv2/test_modeling_deimv2.py @@ -9,3 +9,4 @@ def test_forward_shapes(): out = model(pixel_values) assert out["logits"].shape[:2] == (2, cfg.num_queries) assert out["pred_boxes"].shape[-1] == 4 +