286 changes: 286 additions & 0 deletions .ai/skills/new-model/SKILL.md

Large diffs are not rendered by default.

208 changes: 208 additions & 0 deletions .ai/skills/new-model/review-standards.md
@@ -0,0 +1,208 @@
# Reviewer-Enforced Standards

These standards are derived from actual reviewer feedback on model PRs (primarily vasqu's review of PR #44320, SAM3-LiteText). Violations will be flagged and changes requested.

## Modeling Code

### nn.ModuleList over nn.Sequential

`nn.Sequential` hides submodules behind numeric indices (`layers.0`, `layers.1`, ...), which makes checkpoint keys opaque and the forward flow implicit. Prefer named submodules, or `nn.ModuleList` for repeated layers.

Bad:
```python
self.layers = nn.Sequential(
    ConvLayer(hidden_size),
    nn.BatchNorm2d(hidden_size),
)
```

Good:
```python
self.conv = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, bias=False)
self.norm = nn.BatchNorm2d(hidden_size)
```

Or for variable-length layer lists:
```python
self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.num_layers)])
```

### nn.Linear for projections

Bad:
```python
self.projection = nn.Parameter(torch.empty(config.hidden_size, config.projection_dim))
# manual matmul in forward: output = hidden @ self.projection
```

Good:
```python
self.projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
```

### Reuse existing components

Bad — rewriting MLP from scratch:
```python
class MyModelMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, x):
        return self.fc2(self.act(self.fc1(x)))
```

Good — inherit from existing:
```python
from ..clip.modeling_clip import CLIPMLP

class MyModelMLP(CLIPMLP):
    pass  # or override only what differs
```
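
When something does differ, override just that piece. A minimal sketch, assuming the parent is `CLIPMLP` (whose activation is stored in `activation_fn`):

```python
import torch.nn as nn

from ..clip.modeling_clip import CLIPMLP


class MyModelMLP(CLIPMLP):
    def __init__(self, config):
        super().__init__(config)
        # Only the activation differs from CLIP; fc1/fc2 are inherited unchanged.
        self.activation_fn = nn.GELU()
```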

### Configurable constants

Bad:
```python
self.layer_scale = nn.Parameter(1e-5 * torch.ones((hidden_size,)), requires_grad=True)
```

Good:
```python
# In config:
layer_scale_init: float = 1e-5

# In model:
self.layer_scale = nn.Parameter(
    config.layer_scale_init * torch.ones((config.hidden_size,)), requires_grad=True
)
```

### Clean naming

Bad — keeping opaque names from original codebase:
```python
self.rbr_skip = nn.BatchNorm2d(hidden_size)
self.rbr_conv = nn.ModuleList([...])
```

Good — descriptive names:
```python
self.skip_norm = nn.BatchNorm2d(hidden_size)
self.conv_branches = nn.ModuleList([...])
```

If you must keep a name, document what it means.
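
For example, a one-line comment is enough to justify a kept name (a sketch; the wrapping class is hypothetical, the `rbr_skip` name comes from the Bad example above):

```python
import torch.nn as nn


class RepMixerBlock(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        # "rbr_skip": name kept from the upstream codebase (its re-parameterizable
        # skip branch) so converted checkpoint keys still match the original weights.
        self.rbr_skip = nn.BatchNorm2d(hidden_size)
```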

### Minimal PreTrainedModel overrides

Bad — overriding attributes to their default values:
```python
class MyPreTrainedModel(PreTrainedModel):
    config_class = MyConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn_2 = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _no_split_modules = [...]
    _skip_keys_device_placement = [...]
    # ... 10 more attributes that are already the default
```

Good — only override what differs:
```python
class MyPreTrainedModel(PreTrainedModel):
    config_class = MyConfig
    main_input_name = "pixel_values"
    input_modalities = ["image", "text"]
    _no_split_modules = ["MyEncoderLayer", "MyDecoderLayer"]
```
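
A quick way to avoid redundant overrides is to inspect what the base class already defines before setting anything (a sketch; defaults can shift between transformers versions):

```python
from transformers import PreTrainedModel

# Print the inherited defaults; any override in your subclass that matches
# one of these values can simply be deleted.
for name in ("base_model_prefix", "main_input_name", "supports_gradient_checkpointing"):
    print(name, "=", getattr(PreTrainedModel, name, "<unset>"))
```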

### Data transforms inside layers

Bad — permutations in parent forward loop:
```python
# In the model's forward:
for idx, layer in enumerate(self.layers):
    if idx in self.special_indices:
        hidden_states = hidden_states.permute(0, 2, 1).unsqueeze(2)
        hidden_states = layer(hidden_states)
        hidden_states = hidden_states.squeeze(2).permute(0, 2, 1)
    else:
        hidden_states = layer(hidden_states)
```

Good — each layer handles its own format:
```python
# In the special layer's forward:
def forward(self, hidden_states):
    hidden_states = hidden_states.permute(0, 2, 1).unsqueeze(2)
    # ... do computation ...
    return hidden_states.squeeze(2).permute(0, 2, 1)

# In the model's forward (clean):
for layer in self.layers:
    hidden_states = layer(hidden_states)
```

### Conditional layers with nn.Identity

Bad:
```python
def forward(self, hidden_states):
    if self.config.use_special_mixer:
        hidden_states = self.special_mixer(hidden_states)
    # else: pass through
```

Good:
```python
def __init__(self, config):
    self.token_mixer = SpecialMixer(config) if config.use_special_mixer else nn.Identity()

def forward(self, hidden_states):
    hidden_states = self.token_mixer(hidden_states)
```

### Attention support flags

Bad — skipping tests:
```python
# In test file:
@unittest.skip("Flash attention not compatible with float masks")
def test_flash_attn_2_inference_equivalence(self):
    pass
```

Good — setting flags in model:
```python
# In model file:
class MyPreTrainedModel(PreTrainedModel):
    _supports_flash_attn = False  # float attention masks incompatible
```

### @capture_outputs decorator

Bad:
```python
@capture_outputs(tie_last_hidden_states=False)
def forward(self, ...):
```

Good (unless the parameter is truly needed for backward compatibility):
```python
@capture_outputs
def forward(self, ...):
```

## Meta-observation

Always apply a simplification pass:

- Remove redundant abstractions
- Flatten unnecessary nesting
- Replace verbose patterns with existing library utilities (see the sketch below)
- Question every attribute override and magic number
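
For instance, a hand-rolled activation lookup is a verbose pattern the library already covers. A minimal sketch, assuming the standard `ACT2FN` registry from `transformers.activations`:

```python
import torch.nn as nn
from transformers.activations import ACT2FN


def build_activation_verbose(hidden_act: str) -> nn.Module:
    # Verbose: a bespoke lookup table duplicated inside the modeling file.
    activations = {"gelu": nn.GELU(), "relu": nn.ReLU(), "silu": nn.SiLU()}
    return activations[hidden_act]


def build_activation(hidden_act: str) -> nn.Module:
    # Simplified: the library already ships this registry.
    return ACT2FN[hidden_act]
```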
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -1324,6 +1324,8 @@
    title: SAM3
  - local: model_doc/sam3_video
    title: SAM3 Video
  - local: model_doc/sam3_lite_text
    title: SAM3-LiteText
  - local: model_doc/shieldgemma2
    title: ShieldGemma2
  - local: model_doc/siglip
69 changes: 69 additions & 0 deletions docs/source/en/model_doc/sam3_lite_text.md
@@ -0,0 +1,69 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

-->
*This model was released on {release_date} and added to Hugging Face Transformers on 2026-03-31.*

# SAM3-LiteText

## Overview

SAM3-LiteText is a lightweight variant of [SAM3](sam3) that replaces the CLIP text encoder with a compact MobileCLIP-S0 text encoder. This reduces the text encoder parameters by up to 88% while maintaining the full SAM3 vision and segmentation capabilities.

The model was introduced in the [EfficientSAM3](https://github.com/SimonZeng7108/efficientsam3) repository by Simon Zeng.

Key differences from SAM3:
- **Text encoder**: MobileCLIP-S0 (RepMixer + Transformer; 512 hidden dim, 4 transformer layers + 2 RepMixer blocks, context length 16) instead of the CLIP text encoder
- **All other components** (ViT-H backbone, FPN, geometry encoder, DETR encoder/decoder, mask decoder) are identical to SAM3

## Usage

```python
from PIL import Image
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("Simon7108528/EfficientSAM3", device_map="auto")
processor = AutoProcessor.from_pretrained("Simon7108528/EfficientSAM3")

image = Image.open("path/to/image.jpg").convert("RGB")  # any RGB image
inputs = processor(images=image, text="cat", return_tensors="pt").to(model.device)
outputs = model(**inputs)
```

## Sam3LiteTextConfig

[[autodoc]] Sam3LiteTextConfig

## Sam3LiteTextMobileCLIPConfig

[[autodoc]] Sam3LiteTextMobileCLIPConfig

## Sam3LiteTextViTConfig

[[autodoc]] Sam3LiteTextViTConfig

## Sam3LiteTextModel

[[autodoc]] Sam3LiteTextModel
- forward
- get_text_features
- get_vision_features

## Sam3LiteTextViTModel

[[autodoc]] Sam3LiteTextViTModel
- forward

## Sam3LiteTextImageProcessor

[[autodoc]] Sam3LiteTextImageProcessor

## Sam3LiteTextProcessor

[[autodoc]] Sam3LiteTextProcessor
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -365,6 +365,7 @@
from .sam2 import *
from .sam2_video import *
from .sam3 import *
from .sam3_lite_text import *
from .sam3_tracker import *
from .sam3_tracker_video import *
from .sam3_video import *
8 changes: 8 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -416,6 +416,9 @@
("sam2_video", "Sam2VideoConfig"),
("sam2_vision_model", "Sam2VisionConfig"),
("sam3", "Sam3Config"),
("sam3_lite_text", "Sam3LiteTextConfig"),
("sam3_lite_text_mobileclip", "Sam3LiteTextMobileCLIPConfig"),
("sam3_lite_text_vit_model", "Sam3LiteTextViTConfig"),
("sam3_tracker", "Sam3TrackerConfig"),
("sam3_tracker_video", "Sam3TrackerVideoConfig"),
("sam3_video", "Sam3VideoConfig"),
@@ -944,6 +947,9 @@
("sam2_video", "Sam2VideoModel"),
("sam2_vision_model", "Sam2VisionModel"),
("sam3", "SAM3"),
("sam3_lite_text", "SAM3-LiteText"),
("sam3_lite_text_mobileclip", "Sam3LiteTextMobileCLIPEncoder"),
("sam3_lite_text_vit_model", "Sam3LiteTextViTModel"),
("sam3_tracker", "Sam3Tracker"),
("sam3_tracker_video", "Sam3TrackerVideo"),
("sam3_video", "Sam3VideoModel"),
@@ -1131,6 +1137,8 @@
("sam2_hiera_det_model", "sam2"),
("sam3_vit_model", "sam3"),
("sam3_vision_model", "sam3"),
("sam3_lite_text_vit_model", "sam3_lite_text"),
("sam3_lite_text_mobileclip", "sam3_lite_text"),
("edgetam_vision_model", "edgetam"),
("sam_hq_vision_model", "sam_hq"),
("t5gemma2_encoder", "t5gemma2"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
@@ -225,6 +225,7 @@
("sam2", {"torchvision": "Sam2ImageProcessor"}),
("sam2_video", {"torchvision": "Sam2ImageProcessor"}),
("sam3", {"torchvision": "Sam3ImageProcessor"}),
("sam3_lite_text", {"torchvision": "Sam3LiteTextImageProcessor"}),
("sam3_tracker", {"torchvision": "Sam3ImageProcessor"}),
("sam3_tracker_video", {"torchvision": "Sam3ImageProcessor"}),
("sam3_video", {"torchvision": "Sam3ImageProcessor"}),
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -394,6 +394,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("sam2_video", "Sam2VideoModel"),
("sam2_vision_model", "Sam2VisionModel"),
("sam3", "Sam3Model"),
("sam3_lite_text", "Sam3LiteTextModel"),
("sam3_lite_text_vit_model", "Sam3LiteTextViTModel"),
("sam3_tracker", "Sam3TrackerModel"),
("sam3_tracker", "Sam3TrackerModel"),
("sam3_tracker_video", "Sam3TrackerVideoModel"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -151,6 +151,7 @@
("sam", "SamProcessor"),
("sam2", "Sam2Processor"),
("sam3", "Sam3Processor"),
("sam3_lite_text", "Sam3LiteTextProcessor"),
("sam_hq", "SamHQProcessor"),
("seamless_m4t", "SeamlessM4TProcessor"),
("sew", "Wav2Vec2Processor"),
29 changes: 29 additions & 0 deletions src/transformers/models/sam3_lite_text/__init__.py
@@ -0,0 +1,29 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_sam3_lite_text import *
    from .image_processing_sam3_lite_text import *
    from .modeling_sam3_lite_text import *
    from .processing_sam3_lite_text import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)