100 commits
63cd4a0
initial isaac implementation
AkshatSh Sep 18, 2025
63d1b1b
style: fixing assorted PR notes
philippguevorguian Oct 10, 2025
d72311d
fix: get modular convert utility working
philippguevorguian Oct 10, 2025
d6ed844
feat: modular convert utility outputs
philippguevorguian Oct 10, 2025
7f4944f
Merge pull request #1 from perceptron-ai-inc/pg/update_isaac
philippguevorguian Oct 10, 2025
c3cc42d
chore: port updates
philippguevorguian Oct 15, 2025
965215c
fix: update imports
philippguevorguian Oct 15, 2025
4c4f1c9
fix: adjust typing to get modular convert script working
philippguevorguian Oct 15, 2025
021a1ae
feat: modular convert utility outputs
philippguevorguian Oct 15, 2025
6311fd2
Merge pull request #2 from perceptron-ai-inc/pg/update_isaac
philippguevorguian Oct 15, 2025
3d8b786
feat: port updates to isaac
philippguevorguian Nov 10, 2025
92f56b8
fix: changes to enable modular convert
philippguevorguian Nov 10, 2025
70bcc77
chore: modular convert script artifacts
philippguevorguian Nov 10, 2025
5656b83
style: remove redundant registration
philippguevorguian Nov 10, 2025
963f8c1
style: organize auto file entries
philippguevorguian Nov 10, 2025
74f9f3b
style: lints
philippguevorguian Nov 10, 2025
f56f064
fix: processor typing
philippguevorguian Nov 10, 2025
4c5c19d
fix: allow image processor typing
philippguevorguian Nov 10, 2025
8a95e64
style: | for unions
philippguevorguian Nov 10, 2025
25523ba
fix: don't alias siglip
philippguevorguian Nov 10, 2025
d1dc712
fix: rename vision config to config to be consistent with base class
philippguevorguian Nov 10, 2025
d80c9f6
fix: additional remakes
philippguevorguian Nov 10, 2025
58d7311
chore: convert artifacts
philippguevorguian Nov 10, 2025
ffb3b9f
style: make style changes
philippguevorguian Nov 10, 2025
92c36d4
refactor: bespoke isaac config
philippguevorguian Nov 10, 2025
79eb96b
style: ruff organize imports
philippguevorguian Nov 10, 2025
1c6479a
chore: convert configuration artifact
philippguevorguian Nov 10, 2025
2899216
fix: get imports in
philippguevorguian Nov 10, 2025
aec7721
style: string typing of qwen2
philippguevorguian Nov 10, 2025
c0b10b6
fix: remove image processor and tokenizer typing
philippguevorguian Nov 10, 2025
302374d
fix: enable qwen_2_5_vl import
philippguevorguian Nov 10, 2025
de9dc80
style: remove unnecessary copy text
philippguevorguian Nov 10, 2025
107ecde
fix: fix copies
philippguevorguian Nov 10, 2025
fd5e399
style: pass kwargs and docstrings
philippguevorguian Nov 10, 2025
206b82a
chore: artifact
philippguevorguian Nov 10, 2025
510eb05
style: revert UP045 typing for autodocstring to work
philippguevorguian Nov 10, 2025
887ff82
Merge branch 'main' into main
philippguevorguian Nov 10, 2025
e8d8b76
Merge branch 'main' into pg/update_isaac
philippguevorguian Nov 10, 2025
5da4056
fix: latest transformers changes
philippguevorguian Nov 10, 2025
4a97889
chore: new transformers convert
philippguevorguian Nov 10, 2025
0d55395
again
philippguevorguian Nov 10, 2025
287a461
fix: export pretrained model
philippguevorguian Nov 10, 2025
c43cb5d
test: add placeholder tests
philippguevorguian Nov 10, 2025
c84df28
docs: add seed documentation
philippguevorguian Nov 10, 2025
bf432bc
docs: point to isaac model checkpoint
philippguevorguian Nov 10, 2025
080f22d
fix: set config fields in model
philippguevorguian Nov 10, 2025
0764c2c
docs: add dates stamp
philippguevorguian Nov 10, 2025
43f8b81
Update isaac.md
philippguevorguian Nov 14, 2025
665665e
Merge pull request #3 from perceptron-ai-inc/pg/isaac_passes_make_fixup
philippguevorguian Nov 14, 2025
8c722b1
Merge branch 'main' into main
philippguevorguian Nov 17, 2025
0590025
Isaact e2e tests + passing make fixup (#4)
philippguevorguian Nov 17, 2025
3a6e1c6
Merge branch 'main' into main
philippguevorguian Dec 3, 2025
0463099
fix: update TensorType import for latest changes in transformers main…
philippguevorguian Dec 3, 2025
762032c
Merge branch 'main' into main
philippguevorguian Dec 4, 2025
95296b7
fix: updates for v5 standards (#6)
philippguevorguian Dec 4, 2025
0bd5ac0
Merge branch 'main' into main
philippguevorguian Dec 9, 2025
1cb3c4b
feat: guard perceptron imports (#7)
philippguevorguian Dec 9, 2025
d439313
fix: guard PIL import (#8)
philippguevorguian Dec 9, 2025
e2fe9f9
fix: guard perceptron PIL and torch imports for CI (#9)
philippguevorguian Dec 9, 2025
257f47c
review revisions (#10)
philippguevorguian Dec 12, 2025
f826763
Merge branch 'main' into main
philippguevorguian Dec 17, 2025
03ca8c7
Merge branch 'main' into main
philippguevorguian Dec 17, 2025
aa31c36
transformers attention interface + modeling test suite (#11)
philippguevorguian Dec 17, 2025
9226a9c
Update src/transformers/models/isaac/modular_isaac.py
philippguevorguian Dec 17, 2025
a1892a5
Update src/transformers/models/isaac/modular_isaac.py
philippguevorguian Dec 17, 2025
82f25d6
Update src/transformers/models/isaac/modular_isaac.py
philippguevorguian Dec 17, 2025
5422d9d
style: review revisions (#12)
philippguevorguian Dec 18, 2025
f4a6374
review changes (#13): separate projector class, removed redundant cas…
philippguevorguian Dec 19, 2025
f86ba81
Merge branch 'main' into main
philippguevorguian Dec 23, 2025
abba38b
Squash merge pg/refactor_remove_tensorstream into main
philippguevorguian Dec 24, 2025
8de326e
Merge branch 'main' into main
philippguevorguian Dec 30, 2025
2b69698
feat: batched inference + rope refactor
philippguevorguian Dec 30, 2025
2884211
Update src/transformers/models/isaac/modular_isaac.py
philippguevorguian Jan 9, 2026
fdbd633
Squash merge into main
philippguevorguian Jan 14, 2026
6ba2fdb
style: alias norm to communicate scope
philippguevorguian Jan 14, 2026
33fcd57
Merge branch 'main' into main
philippguevorguian Mar 3, 2026
57cbd79
refactor: no packed batch inference (#14)
philippguevorguian Mar 4, 2026
f2491e8
Merge branch 'main' into main
philippguevorguian Mar 13, 2026
bf501dd
feat: rely on qwen3 backbone, flatten vision components, misc style c…
philippguevorguian Mar 19, 2026
22ce167
Merge branch 'main' into main
philippguevorguian Mar 23, 2026
c5514ac
Merge branch 'main' into main
philippguevorguian Mar 24, 2026
778d8c5
feat: config updates, image processor backend, assorted changes/tests…
philippguevorguian Mar 24, 2026
231aa23
Merge branch 'main' into main
philippguevorguian Mar 24, 2026
67ae690
style: cleanup (#17)
philippguevorguian Mar 24, 2026
bbd8289
style: unify image attention mask + import update (#18)
philippguevorguian Mar 24, 2026
ed8fc0a
style: further mask threading simplification + processing docstring (…
philippguevorguian Mar 24, 2026
ef3c6f7
Merge branch 'main' into main
philippguevorguian Mar 24, 2026
caf377c
test: update tests
philippguevorguian Mar 24, 2026
048094d
Merge branch 'main' into main
philippguevorguian Mar 25, 2026
748c82b
Merge branch 'main' into main
philippguevorguian Mar 30, 2026
7c6ca57
Merge branch 'main' into main
philippguevorguian Mar 31, 2026
8b96e5f
Squash merge pg/additional_cleanup into main
philippguevorguian Mar 31, 2026
81206db
check repo fixes
philippguevorguian Mar 31, 2026
86235d4
add correct date
philippguevorguian Mar 31, 2026
e99bbc1
fix: make the pointing types belong to processor class
philippguevorguian Mar 31, 2026
0325565
Merge branch 'main' into main
philippguevorguian Apr 13, 2026
24af778
style: pre final review (#20)
philippguevorguian Apr 13, 2026
3d9e55d
lint
philippguevorguian Apr 13, 2026
251210f
fix: map isaac_vision to isaac module
philippguevorguian Apr 13, 2026
bbadef8
fix: specify required backend
philippguevorguian Apr 13, 2026
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -1237,6 +1237,8 @@
  title: InstructBlipVideo
- local: model_doc/internvl
  title: InternVL
- local: model_doc/isaac
  title: Isaac
- local: model_doc/janus
  title: Janus
- local: model_doc/kosmos-2
143 changes: 143 additions & 0 deletions docs/source/en/model_doc/isaac.md
@@ -0,0 +1,143 @@
<!--Copyright 2026 Perceptron, Inc. and The HuggingFace Inc. team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on {release_date} and added to Hugging Face Transformers on 2026-04-13.*

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>

# Isaac

## Overview

Isaac is Perceptron's vision-language model (VLM) that pairs a SigLIP2 vision encoder with a Qwen3 decoder-only stack. The
Transformers implementation supports text-only and image-conditioned generation, including prompts with multiple interleaved
images. Isaac uses variable-resolution image preprocessing and can optionally reduce spatial tokens with pixel shuffle to keep
long multimodal prompts manageable. For more information, refer to the [technical report](https://github.com/perceptron-ai-inc/perceptron/blob/main/papers/isaac_01.pdf).

Isaac checkpoints are distributed under Perceptron's Non-Production license; please review the license that ships with the
weights before using them in commercial settings.

## Usage tips

- Batched inputs can mix text-only and multimodal samples. For direct processor/model batching, pass images as a nested
list such as `[[], [image_a], [image_b, image_c]]`.
- `image_grid_thw[batch_idx, image_slot] == (0, 0, 0)` marks a padded empty slot. Real image slots have
`(T=1, H>0, W>0)`.
- If truncation is enabled, the processor keeps the rightmost part of the multimodal prompt and updates the slot-local
`image_metadata[..., 0]` and `image_metadata[..., 1]` values automatically.
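
The empty-slot convention from the tips above can be checked with a small sketch. The grid values below are hypothetical and stand in for the `image_grid_thw` tensor the processor returns:

```python
# Hypothetical image_grid_thw for a batch of 3 samples, padded to 2 image slots each.
# A (0, 0, 0) triple marks a padded empty slot; real slots have (T=1, H>0, W>0).
image_grid_thw = [
    [(0, 0, 0), (0, 0, 0)],     # text-only sample: no images
    [(1, 16, 24), (0, 0, 0)],   # one image, one padded slot
    [(1, 32, 32), (1, 8, 12)],  # two images
]

# A slot is real exactly when T * H * W > 0.
real_slots = [[t * h * w > 0 for (t, h, w) in sample] for sample in image_grid_thw]
print(real_slots)  # [[False, False], [True, False], [True, True]]
```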

## Usage example

Isaac uses explicit image placeholders in the rendered prompt. Every occurrence of `processor.image_token` (usually `<image>`) must have a matching image in the `images` argument.

```py
import torch
from transformers import AutoProcessor, IsaacForConditionalGeneration

model_id = "PerceptronAI/Isaac-0.1"
processor = AutoProcessor.from_pretrained(model_id)
model = IsaacForConditionalGeneration.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Compare the two figures and explain what changed."},
            {"type": "image", "path": "first_image.png"},
            {"type": "image", "path": "second_image.png"},
        ],
    },
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
# Review note: the output from apply_chat_template is already a dict of inputs

generated_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)

generated_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```

### Post-processing grounded outputs

Isaac can generate grounded points and boxes in tagged text spans. Use `post_process_generation()` to strip the tags and
recover structured annotations.

```py
clean_text, annotations = processor.post_process_generation(response, expected="box")
print(clean_text)
print(annotations)
```

Set `expected="point"` to extract point annotations, or leave `expected=None` to collect both points and boxes.

## IsaacVisionConfig

[[autodoc]] IsaacVisionConfig

## IsaacTextConfig

[[autodoc]] IsaacTextConfig

## IsaacConfig

[[autodoc]] IsaacConfig

## IsaacVisionModel

[[autodoc]] IsaacVisionModel

## IsaacTextModel

[[autodoc]] IsaacTextModel
- forward

## IsaacModel

[[autodoc]] IsaacModel
- forward

## IsaacForConditionalGeneration

[[autodoc]] IsaacForConditionalGeneration
- forward

## IsaacProcessor

[[autodoc]] IsaacProcessor

## IsaacImageProcessor

[[autodoc]] IsaacImageProcessor
4 changes: 4 additions & 0 deletions src/transformers/conversion_mapping.py
@@ -133,6 +133,10 @@ def _build_checkpoint_conversion_mapping():
    ),
    WeightRenaming(source_patterns=r"^visual", target_patterns="model.visual"),
],
"isaac": [
    WeightRenaming(source_patterns=r"text_model", target_patterns="language_model"),
    WeightRenaming(source_patterns=r"vision_tower", target_patterns="visual"),
],
"colqwen2": [
    WeightRenaming(source_patterns=r"vlm.model", target_patterns="vlm"),
    WeightRenaming(source_patterns=r"vlm(?!\.(language_model|visual))", target_patterns="vlm.language_model"),
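
Illustratively, the two `WeightRenaming` entries added for `isaac` behave like plain `re.sub` rewrites over checkpoint key names. This is a sketch of the mapping semantics, not the actual Transformers conversion machinery:

```python
import re

# The two Isaac source -> target patterns from the mapping above.
patterns = [(r"text_model", "language_model"), (r"vision_tower", "visual")]

def rename(key: str) -> str:
    """Apply each renaming pattern in order to a checkpoint key."""
    for src, tgt in patterns:
        key = re.sub(src, tgt, key)
    return key

print(rename("text_model.layers.0.self_attn.q_proj.weight"))
# language_model.layers.0.self_attn.q_proj.weight
print(rename("vision_tower.embeddings.patch_embedding.weight"))
# visual.embeddings.patch_embedding.weight
```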
37 changes: 35 additions & 2 deletions src/transformers/modeling_utils.py
@@ -1003,6 +1003,33 @@ class EmbeddingAccessMixin:

_input_embed_layer = "embed_tokens" # default layer that holds input embeddings.

def _resolve_input_embed_layer(self) -> tuple[nn.Module | None, str]:
Review comment (Member): kinda unrelated, but I see what you mean. Let's not add it here, model code is already huge. We can keep old-format get_inputs_embeddings and you can submit a new PR to improve it, very much welcome :)

"""
Returns the parent module and leaf attribute for `_input_embed_layer`.

Supports both a simple attribute name such as `embed_tokens` and a dotted path such as
`text_model.embed_tokens`.
"""

name = getattr(self, "_input_embed_layer", "embed_tokens")
if "." not in name:
return None, name

module_path, _, attribute_name = name.rpartition(".")
try:
module = self.get_submodule(module_path)
except AttributeError as error:
raise NotImplementedError(
f"`_input_embed_layer={name}` could not be resolved for {self.__class__.__name__}."
) from error

if not hasattr(module, attribute_name):
raise NotImplementedError(
f"`_input_embed_layer={name}` could not be resolved for {self.__class__.__name__}."
)

return module, attribute_name

def get_input_embeddings(self) -> nn.Module:
"""
Returns the model's input embeddings.
@@ -1011,7 +1038,9 @@ def get_input_embeddings(self) -> nn.Module:
`nn.Module`: A torch module mapping vocabulary to hidden states.
"""

name = getattr(self, "_input_embed_layer", "embed_tokens")
module, name = self._resolve_input_embed_layer()
if module is not None:
return getattr(module, name)

# 1) Direct attribute (most NLP models).
if (default_embedding := getattr(self, name, None)) is not None:
Expand Down Expand Up @@ -1044,7 +1073,11 @@ def set_input_embeddings(self, value: nn.Module):
should) override for exotic layouts.
"""

name = getattr(self, "_input_embed_layer", "embed_tokens")
module, name = self._resolve_input_embed_layer()
if module is not None:
setattr(module, name, value)
return

# 1) Direct attribute (most NLP models)
if hasattr(self, name):
setattr(self, name, value)
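
The dotted-path resolution that `_resolve_input_embed_layer` adds can be sketched standalone in plain Python (no torch needed); the `Root`/`Leaf` classes below are hypothetical stand-ins for a model with a nested `text_model.embed_tokens` layout:

```python
class Leaf:
    embed_tokens = "EMB"  # stand-in for the actual nn.Embedding

class Root:
    text_model = Leaf()

    def get_submodule(self, path):
        # Minimal stand-in for nn.Module.get_submodule: walk the dotted path.
        obj = self
        for part in path.split("."):
            obj = getattr(obj, part)
        return obj

# The same split the mixin performs: parent path vs. leaf attribute name.
name = "text_model.embed_tokens"
module_path, _, attribute_name = name.rpartition(".")

root = Root()
module = root.get_submodule(module_path)
print(getattr(module, attribute_name))  # EMB
```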
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -201,6 +201,7 @@
from .instructblip import *
from .instructblipvideo import *
from .internvl import *
from .isaac import *
from .jais2 import *
from .jamba import *
from .janus import *
5 changes: 5 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -238,6 +238,8 @@
("instructblipvideo", "InstructBlipVideoConfig"),
("internvl", "InternVLConfig"),
("internvl_vision", "InternVLVisionConfig"),
("isaac", "IsaacConfig"),
("isaac_vision", "IsaacVisionConfig"),
("jais2", "Jais2Config"),
("jamba", "JambaConfig"),
("janus", "JanusConfig"),
@@ -758,6 +760,8 @@
("instructblipvideo", "InstructBlipVideo"),
("internvl", "InternVL"),
("internvl_vision", "InternVLVision"),
("isaac", "Isaac"),
("isaac_vision", "IsaacVision"),
("jais2", "Jais2"),
("jamba", "Jamba"),
("janus", "Janus"),
@@ -1109,6 +1113,7 @@
("gemma4_audio", "gemma4"),
("gemma4_text", "gemma4"),
("gemma4_vision", "gemma4"),
("isaac_vision", "isaac"),
("glm4v_vision", "glm4v"),
("glm4v_moe_vision", "glm4v_moe"),
("glm4v_text", "glm4v"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
@@ -145,6 +145,7 @@
("imagegpt", {"torchvision": "ImageGPTImageProcessor", "pil": "ImageGPTImageProcessorPil"}),
("instructblip", {"torchvision": "BlipImageProcessor", "pil": "BlipImageProcessorPil"}),
("internvl", {"torchvision": "GotOcr2ImageProcessor", "pil": "GotOcr2ImageProcessorPil"}),
("isaac", {"torchvision": "IsaacImageProcessor"}),
("janus", {"torchvision": "JanusImageProcessor", "pil": "JanusImageProcessorPil"}),
("kosmos-2", {"torchvision": "CLIPImageProcessor", "pil": "CLIPImageProcessorPil"}),
("kosmos-2.5", {"torchvision": "Kosmos2_5ImageProcessor", "pil": "Kosmos2_5ImageProcessorPil"}),
3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -235,6 +235,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("instructblipvideo", "InstructBlipVideoModel"),
("internvl", "InternVLModel"),
("internvl_vision", "InternVLVisionModel"),
("isaac", "IsaacModel"),
("isaac_vision", "IsaacVisionModel"),
("jais2", "Jais2Model"),
("jamba", "JambaModel"),
("janus", "JanusModel"),
@@ -990,6 +992,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("instructblip", "InstructBlipForConditionalGeneration"),
("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
("internvl", "InternVLForConditionalGeneration"),
("isaac", "IsaacForConditionalGeneration"),
("janus", "JanusForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"),
("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -100,6 +100,7 @@
("instructblip", "InstructBlipProcessor"),
("instructblipvideo", "InstructBlipVideoProcessor"),
("internvl", "InternVLProcessor"),
("isaac", "IsaacProcessor"),
("janus", "JanusProcessor"),
("kosmos-2", "Kosmos2Processor"),
("kosmos-2.5", "Kosmos2_5Processor"),
28 changes: 28 additions & 0 deletions src/transformers/models/isaac/__init__.py
@@ -0,0 +1,28 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_isaac import *
    from .modeling_isaac import *
    from .processing_isaac import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)