From caf33f1a4d80c55d954bf611fb3fe7272c84b485 Mon Sep 17 00:00:00 2001 From: Lasha <26011196+lashahub@users.noreply.github.com> Date: Tue, 23 Dec 2025 20:36:35 -0500 Subject: [PATCH 01/12] Music flamingo --- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 5 + src/transformers/models/auto/modeling_auto.py | 4 + .../models/auto/processing_auto.py | 1 + .../models/musicflamingo/__init__.py | 31 + .../configuration_musicflamingo.py | 210 ++ .../convert_musicflamingo_to_hf.py | 295 +++ .../models/musicflamingo/dataset.py | 1828 +++++++++++++++++ .../musicflamingo/modeling_musicflamingo.py | 626 ++++++ .../musicflamingo/modular_musicflamingo.py | 329 +++ .../musicflamingo/processing_musicflamingo.py | 330 +++ .../models/musicflamingo/rotary_embedding.py | 300 +++ .../models/musicflamingo/sound_encoder.py | 167 ++ 13 files changed, 4127 insertions(+) create mode 100644 src/transformers/models/musicflamingo/__init__.py create mode 100644 src/transformers/models/musicflamingo/configuration_musicflamingo.py create mode 100644 src/transformers/models/musicflamingo/convert_musicflamingo_to_hf.py create mode 100644 src/transformers/models/musicflamingo/dataset.py create mode 100644 src/transformers/models/musicflamingo/modeling_musicflamingo.py create mode 100644 src/transformers/models/musicflamingo/modular_musicflamingo.py create mode 100644 src/transformers/models/musicflamingo/processing_musicflamingo.py create mode 100644 src/transformers/models/musicflamingo/rotary_embedding.py create mode 100644 src/transformers/models/musicflamingo/sound_encoder.py diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 37f50eb4ad56..d01fc5f629cf 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -249,6 +249,7 @@ from .mpt import * from .mra import * from .mt5 import * + from .musicflamingo import * from .musicgen import * from .musicgen_melody import * from .mvp import * diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2b883b44c58c..2ae76d99d409 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -283,6 +283,8 @@ ("mpt", "MptConfig"), ("mra", "MraConfig"), ("mt5", "MT5Config"), + ("musicflamingo", "MusicFlamingoConfig"), + ("musicflamingo_encoder", "MusicFlamingoEncoderConfig"), ("musicgen", "MusicgenConfig"), ("musicgen_melody", "MusicgenMelodyConfig"), ("mvp", "MvpConfig"), @@ -745,6 +747,8 @@ ("mpt", "MPT"), ("mra", "MRA"), ("mt5", "MT5"), + ("musicflamingo", "MusicFlamingo"), + ("musicflamingo_encoder", "MusicFlamingoEncoder"), ("musicgen", "MusicGen"), ("musicgen_melody", "MusicGen Melody"), ("mvp", "MVP"), @@ -949,6 +953,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str]( [ ("audioflamingo3_encoder", "audioflamingo3"), + ("musicflamingo_encoder", "musicflamingo"), ("openai-gpt", "openai"), ("blip-2", "blip_2"), ("data2vec-audio", "data2vec"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 711c9553d45b..b178089e480b 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -283,6 +283,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("mpt", "MptModel"), ("mra", "MraModel"), ("mt5", "MT5Model"), + ("musicflamingo", "MusicFlamingoForConditionalGeneration"), + ("musicflamingo_encoder", 
"MusicFlamingoEncoder"), ("musicgen", "MusicgenModel"), ("musicgen_melody", "MusicgenMelodyModel"), ("mvp", "MvpModel"), @@ -505,6 +507,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("mpnet", "MPNetForMaskedLM"), ("mpt", "MptForCausalLM"), ("mra", "MraForMaskedLM"), + ("musicflamingo", "MusicFlamingoForConditionalGeneration"), ("mvp", "MvpForConditionalGeneration"), ("nanochat", "NanoChatForCausalLM"), ("nllb-moe", "NllbMoeForConditionalGeneration"), @@ -1175,6 +1178,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("marian", "MarianMTModel"), ("mbart", "MBartForConditionalGeneration"), ("mt5", "MT5ForConditionalGeneration"), + ("musicflamingo", "MusicFlamingoForConditionalGeneration"), ("mvp", "MvpForConditionalGeneration"), ("nllb-moe", "NllbMoeForConditionalGeneration"), ("pegasus", "PegasusForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c0d88d163b44..628f8594dfd0 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -112,6 +112,7 @@ ("mllama", "MllamaProcessor"), ("mm-grounding-dino", "GroundingDinoProcessor"), ("moonshine", "Wav2Vec2Processor"), + ("musicflamingo", "MusicFlamingoProcessor"), ("omdet-turbo", "OmDetTurboProcessor"), ("oneformer", "OneFormerProcessor"), ("ovis2", "Ovis2Processor"), diff --git a/src/transformers/models/musicflamingo/__init__.py b/src/transformers/models/musicflamingo/__init__.py new file mode 100644 index 000000000000..a9d654d9eb54 --- /dev/null +++ b/src/transformers/models/musicflamingo/__init__.py @@ -0,0 +1,31 @@ +# coding=utf-8 +# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights +# reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_musicflamingo import * + from .modeling_musicflamingo import * + from .processing_musicflamingo import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/musicflamingo/configuration_musicflamingo.py b/src/transformers/models/musicflamingo/configuration_musicflamingo.py new file mode 100644 index 000000000000..5fb0aaca5e5c --- /dev/null +++ b/src/transformers/models/musicflamingo/configuration_musicflamingo.py @@ -0,0 +1,210 @@ +# coding=utf-8 +# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights +# reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class MusicFlamingoEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MusicFlamingoEncoder`]. It is used to instantiate a
+    MusicFlamingo audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the MusicFlamingo
+    architecture.
+
+    e.g. [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_mel_bins (`int`, *optional*, defaults to 128):
+            Number of mel features used per input features. Should correspond to the value used in the
+            `MusicFlamingoProcessor` class.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of encoder layers.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 5120):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Dimensionality of the layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by dividing by sqrt(hidden_size).
+        max_source_positions (`int`, *optional*, defaults to 1500):
+            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
+
+    Example:
+
+    ```python
+    >>> from transformers import MusicFlamingoEncoderConfig, MusicFlamingoEncoder
+
+    >>> # Initializing a MusicFlamingoEncoderConfig
+    >>> configuration = MusicFlamingoEncoderConfig()
+
+    >>> # Initializing a MusicFlamingoEncoder (with random weights)
+    >>> model = MusicFlamingoEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "musicflamingo_encoder"
+
+    attribute_map = {
+        "d_model": "hidden_size",
+        "encoder_layers": "num_hidden_layers",
+        "encoder_attention_heads": "num_attention_heads",
+        "encoder_ffn_dim": "intermediate_size",
+        "encoder_layerdrop": "layerdrop",
+    }
+
+    def __init__(
+        self,
+        num_mel_bins=128,
+        num_hidden_layers=32,
+        num_attention_heads=20,
+        intermediate_size=5120,
+        layerdrop=0.0,
+        activation_function="gelu",
+        hidden_size=1280,
+        dropout=0.0,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        initializer_range=0.02,
+        scale_embedding=False,
+        max_source_positions=1500,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_mel_bins = num_mel_bins
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.initializer_range = initializer_range
+        self.layerdrop = layerdrop
+        self.scale_embedding = scale_embedding
+        self.max_source_positions = max_source_positions
+
+
+class MusicFlamingoConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MusicFlamingoForConditionalGeneration`]. It is used to instantiate a
+    MusicFlamingo model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the MusicFlamingo architecture.
+
+    e.g. [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        audio_config (`Union[MusicFlamingoEncoderConfig, dict]`, *optional*, defaults to `MusicFlamingoEncoderConfig`):
+            The config object or dictionary of the audio backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
+            The config object or dictionary of the text backbone.
+        audio_token_id (`int`, *optional*, defaults to 151669):
+            The audio token index to encode the audio prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            Activation function used in the projector.
+        projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to include bias terms in the projector.
+ + Example: + + ```python + >>> from transformers import MusicFlamingoForConditionalGeneration, MusicFlamingoConfig, MusicFlamingoEncoderConfig, Qwen2Config + + >>> # Initializing an MusicFlamingoEncoder config + >>> audio_config = MusicFlamingoEncoderConfig() + + >>> # Initializing a Qwen2 config + >>> text_config = Qwen2Config() + + >>> # Initializing an MusicFlamingo configuration + >>> configuration = MusicFlamingoConfig(audio_config, text_config) + + >>> # Initializing a model from the musicflamingo style configuration + >>> model = MusicFlamingoForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "musicflamingo" + sub_configs = { + "audio_config": MusicFlamingoEncoderConfig, + "text_config": AutoConfig, + } + + def __init__( + self, + audio_config=None, + text_config=None, + audio_token_id=151669, + projector_hidden_act="gelu", + projector_bias=True, + **kwargs, + ): + self.audio_token_id = audio_token_id + + if isinstance(audio_config, dict): + audio_config["model_type"] = audio_config.get("model_type", "musicflamingo_encoder") + audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config) + elif audio_config is None: + audio_config = CONFIG_MAPPING["musicflamingo_encoder"]() + + self.audio_config = audio_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "qwen2") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["qwen2"]() + + self.text_config = text_config + self.projector_hidden_act = projector_hidden_act + self.projector_bias = projector_bias + + super().__init__(**kwargs) + + +__all__ = ["MusicFlamingoConfig", "MusicFlamingoEncoderConfig"] diff --git a/src/transformers/models/musicflamingo/convert_musicflamingo_to_hf.py b/src/transformers/models/musicflamingo/convert_musicflamingo_to_hf.py new file mode 100644 index 000000000000..802b1a647826 --- /dev/null +++ b/src/transformers/models/musicflamingo/convert_musicflamingo_to_hf.py @@ -0,0 +1,295 @@ +# coding=utf-8 +# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights +# reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Convert MusicFlamingo checkpoints into a Hugging Face repository layout.""" + +from __future__ import annotations + +import argparse +import json +import logging +from collections import defaultdict +from pathlib import Path +from typing import Any + +import torch +from safetensors.torch import safe_open + +from transformers import ( + AutoTokenizer, + GenerationConfig, + MusicFlamingoConfig, + MusicFlamingoForConditionalGeneration, + MusicFlamingoProcessor, + Qwen2Config, + WhisperFeatureExtractor, +) + + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + +def _load_json(p: Path): + if not p.is_file(): + raise FileNotFoundError(f"Missing JSON: {p}") + with p.open("r", encoding="utf-8") as f: + return json.load(f) + + +def write_processor(src_root: Path, dst_root: Path): + llm_dir = src_root / "llm" + + system_prompt = ( + "You are Music Flamingo, a multimodal assistant for language and music. " + "On each turn you receive an audio clip which contains music and optional text, " + "you will receive at least one or both; use your world knowledge and reasoning " + "to help the user with any task. Interpret the entirety of the content any input music" + "--regardlenss of whether the user calls it audio, music, or sound." + ) + + # fmt: off + tokenizer_chat_template = ( + "{% if messages[0]['role'] != 'system' %}" + "{{ '<|im_start|>system\\n" + system_prompt + "<|im_end|>\\n' }}" + "{% endif %}" + "{% for message in messages if message['content'] is not none %}" + "{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}" + "{% endfor %}" + "{% if add_generation_prompt %}" + "{{ '<|im_start|>assistant\\n' }}" + "{% endif %}" + ) + # fmt: on + + # fmt: off + processor_chat_template = ( + "{% if messages[0]['role'] != 'system' %}" + "<|im_start|>system\n" + system_prompt + "<|im_end|>\n" + "{% endif %}" + "{% for m in messages if m['content'] is not none %}" + "<|im_start|>{{ m['role'] }}\n" + "{% if m['content'] is string %}" + "{{ m['content'] }}" + "{% else %}" + "{% set audio = namespace(found=False) %}" + "{% set text_buf = namespace(v='') %}" + "{% for c in m['content'] %}" + "{% if c.get('type') == 'audio' or 'audio' in c %}" + "{% set audio.found = True %}" + "{% elif c.get('type') == 'text' or 'text' in c %}" + "{% set text_buf.v = text_buf.v + c['text'] %}" + "{% endif %}" + "{% endfor %}" + "{% if audio.found %}{{ '' }}{% endif %}{{ text_buf.v }}" + "{% endif %}" + "<|im_end|>\n" + "{% endfor %}" + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + # fmt: on + + processor = MusicFlamingoProcessor( + feature_extractor=WhisperFeatureExtractor(feature_size=128, return_attention_mask=True), + tokenizer=AutoTokenizer.from_pretrained(str(llm_dir), chat_template=tokenizer_chat_template, use_fast=True), + chat_template=processor_chat_template, + ) + processor.save_pretrained(str(dst_root)) + + logger.info("processor (tokenizer + preprocessor)") + return processor + + +PREFIX_MAP = { + "llm": "language_model", + "sound_tower": "audio_tower", + "sound_mm_projector": "multi_modal_projector", +} + + +def _resolve_component_dir(dirpath: Path): + if not dirpath.is_dir(): + return None + idx = dirpath / "model.safetensors.index.json" + mono = dirpath / "model.safetensors" + if idx.exists(): + wm = _load_json(idx).get("weight_map") or {} + by_shard: dict[str, list[str]] = defaultdict(list) + for k, shard in wm.items(): + by_shard[shard].append(k) + return 
("sharded", dirpath, {k: sorted(v) for k, v in sorted(by_shard.items())}) + if mono.exists(): + return ("file", mono) + cands = sorted([x for x in dirpath.iterdir() if x.suffix == ".safetensors"]) + return ("file", cands[0]) if len(cands) == 1 else None + + +def merge_and_shard_weights(src_root: Path, dst_root: Path, processor: MusicFlamingoProcessor): + state: dict[str, Any] = {} + for tag in PREFIX_MAP.keys(): + comp = _resolve_component_dir(src_root / tag) + if not comp: + continue + + out_prefix = PREFIX_MAP.get(tag, tag) + + if comp[0] == "file": + fp: Path = comp[1] + with safe_open(str(fp), framework="pt", device="cpu") as f: + for k in f.keys(): + if k == "__metadata__": + continue + state[f"{out_prefix}.{k}"] = f.get_tensor(k) + else: + base: Path = comp[1] + shard_map: dict[str, list[str]] = comp[2] + for shard, keys in shard_map.items(): + sp = base / shard + with safe_open(str(sp), framework="pt", device="cpu") as f: + for k in keys: + state[f"{out_prefix}.{k}"] = f.get_tensor(k) + + if not state: + raise FileNotFoundError("No tensors found in llm/, sound_tower/, or sound_mm_projector/.") + + tok = processor.tokenizer + + text_config = Qwen2Config( + bos_token_id=tok.bos_token_id, + eos_token_id=tok.eos_token_id, + pad_token_id=tok.pad_token_id, + vocab_size=len(tok), + hidden_size=3584, + intermediate_size=18944, + model_max_length=8192, + num_attention_heads=28, + num_hidden_layers=28, + num_key_value_heads=4, + rope_theta=1000000.0, + use_cache=False, + ) + config = MusicFlamingoConfig(text_config=text_config, audio_token_id=tok.get_vocab()[""]) + model = MusicFlamingoForConditionalGeneration(config).to(dtype=torch.bfloat16) + + # Update state dict to new key names if necessary + projector_key_mapping = { + "multi_modal_projector.layers.0.weight": "multi_modal_projector.linear_1.weight", + "multi_modal_projector.layers.0.bias": "multi_modal_projector.linear_1.bias", + "multi_modal_projector.layers.2.weight": "multi_modal_projector.linear_2.weight", + "multi_modal_projector.layers.2.bias": "multi_modal_projector.linear_2.bias", + "audio_tower.sound_tower.pos_emb.freqs": "audio_tower.pos_emb.freqs", + } + for old_key, new_key in projector_key_mapping.items(): + if old_key in state: + state[new_key] = state.pop(old_key) + + # Load weights into the instantiated model so we can push via `push_to_hub` later. + load_res = model.load_state_dict(state, strict=True) + # Enforce a clean load + if getattr(load_res, "missing_keys", None) and load_res.missing_keys: + mk = load_res.missing_keys + raise ValueError(f"Missing keys when loading: {mk[:10]}{' ...' if len(mk) > 10 else ''}") + if getattr(load_res, "unexpected_keys", None) and load_res.unexpected_keys: + uk = load_res.unexpected_keys + raise ValueError(f"Unexpected keys when loading: {uk[:10]}{' ...' if len(uk) > 10 else ''}") + + generation_config = GenerationConfig( + bos_token_id=tok.bos_token_id, + eos_token_id=tok.eos_token_id, + pad_token_id=tok.pad_token_id, + max_new_tokens=2048, + ) + model.generation_config = generation_config + + model.save_pretrained(save_directory=str(dst_root)) + logger.info("model.safetensors index and shards") + return model + + +""" +Reproducible Usage +================== + +1) Download the original MusicFlamingo weights from NVIDIA (requires Git LFS): + +``` +git lfs install +git clone https://huggingface.co/nvidia/audio-flamingo-3 +``` + +This will create a folder `audio-flamingo-3/` containing the original components: +`llm/`, `sound_tower/`, and `sound_mm_projector/`. 
+ +2) Convert to the Hugging Face Transformers format (locally): + +``` +python src/transformers/models/musicflamingo/convert_musicflamingo_to_hf.py \ + --src_dir audio-flamingo-3 \ + --dst_dir audio-flamingo-3-hf +``` + +3) Convert and push directly to the Hub (requires `huggingface-cli login` or `HF_TOKEN`): + +``` +python src/transformers/models/musicflamingo/convert_musicflamingo_to_hf.py \ + --src_dir audio-flamingo-3 \ + --dst_dir audio-flamingo-3-hf \ + --push_to_hub /audio-flamingo-3 +``` + +This command uploads both the processor (tokenizer + feature extractor) and the converted +model (sharded safetensors + configs) to the specified Hub repository. +""" + + +def main() -> None: + ap = argparse.ArgumentParser(description="Convert MusicFlamingo to Hugging Face format.") + ap.add_argument("--src_dir", required=True, help="Source model root directory") + ap.add_argument("--dst_dir", required=True, help="Destination directory for converted model") + ap.add_argument( + "--push_to_hub", + default=None, + type=str, + help=( + "Optional repository ID to push the converted assets to the Hugging Face Hub, " + "e.g. 'username/audio-flamingo-3'." + ), + ) + args = ap.parse_args() + + src_root = Path(args.src_dir).resolve() + if not src_root.is_dir(): + raise FileNotFoundError(f"Source directory not found: {src_root}") + + dst_root = Path(args.dst_dir).resolve() + if dst_root.exists(): + raise FileExistsError(f"Destination already exists: {dst_root}") + + processor = write_processor(src_root, dst_root) + model = merge_and_shard_weights(src_root, dst_root, processor) + + # Optionally push converted assets using native push_to_hub only + if args.push_to_hub: + logger.info("Pushing processor to the Hub ...") + processor.push_to_hub(args.push_to_hub) + logger.info("Pushing model to the Hub ...") + model.push_to_hub(args.push_to_hub) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/musicflamingo/dataset.py b/src/transformers/models/musicflamingo/dataset.py new file mode 100644 index 000000000000..a970f4e186ff --- /dev/null +++ b/src/transformers/models/musicflamingo/dataset.py @@ -0,0 +1,1828 @@ +# Copyright (c) 2025 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/NVlabs/VILA/tree/main under the Apache 2.0 license. +# LICENSE is in incl_licenses directory. + +# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
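+
+# Orientation note for the training-side loading code in this file (a summary of what
+# `_load_sound` / `_load_sound_tar` below do, not additional behavior): each clip is cut into at
+# most `max_num_window` windows of 30 s at 16 kHz; every window is converted by the Whisper-style
+# feature extractor into a 128-bin log-mel spectrogram padded to 3000 frames, with a feature mask
+# marking the valid frames; the encoder side then downsamples those 3000 frames to 750 embedding
+# frames, which is why the embed masks and per-window timestamps have length 750 (0.04 s per step).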
+ +import base64 +import copy +import re +import io +import json +import os +import os.path as osp +import random +import time +import warnings +from dataclasses import dataclass +from typing import Dict, Sequence +import math +import numpy as np +import PIL +import torch +import transformers +from PIL import Image, ImageFile +from torch.utils.data import Dataset, default_collate +from transformers import PreTrainedTokenizer +from transformers import AutoFeatureExtractor +import kaldiio +import llava.data.datasets_mixture as datasets_mixture +from llava import conversation as conversation_lib +from llava.constants import DEFAULT_SOUND_TOKEN,DEFAULT_SPEECH_TOKEN, IGNORE_INDEX +from llava.data.collate import DataCollator +from llava.mm_utils import ( + load_audio, + get_num_windows, + tokenizer_image_token, +) +from llava.train.args import DataArguments, TrainingArguments +from llava.train.sequence_parallel import ( + extract_local_from_list, + extract_local_input_ids, + extract_local_position_ids, + get_pg_manager, +) +from llava.utils.tokenizer import preprocess_conversation +# import torchaudio +from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler, UniformClipSampler +import soundfile as sf +from librosa import resample as librosa_resample +import whisper +ImageFile.LOAD_TRUNCATED_IMAGES = True +PIL.Image.MAX_IMAGE_PIXELS = 1000000000 +import soundfile as sf +from librosa import resample as librosa_resample +import whisper +import librosa +from llava.utils.logging import logger +import os +from pydub import AudioSegment +import subprocess +import random +import io +import numpy as np +import librosa +from decord import AudioReader, cpu +import datetime +# SwiftStack / OpenStack credentials and endpoints +os.environ["ST_USER"] = "arushig" +os.environ["ST_KEY"] = "" +os.environ["ST_END_POINT"] = "https://pdx.s8k.io" +os.environ["ST_AUTH"] = "https://pdx.s8k.io/auth/v1.0" +os.environ["OS_STORAGE_URL"] = "https://pdx.s8k.io/v1/AUTH_team-nemo-data-acquisition" +# import torch +# torch.distributed.init_process_group( +# backend="nccl", +# timeout=datetime.timedelta(minutes=30) +# ) +from llava.data.audio_utils import _load_tarball_local_audio_byteseek +def int16_to_float32(x): + return (x / 32767.0).astype(np.float32) + + +def float32_to_int16(x): + x = np.clip(x, a_min=-1., a_max=1.) + return (x * 32767.).astype(np.int16) + +import warnings + +MAX_DURATION = 1000 # seconds +warnings.filterwarnings("ignore", category=FutureWarning) + +def preprocess_multimodal(sources: Sequence[str], data_args: DataArguments) -> Dict: + is_multimodal = data_args.is_multimodal + if not is_multimodal: + return sources + + for source in sources: + concat_values = "".join([sentence["value"] for sentence in source]) + for sid, sentence in enumerate(source): + # In multimodal conversations, we automatically prepend '' at the start of the first sentence if it doesn't already contain one. 
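+            # Illustrative example of the normalization below (assuming DEFAULT_SOUND_TOKEN is the
+            # literal "<sound>"): "<sound>Describe the track." -> "<sound>\nDescribe the track.",
+            # while a value that already has a newline after the token keeps exactly one, because
+            # the second replace collapses the doubled "\n\n" back to a single "\n".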
+ + if DEFAULT_SOUND_TOKEN in sentence["value"]: + sentence["value"] = sentence["value"].replace(DEFAULT_SOUND_TOKEN, f"{DEFAULT_SOUND_TOKEN}\n") + sentence["value"] = sentence["value"].replace(f"{DEFAULT_SOUND_TOKEN}\n\n", f"{DEFAULT_SOUND_TOKEN}\n") + if DEFAULT_SPEECH_TOKEN in sentence["value"]: + sentence["value"] = sentence["value"].replace(DEFAULT_SPEECH_TOKEN, f"{DEFAULT_SPEECH_TOKEN}\n") + sentence["value"] = sentence["value"].replace(f"{DEFAULT_SPEECH_TOKEN}\n\n", f"{DEFAULT_SPEECH_TOKEN}\n") + return sources + + +def read_audio_from_ss(container, object_name): + + + + def read_swift_file(container, object_name): + try: + result = subprocess.run( + ["swift", "download", container, object_name, "--output", "-"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True + ) + return result.stdout # returns bytes + except subprocess.CalledProcessError as e: + print("Error reading Swift object:", e.stderr.decode()) + return None + + + + data = read_swift_file(container, object_name) + + return data + +def get_mp4_duration_bytes(mp4_bytes): + """ + Get duration (in seconds) of MP4 audio bytes using ffprobe. + """ + try: + cmd = [ + "ffprobe", + "-v", "error", + "-show_entries", "format=duration", + "-of", "json", + "pipe:0" + ] + proc = subprocess.run( + cmd, + input=mp4_bytes, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True + ) + info = json.loads(proc.stdout) + duration = float(info["format"]["duration"]) + return duration + except Exception as e: + print("Failed to get duration:", e) + return None + + +def load_mp4_audio_bytes_librosa(mp4_bytes, sr=22050, mono=True): + """ + Load MP4 audio bytes into librosa using ffmpeg for decoding. + """ + try: + # Run ffmpeg to decode mp4 bytes into wav (PCM 16-bit) + ffmpeg_cmd = [ + "ffmpeg", + "-i", "pipe:0", # input from stdin + "-f", "wav", # output format + "pipe:1" # output to stdout + ] + proc = subprocess.run( + ffmpeg_cmd, + input=mp4_bytes, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True + ) + + wav_bytes = proc.stdout + # Load the decoded wav bytes with librosa + audio_buffer = io.BytesIO(wav_bytes) + y, sr = librosa.load(audio_buffer, sr=sr, mono=mono) + # print(f"Audio loaded: {y.shape[0]} samples at {sr} Hz") + return y, sr + except subprocess.CalledProcessError as e: + print("ffmpeg error:", e.stderr.decode()) + return None, None + except Exception as e: + print("Failed to load audio with librosa:", e) + return None, None + + +def preprocess_plain( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, +) -> Dict: + # add end signal and concatenate together + conversations = [] + for source in sources: + assert len(source) == 2 + assert DEFAULT_IMAGE_TOKEN in source[0]["value"] + source[0]["value"] = DEFAULT_IMAGE_TOKEN + conversation = source[0]["value"] + source[1]["value"] + conversation_lib.default_conversation.sep + conversations.append(conversation) + # tokenize conversations + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors="pt") for prompt in conversations] + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + tokenized_len = len(tokenizer_image_token(source[0]["value"], tokenizer)) + target[:tokenized_len] = IGNORE_INDEX + + return dict(input_ids=input_ids, labels=targets) + + +def preprocess( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False, + no_system_prompt: bool = False, +) -> Dict: + if conversation_lib.default_conversation.sep_style == 
conversation_lib.SeparatorStyle.PLAIN: + return preprocess_plain(sources, tokenizer) + return default_collate( + [ + preprocess_conversation(conversation, tokenizer, no_system_prompt=no_system_prompt) + for conversation in sources + ] + ) + + +class LazySupervisedDataset(Dataset): + """Dataset for supervised fine-tuning. + This class is originally implemented by the LLaVA team and modified by + Ji Lin and Haotian Tang. + """ + + def __init__( + self, + data_path: str, + image_folder: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + training_args: TrainingArguments, + ): + super().__init__() + if os.path.isdir(data_path): + list_data_dict=[] + for fname in sorted(os.listdir(data_path)): + if fname.endswith(".ndjson") or fname.endswith(".jsonl"): + full_path = os.path.join(data_path, fname) + with open(full_path, "r", encoding="utf-8") as fp: + for line in fp: + line = line.strip() + if line: + list_data_dict.append(json.loads(line)) + else: + try: + with open(data_path) as fp: + list_data_dict = json.load(fp) + except: + with open(data_path) as fp: + list_data_dict = [json.loads(q) for q in fp] + + # rank0_print("Formatting inputs...Skip in lazy mode") + print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.list_data_dict = list_data_dict + self.data_args = data_args + self.image_folder = image_folder + self.wav_processor = AutoFeatureExtractor.from_pretrained('Qwen/Qwen2-Audio-7B') + self.tar_handles = {} + def __len__(self): + return len(self.list_data_dict) + + @property + def lengths(self): + length_list = [] + for sample in self.list_data_dict: + img_tokens = 128 if "image" in sample else 0 + length_list.append(sum(len(conv["value"].split()) for conv in sample["conversations"]) + img_tokens) + return length_list + + @property + def modality_lengths(self): + length_list = [] + for sample in self.list_data_dict: + if 'duration' in sample.keys(): + duration = sample["duration"] + else: + duration = 10. + try: + cur_len = sum(len(conv["value"].split()) for conv in sample["conversations"]) + int(math.ceil(duration * 25)) + cur_len = cur_len if "sound" in sample else -cur_len + length_list.append(cur_len) + except: + try: + cur_len = 0 + int(math.ceil(duration * 25)) + cur_len = cur_len if "sound" in sample else -cur_len + length_list.append(cur_len) + except: + cur_len = 0 + int(math.ceil(10. 
* 25)) + cur_len = cur_len if "sound" in sample else -cur_len + length_list.append(cur_len) + return length_list + + + @staticmethod + def _load_sound_tar(audio_metadata,audio_dataset_name, tar_handles, wav_processor, sample_rate=16000, window_length=30.0, window_overlap=0.0, max_num_window=3, audio_start = 0.0): + + window_length = int(window_length * sample_rate) + window_overlap = int(window_overlap * sample_rate) + max_num_window = int(max_num_window) + duration = max_num_window * (window_length - window_overlap) + window_overlap + absolute_path_to_tarball = '/lustre/fs1/portfolios/llmservice/projects/llmservice_fm_audio/users/arushig/datasets/tarred/audio_flamingo' + sound_outputs = [] + audio_feature_masks = [] + audio_embed_masks = [] + audio_times =[] + # try: + audio_data, tar_handles = _load_tarball_local_audio_byteseek(absolute_path_to_tarball, audio_metadata, tar_handles,audio_dataset_name) + # print(f"audio_data: {audio_data.shape}") + T = len(audio_data) + + audio_data = audio_data.reshape(1, -1) + num_windows, full_length = get_num_windows(T, sample_rate, max_num_window) + + audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float() + count = 0 + for i in range(num_windows): + audio_time= torch.zeros(750) + audio_embed_mask = torch.zeros(750) + start = i * (window_length - window_overlap) + audio_data_tensor_this = audio_data_tensor[:, start:start+window_length] + orig_length = audio_data_tensor_this.shape[1] + audio_data_tensor_this = wav_processor(audio_data_tensor_this.cpu().numpy(), sampling_rate=sample_rate, return_tensors="pt") #.squeeze(0) text="dummy", audios=audio_data_tensor_this, return_tensors="pt") # + sound_outputs.append(audio_data_tensor_this["input_features"]) + # calculate the mask for the input melspec to Whisper + melspec_frames_this_window = int(math.ceil(orig_length / 160)) + feature_attention_mask = torch.zeros(3000, dtype=torch.int32) + feature_attention_mask[:melspec_frames_this_window] = 1 + audio_feature_masks.append(feature_attention_mask.unsqueeze(0)) + # calculate the mask for the output embedding for use in AF2 + conv_lengths = (melspec_frames_this_window - 1) // 2 + 1 + output_embedding_lengths = (conv_lengths - 2) // 2 + 1 + audio_embed_mask[:output_embedding_lengths] = 1 + audio_embed_masks.append(audio_embed_mask) + time = torch.arange(count * 30, (count+1) * 0.04 * 750, 0.04) + audio_time[:750] = time[:750] + audio_times.append(audio_time) + count = count + 1 + # except: + # print('error loading file', sound_file) + # sound_outputs.append(torch.zeros(1,128,3000)) + # audio_feature_masks.append(torch.zeros(1,3000, dtype=torch.int32)) + # audio_embed_masks.append(torch.zeros(750)) + # audio_times.append(torch.zeros(750)) + + return torch.stack(sound_outputs, dim=0), torch.stack(audio_feature_masks, dim=0), torch.stack(audio_embed_masks, dim=0), torch.stack(audio_times, dim=0), tar_handles + + + @staticmethod + def _load_sound(sound_file, wav_processor, sample_rate=16000, window_length=30.0, window_overlap=0.0, max_num_window=3, audio_start = 0.0): + if sound_file is None: + return None + window_length = int(window_length * sample_rate) + window_overlap = int(window_overlap * sample_rate) + max_num_window = int(max_num_window) + duration = max_num_window * (window_length - window_overlap) + window_overlap + + sound_outputs = [] + audio_feature_masks = [] + audio_embed_masks = [] + audio_times =[] + # try: + if 's3://' in sound_file: + # Remove the s3:// prefix and split into bucket and key + path = 
sound_file.replace("s3://", "") + parts = path.split("/", 1) + bucket_name = parts[0] + key = parts[1] + audio_data_bytes = read_audio_from_ss(bucket_name, key) + audio_data, sr = load_mp4_audio_bytes_librosa(audio_data_bytes, sr=16000) + else: + sound_filename = str.split(sound_file, '/')[-1] + if '.ark' in sound_filename: + sound = kaldiio.load_mat(sound_file) + audio_data = sound[1] + audio_data=audio_data.astype(np.float16) + # elif '.mp4' in sound_filename: + # # Load audio from video file + # ar = AudioReader(sound_file, ctx=cpu(0), sample_rate=sample_rate, mono=True) + # cur_max_length = ar.shape[1] + # audio_data = ar[0:cur_max_length].asnumpy()[0] # Load the first CHUNK_LENGTH seconds + # elif '.m4a' in sound_filename: + # # Load audio from video file + # # ar = AudioReader(sound_file, ctx=cpu(0), sample_rate=sample_rate, mono=True) + # # cur_max_length = ar.shape[1] + # # audio_data = ar[0:cur_max_length].asnumpy()[0] + # audio = AudioSegment.from_file(sound_file, format="m4a") + + # # Export to bytes + # buffer = io.BytesIO() + # audio.export(buffer, format="wav") + # buffer.seek(0) + + # audio_data, _ = librosa.load(buffer, sr=16000) + else: + audio_data = load_audio(sound_file, sample_rate, duration, audio_start) # already cuts to max duration + T = len(audio_data) + + audio_data = audio_data.reshape(1, -1) + num_windows, full_length = get_num_windows(T, sample_rate, max_num_window) + + audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float() + count = 0 + for i in range(num_windows): + audio_time= torch.zeros(750) + audio_embed_mask = torch.zeros(750) + start = i * (window_length - window_overlap) + audio_data_tensor_this = audio_data_tensor[:, start:start+window_length] + orig_length = audio_data_tensor_this.shape[1] + audio_data_tensor_this = wav_processor(audio_data_tensor_this.cpu().numpy(), sampling_rate=sample_rate, return_tensors="pt") #.squeeze(0) text="dummy", audios=audio_data_tensor_this, return_tensors="pt") # + sound_outputs.append(audio_data_tensor_this["input_features"]) + # calculate the mask for the input melspec to Whisper + melspec_frames_this_window = int(math.ceil(orig_length / 160)) + feature_attention_mask = torch.zeros(3000, dtype=torch.int32) + feature_attention_mask[:melspec_frames_this_window] = 1 + audio_feature_masks.append(feature_attention_mask.unsqueeze(0)) + # calculate the mask for the output embedding for use in AF2 + conv_lengths = (melspec_frames_this_window - 1) // 2 + 1 + output_embedding_lengths = (conv_lengths - 2) // 2 + 1 + audio_embed_mask[:output_embedding_lengths] = 1 + audio_embed_masks.append(audio_embed_mask) + time = torch.arange(count * 30, (count+1) * 0.04 * 750, 0.04) + audio_time[:750] = time[:750] + audio_times.append(audio_time) + count = count + 1 + # except: + # print('error loading file', sound_file) + # sound_outputs.append(torch.zeros(1,128,3000)) + # audio_feature_masks.append(torch.zeros(1,3000, dtype=torch.int32)) + # audio_embed_masks.append(torch.zeros(750)) + # audio_times.append(torch.zeros(750)) + + return torch.stack(sound_outputs, dim=0), torch.stack(audio_feature_masks, dim=0), torch.stack(audio_embed_masks, dim=0), torch.stack(audio_times, dim=0) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + + sources = self.list_data_dict[i] + if isinstance(i, int): + sources = [sources] + assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME + + entry = self.list_data_dict[i] + try: + if "sound" in entry or "audio" in entry: + # arushig: not 
modifying the multi turn chat code for not to support tar format + sound_path = entry["sound"] if "sound" in entry else entry["audio"]['path'] + if isinstance(sound_path, list): + sound_files = entry["sound"] # e.g. ["a.wav","b.wav",...] + conversations_raw = entry["conversations"] # list of {"from","value"} + # 1) Collect tag occurrences in textual order (e.g., , , …) + tag_re = re.compile(r"") + ordered_tags = [] # e.g., ["", "", "", ...] + for turn in conversations_raw: + for m in tag_re.findall(turn["value"]): + ordered_tags.append(f"") + + # 2) Load each referenced file once; cache by index k (1-based tag) + # Assumption: sound_files[k-1] corresponds to + cache = {} # k -> (windows_tensor [W, ...], feat_mask [W,...], embed_mask [W,...]) + def _get_cached(k: int): + if k in cache: + return cache[k] + if k < 1 or k > len(sound_files): + raise ValueError(f"Tag refers to missing sound file at index {k-1}.") + snd_path = sound_files[k-1] + # your loader returns (windows, feat_mask, embed_mask); windows is often [W, 1, 750, 2048] or [W, 750, 2048] + win, af_mask, ae_mask,a_time = self._load_sound(snd_path, self.wav_processor, + max_num_window=self.data_args.audio_frames) + # match your single-audio behavior: squeeze the extra batch dim if present + # (keeps shape [W, 750, 2048] or [W, 1, 750, 2048] depending on your SoundTower path) + win = win.squeeze(1) + cache[k] = (win, af_mask, ae_mask, a_time) + return cache[k] + + # 3) Build replacement strings and flatten windows/masks in the exact tag order + sound_tensors = [] # list of [Wk, ...] to be cat later + sound_feat_masks = [] # list of [Wk, ...] + sound_embed_masks = [] # list of [Wk, ...] + sound_times= [] + token_map = {} # "" -> "\n" * Wk (for replacement) + for tag in ordered_tags: + k = int(tag.split("-")[1][:-1]) # "" -> 12 + win, af_mask, ae_mask, a_time = _get_cached(k) + # append these windows for this *occurrence* of the tag + sound_tensors.append(win) # [Wk, ...] + sound_feat_masks.append(af_mask) # [Wk, ...] + sound_embed_masks.append(ae_mask) # [Wk, ...] + sound_times.append(a_time) + # remember replacement string for this tag ( repeated Wk times, newline after each) + if tag not in token_map: + Wk = win.shape[0] + token_map[tag] = ("\n" * Wk).rstrip() + + # 4) Replace with repeated "\n" in the conversation + conversation = [] + for turn in conversations_raw: + role, value = turn["from"], turn["value"] + for tag, marker in token_map.items(): + value = value.replace(tag, marker) + conversation.append({"from": role, "value": value.rstrip()}) + + # 5) Finalize sources (chat) + stack media + sources = [conversation] + # If no tags were present, fall back to text-only like below. + if len(sound_tensors) > 0: + sound_tensor = torch.cat(sound_tensors, dim=0) + audio_feature_masks = torch.cat(sound_feat_masks, dim=0) + audio_embed_masks = torch.cat(sound_embed_masks, dim=0) + audio_times = torch.cat(sound_times, dim=0) + else: + + sound_tensor = None + audio_feature_masks = None + audio_embed_masks = None + audio_times = None + + # single turn loading + elif isinstance(sound_path, str): + if "sound" in self.list_data_dict[i]: + + sound_file = self.list_data_dict[i]["sound"] + # print(sound_file) + if 'duration' in self.list_data_dict[i]: + duration = self.list_data_dict[i]["duration"] + # print(duration) + else: + if ".mat" in sound_file or ".ark" in sound_file: + duration = 30. + else: + try: + duration = librosa.get_duration(path=sound_file) + except: + print(f'error getting duration for {sound_file}') + duration = 30. 
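+                        # Overlong clips are not loaded: anything past the ~1000 s cutoff (cf.
+                        # MAX_DURATION near the top of this file) falls through to the resampling
+                        # branch below, which draws a different random index instead.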
+ if duration > 1000.: + print(f'duration is long: {duration}') + # logger.exception(f"Error processing instance '{self.list_data_dict[i]}':. Resampling.") + return self.__getitem__(random.randint(0, len(self.list_data_dict) - 1)) + else: + question = str(self.list_data_dict[i]["conversations"][0]["value"].rstrip()) + answer = str(self.list_data_dict[i]["conversations"][1]["value"]).rstrip() + question = question.replace("\n", "").replace("\n", "").replace("", "") + question = question.replace("\n", "").replace("\n", "").replace("", "") + question = question.replace("\n", "").replace("\n", "").replace("", "") + question = question.replace("\n", "").replace("\n", "").replace("", "") + sound_tensor, audio_feature_masks, audio_embed_masks, audio_times = self._load_sound(sound_file, self.wav_processor, max_num_window=self.data_args.audio_frames) + sound_tensor=sound_tensor.squeeze(1) # squeeze the irrelevant dimension which was caused due to processor getting 1 batch for processing --> (windows x 750 x 2048) + + if random.choice([True, False]): + question = "" * sound_tensor.shape[0] + "\n" + question + else: + question = question + "\n" + "" * sound_tensor.shape[0] + conversation = [ + {"from": "human", "value": question}, + {"from": "gpt", "value": answer}, + ] + + sources = [conversation] + else: + audio_metadata = self.list_data_dict[i]["audio"] + # print(audio_metadata) + audio_dataset_name = self.list_data_dict[i]["dataset"] + try: + duration = self.list_data_dict[i]["audio"]['duration'] + except: + print("no duration found") + duration = 30. + if duration > 1000.0: + print(f'duration is long: {duration}') + return self.__getitem__(random.randint(0, len(self.list_data_dict) - 1)) + else: + question = str(self.list_data_dict[i]["conversations"][0]["value"].rstrip()) + answer = str(self.list_data_dict[i]["conversations"][1]["value"]).rstrip() + question = question.replace("\n", "").replace("\n", "").replace("", "") + question = question.replace("\n", "").replace("\n", "").replace("", "") + question = question.replace("\n", "").replace("\n", "").replace("", "") + question = question.replace("\n", "").replace("\n", "").replace("", "") + sound_tensor, audio_feature_masks, audio_embed_masks, audio_times, tar_handles = self._load_sound_tar(audio_metadata, audio_dataset_name, self.tar_handles, self.wav_processor, max_num_window=self.data_args.audio_frames) + sound_tensor=sound_tensor.squeeze(1) # squeeze the irrelevant dimension which was caused due to processor getting 1 batch for processing --> (windows x 750 x 2048) + self.tar_handles = tar_handles + if random.choice([True, False]): + question = "" * sound_tensor.shape[0] + "\n" + question + else: + question = question + "\n" + "" * sound_tensor.shape[0] + conversation = [ + {"from": "human", "value": question}, + {"from": "gpt", "value": answer}, + ] + + sources = [conversation] + # text-only data loading + else: + question = str(self.list_data_dict[i]["conversations"][0]["value"].rstrip()) + answer = str(self.list_data_dict[i]["conversations"][1]["value"]).rstrip() + conversation = [ + {"from": "human", "value": question}, + {"from": "gpt", "value": answer}, + ] + + sources = [conversation] + except: + print('error loading file', sound_path) + # logger.exception(f"Error processing instance '{self.list_data_dict[i]}':. 
Resampling.") + return self.__getitem__(random.randint(0, len(self.list_data_dict) - 1)) + data_dict = preprocess( + sources, + self.tokenizer, + has_image=( + "sound" in self.list_data_dict[i] or "audio" in self.list_data_dict[i] + ), + ) + + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0]) + if (len(data_dict["input_ids"]) + len(data_dict["labels"])) > 24000: + print(f'length of input data long: {len(data_dict["input_ids"]) + len(data_dict["labels"])}') + # logger.exception(f"Error processing instance: Resampling. data_path: {self.data_path}") + return self.__getitem__(random.randint(0, len(self.list_data_dict) - 1)) + if "sound" in self.list_data_dict[i] or "audio" in self.list_data_dict[i]: + data_dict["sound"] = sound_tensor + data_dict["sound_feature_masks"] = audio_feature_masks + data_dict["sound_embed_masks"] = audio_embed_masks + data_dict["sound_times"] = audio_times + else: + data_dict["sound"] = None + data_dict["sound_feature_masks"] = None + data_dict["sound_embed_masks"] = None + data_dict["sound_times"] = None + + return data_dict + + +class LazyMMC4Dataset(Dataset): + """Dataset for supervised fine-tuning. + This class is implemented by Ji Lin and Haotian Tang.""" + + def __init__( + self, + data_path: str, + image_folder: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + training_args: TrainingArguments, + image_following_text_only=False, + text_only=False, + ): + super().__init__() + + import pickle + + n_samples = [] + # actually shards and stats info + n_shards = len(os.listdir(data_path)) // 2 + # n_shards = 100 + count_info_list = sorted([f for f in os.listdir(data_path) if f.endswith(".count")])[:n_shards] + n_samples = [int(open(os.path.join(data_path, f)).read().strip()) for f in count_info_list] + + print("total MMC4 samples", sum(n_samples)) # 10,881,869 + + PROCESS_GROUP_MANAGER = get_pg_manager() + if PROCESS_GROUP_MANAGER is not None: + import torch.distributed as dist + + sequence_parallel_size = training_args.seq_parallel_size + else: + sequence_parallel_size = 1 + print("sequence_parallel_size", sequence_parallel_size) + rank = training_args.process_index // sequence_parallel_size # int(os.environ["RANK"]) + world_size = training_args.world_size // sequence_parallel_size # int(os.environ["WORLD_SIZE"]) + shared_size = n_shards // world_size + + gpu_samples = [sum(n_samples[i * shared_size : (i + 1) * shared_size]) for i in range(world_size)] + self.n_samples = min(gpu_samples) * world_size # total size + self.idx_offset = rank * min(gpu_samples) + shard_start, shard_end = rank * shared_size, (rank + 1) * shared_size + print(f" * loading data from shard {shard_start}-{shard_end}") + + shard_names = [d.replace(".count", ".pkl") for d in count_info_list] + shard_names = shard_names[shard_start:shard_end] + + full_data_list = [] + # now load data + for shard_name in shard_names: + # load shard + with open(os.path.join(data_path, shard_name), "rb") as f: + data_list = pickle.load(f) + + full_data_list.extend(data_list) + + print(f"* loaded totally {len(full_data_list)} samples") + + self.data_list = full_data_list + + self.tokenizer = tokenizer + self.data_args = data_args + self.image_folder = image_folder + + self.image_following_text_only = image_following_text_only + self.text_only = text_only + + def __len__(self): + # return len(self.data_list) + return self.n_samples + + @property + def modality_lengths(self): + # Estimate the number of tokens after 
tokenization, used for length-grouped sampling + length_list = [] + for info in self.data_list: + num_images = min(6, len(info["image_info"])) + sentences = [info["text_list"][x["matched_text_index"]] for x in info["image_info"][:num_images]] + # The unit of cur_len is "words". We assume 1 word = 2 tokens. + cur_len = num_images * self.num_image_tokens // 2 + sum([len(x) for x in sentences]) + length_list.append(cur_len) + return length_list + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + info = self.data_list[i - self.idx_offset] + + sentences = info["text_list"] + # kentang-mit@: remove existing tokens in the sentences + for ix in range(len(sentences)): + # if this is an html tag, we still preserve its semantic meaning + sentences[ix] = sentences[ix].replace("", "") + sim_matrix = info["similarity_matrix"] # we do not use this... + + # convert images from base64 to PIL and filter based on image-text similarity + images, sentence_ixs = [], [] + if not self.text_only: + for sample_image, sim_vec in zip(info["image_info"], sim_matrix): + image_base64 = sample_image["image_base64"] + rawbytes = base64.b64decode(image_base64) + + sim_ix = sample_image["matched_text_index"] + # sim_ix = np.argmax(sim_vec) + # sim_score = sim_vec[sim_ix] + + # filter to images >= 5KB + # if len(rawbytes) // 1000 <= 5: + # continue + # if sim_score < 0.24: + # continue + image = Image.open(io.BytesIO(rawbytes)).convert("RGB") + + images.append(image) + sentence_ixs.append(sim_ix) + + # constrain max num 6 images + max_num_images = 6 + if len(images) > max_num_images: + images = images[:max_num_images] + sentence_ixs = sentence_ixs[:max_num_images] + + # reorder images according to text insertion + images = [images[iii] for iii in np.argsort(sentence_ixs)] + + # preprocess and tokenize text + for ix in sentence_ixs: + sentences[ix] = f"\n{sentences[ix]}" + + if self.image_following_text_only: + # use pad tokens to divide sentence pieces + text = self.tokenizer.pad_token.join(sentences) + else: + text = " ".join(sentences) + # whitespace cleanup + text = text.replace(" ", "").replace(" ", "") + text = f"{text}{self.tokenizer.eos_token}" # add eos token + + if len(images) > 0: + if self.data_args.image_aspect_ratio == "dynamic_s2": + images, block_sizes = dynamic_s2_process_images_and_prompt( + images, text, self.data_args, self.image_folder + ) + elif self.data_args.image_aspect_ratio == "dynamic": + images, text = dynamic_process_images_and_prompt( + images, text, self.data_args, self.image_folder, max_tiles=6 + ) + else: + images = torch.stack([process_image(image, self.data_args, self.image_folder) for image in images]) + + # the same size for all images, so we concat + # cur_token_len = ( + # images[0].shape[-2] // self.multimodal_cfg["patch_size"] + # ) * (images[0].shape[-1] // self.multimodal_cfg["patch_size"]) + # cur_token_len += self.multimodal_cfg["n_extra_patch"] + else: + images = None + # cur_token_len = 0 + + input_ids = tokenizer_image_token( + text, + self.tokenizer, + return_tensors="pt", + ) + + image_token_id = self.tokenizer.media_token_ids["image"] + + # now check the case where the last token is image patch token + if input_ids[-1] == image_token_id: # need to remove one last image + last_non_im_patch_indices = torch.where(input_ids != image_token_id)[0][-1] + 1 + input_ids = input_ids[:last_non_im_patch_indices] + + n_im_patch = (input_ids == image_token_id).sum().item() + + if self.data_args.image_aspect_ratio != "dynamic_s2": + images = images[:n_im_patch] + assert len(images) 
== n_im_patch, print(text, input_ids) + assert len(input_ids.shape) == 1, "Unexpected shape of 'input_ids' from MMC4." + input_ids = ( + torch.concat([torch.tensor([self.tokenizer.bos_token_id]), input_ids]) + if self.tokenizer.bos_token_id is not None and input_ids[0] != self.tokenizer.bos_token_id + else input_ids + ) + targets = input_ids.clone() + + if self.image_following_text_only: # keep only text after leading image token + # remove loss for any token before the first token + label_idx = 0 + while label_idx < targets.shape[-1] and targets[label_idx] != image_token_id: + targets[label_idx] = IGNORE_INDEX + label_idx += 1 + + pad_token = self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0] + + pad_token_idxs = torch.where(targets == pad_token)[0] + for pad_token_idx in pad_token_idxs: + token_idx = pad_token_idx + 1 + while token_idx < targets.shape[-1] and targets[token_idx] != image_token_id: + targets[token_idx] = IGNORE_INDEX + token_idx += 1 + # do not train on padding tokens + targets[targets == pad_token] = IGNORE_INDEX + + # mask image tokens is unnecessary for llava-1.5 + # targets[targets == IMAGE_TOKEN_INDEX] = IGNORE_INDEX + # print(input_ids.shape) + + data_dict = dict(input_ids=input_ids, labels=targets, image=images) + if self.data_args.image_aspect_ratio == "dynamic_s2": + data_dict["block_sizes"] = block_sizes + + return data_dict + + +class LazyCoyoDataset(Dataset): + """Dataset for supervised fine-tuning. + This class is implemented by Ji Lin and Haotian Tang.""" + + num_image_tokens = 576 + + def __init__( + self, + data_path: str, + image_folder: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + training_args: TrainingArguments, + # kentang-mit@: balance the total number of tokens for Coyo and MMC4. 
+ n_samples_per_idx=4, + ): + super().__init__() + + import pickle + + n_samples = [] + # actually shards and stats info + n_shards = len(os.listdir(data_path)) // 2 + # n_shards = 100 + count_info_list = sorted([f for f in os.listdir(data_path) if f.endswith(".count")])[:n_shards] + n_samples = [int(open(os.path.join(data_path, f)).read().strip()) for f in count_info_list] + + print("total COYO samples", sum(n_samples)) + + PROCESS_GROUP_MANAGER = get_pg_manager() + if PROCESS_GROUP_MANAGER is not None: + import torch.distributed as dist + + sequence_parallel_size = training_args.seq_parallel_size + else: + sequence_parallel_size = 1 + print("sequence_parallel_size", sequence_parallel_size) + rank = training_args.process_index // sequence_parallel_size # int(os.environ["RANK"]) + world_size = training_args.world_size // sequence_parallel_size # int(os.environ["WORLD_SIZE"]) + shared_size = n_shards // world_size + + gpu_samples = [ + sum(n_samples[i * shared_size : (i + 1) * shared_size]) // n_samples_per_idx for i in range(world_size) + ] + self.n_samples = min(gpu_samples) * world_size # total size + self.idx_offset = rank * min(gpu_samples) + + shard_start, shard_end = rank * shared_size, (rank + 1) * shared_size + print(f" * loading data from shard {shard_start}-{shard_end}") + + shard_names = [d.replace(".count", ".pkl") for d in count_info_list] + shard_names = shard_names[shard_start:shard_end] + + full_data_list = [] + # now load data + for shard_name in shard_names: + # load shard + with open(os.path.join(data_path, shard_name), "rb") as f: + shard_data = pickle.load(f) + random.seed(42) + if "mmc4" in data_path: + random.shuffle(shard_data) # shuffle for MMC4cap only + full_data_list.extend(shard_data) + + print(f"* loaded totally {len(full_data_list)} samples") + + # now pack the samples into groups + n_groups = len(full_data_list) // n_samples_per_idx + full_data_list = [ + full_data_list[i : i + n_samples_per_idx] for i in range(0, len(full_data_list), n_samples_per_idx) + ] + if len(full_data_list[-1]) < n_samples_per_idx: + full_data_list = full_data_list[:-1] + assert len(full_data_list) == n_groups + print(f"split into {n_groups} groups") + + self.data_list = full_data_list + + self.tokenizer = tokenizer + self.data_args = data_args + self.image_folder = image_folder + + def __len__(self): + # return len(self.data_list) + return self.n_samples + + @property + def modality_lengths(self): + # Estimate the number of tokens after tokenization, used for length-grouped sampling + length_list = [] + for samples in self.data_list: + cur_len = sum([len(conv["text" if "text" in conv else "caption"].split()) for conv in samples]) + # The unit of cur_len is "words". We assume 1 word = 2 tokens. + cur_len = cur_len + len(samples) * self.num_image_tokens // 2 + length_list.append(cur_len) + return length_list + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + CONCAT_SAMPLES = False + info_list = self.data_list[i - self.idx_offset] + + text_list = [] + image_list = [] + + for sample in info_list: + caption_key = ( + "text" if "text" in sample else "caption" + ) # kentang-mit@: remove existing tokens in the sentences + # kentang-mit@: remove existing token. 
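+ # each sample contributes DEFAULT_IMAGE_TOKEN + "\n" + caption + eos_token to text_list; the caption may be stored under either a "text" or a "caption" key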
+ # if this is an html tag, we still preserve its semantic meaning + sample[caption_key] = sample[caption_key].replace("", "") + text_list.append(DEFAULT_IMAGE_TOKEN + "\n" + sample[caption_key] + self.tokenizer.eos_token) + if "image" in sample: + image_base64 = sample["image"] + rawbytes = base64.b64decode(image_base64) + else: + rawbytes = sample["rawbytes"] + image = Image.open(io.BytesIO(rawbytes)).convert("RGB") + image_list.append(image) + + image_list = torch.stack([process_image(image, self.data_args, self.image_folder) for image in image_list]) + + if CONCAT_SAMPLES: + # into capcap... + text_list = "".join(text_list) + + input_ids = self.tokenizer( + text_list, + return_tensors="pt", + padding="longest", + max_length=self.tokenizer.model_max_length, + truncation=True, + ).input_ids # 4, seq_len + + input_ids = input_ids[0] + + else: + input_ids = [ + tokenizer_image_token( + prompt, + self.tokenizer, + return_tensors="pt", + ) + for prompt in text_list + ] + # print([x.shape[0] for x in input_ids], [len(x.split()) for x in text_list], [len(re.findall(r"]*>", x)) for x in text_list]) + + # input_ids = torch.nn.utils.rnn.pad_sequence( + # input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id + # ) + + targets = copy.deepcopy(input_ids) + for i in range(len(targets)): + targets[i][targets[i] == self.tokenizer.pad_token_id] = IGNORE_INDEX + + return dict(input_ids=input_ids, labels=targets, image=image_list) + + +class LazyWDSDataset(Dataset): + """Dataset for supervised fine-tuning. + This class is implemented by Ji Lin and Ligeng Zhu.""" + + def __init__( + self, + data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + image_folder: str, + training_args: TrainingArguments, + ): + super().__init__() + n_samples = [] + n_shards = len(os.listdir(data_path)) // 3 + for shard in range(n_shards): + with open(os.path.join(data_path, f"{shard:05d}_stats.json")) as f: + info = json.load(f) + n_samples.append(info["successes"]) + + # print(f"[DEBUG] {data_path} total samples", sum(n_samples)) # 10,881,869 + + PROCESS_GROUP_MANAGER = get_pg_manager() + if PROCESS_GROUP_MANAGER is not None: + import torch.distributed as dist + + sequence_parallel_size = training_args.seq_parallel_size + else: + sequence_parallel_size = 1 + print("sequence_parallel_size", sequence_parallel_size) + rank = training_args.process_index // sequence_parallel_size # int(os.environ["RANK"]) + world_size = training_args.world_size // sequence_parallel_size # int(os.environ["WORLD_SIZE"]) + shared_size = n_shards // world_size + print("rank", rank, "world_size", world_size, "shared_size", shared_size) + gpu_samples = [sum(n_samples[i * shared_size : (i + 1) * shared_size]) for i in range(world_size)] + self.n_samples = min(gpu_samples) * world_size # total size + self.idx_offset = rank * min(gpu_samples) + shard_start, shard_end = rank * shared_size, (rank + 1) * shared_size + print(f" * loading data from shard {shard_start}-{shard_end}") + + tar_list = [f"{shard_idx:05d}.tar" for shard_idx in range(shard_start, shard_end)] + + self.data_list = [] + t1 = time.time() + for tar in tar_list: + tmp_path = f"/tmp/ccs{tar}" + tar_path = os.path.join(data_path, tar) + + if PROCESS_GROUP_MANAGER is not None: + dist.barrier() + if PROCESS_GROUP_MANAGER.sp_rank == 0: + os.makedirs(tmp_path, exist_ok=True) + os.system(f"tar -xkf {tar_path} -C {tmp_path}") + dist.barrier() + else: + os.makedirs(tmp_path, exist_ok=True) + os.system(f"tar -xkf {tar_path} -C {tmp_path}") + + 
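# each extracted shard directory holds paired "<stem>.txt" caption and "<stem>.jpg" image files, matched by filename stem below
+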
txt_list = [f for f in os.listdir(tmp_path) if f.endswith(".txt")] + + for txt in txt_list: + caption = open(os.path.join(tmp_path, txt)).read().strip() + image_path = os.path.join(tmp_path, txt.split(".")[0] + ".jpg") + self.data_list.append({"caption": caption, "image": image_path}) + t2 = time.time() + print(f"Loading done. Total time: {t2 - t1:.2f} seconds") + + self.tokenizer = tokenizer + self.data_args = data_args + self.image_folder = image_folder + + def __len__(self): + return self.n_samples + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + + # print("i", i, "idx_offset", self.idx_offset, "len", len(self.data_list)) + info = self.data_list[i - self.idx_offset] + caption, image_path = info["caption"], info["image"] + + rand_prompt = "\n" + sources = [ + { + "image": image_path, + "conversations": [ + {"from": "human", "value": rand_prompt}, + {"from": "gpt", "value": caption}, + ], + } + ] + + # one example of sources + # [{'id': 'GCC_train_001738742', 'image': 'GCC_train_001738742.jpg', 'conversations': [{'from': 'human', 'value': 'Provide a brief description of the given image.\n'}, {'from': 'gpt', 'value': 'a sketch of an ostrich'}]}] + if "image" in sources[0]: + image = process_image(sources[0]["image"], self.data_args, self.image_folder) + image = torch.unsqueeze(image, dim=0) + # now random pick some context samples for training + if hasattr(self.data_args, "num_shots"): + if self.data_args.num_shots > 0: + raise NotImplementedError + else: + raise NotImplementedError + + data_dict = preprocess([sources[0]["conversations"]], self.tokenizer, has_image=True) + + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0]) + + # image exist in the data + if image is not None: + data_dict["image"] = image + else: + raise NotImplementedError + + return data_dict + + +class LazyCCSWebDataset(Dataset): + """Dataset for supervised fine-tuning. + This class is implemented by Ligeng Zhu.""" + + def __init__( + self, + data_path: str, + image_folder: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + training_args: TrainingArguments, + ): + super().__init__() + t1 = time.time() + + from llava.data.simple_vila_webdataset import VILAWebDataset + + print("[DEBUG] ", osp.abspath(data_path)) + self.dataset = VILAWebDataset(data_path=osp.abspath(data_path)) + + t2 = time.time() + print(f"Loading done. 
Total time: {t2 - t1:.2f} seconds") + + self.tokenizer = tokenizer + self.data_args = data_args + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + # info = self.data_list[i - self.idx_offset] + # caption, image_path = info["caption"], info["image"] + info = self.dataset[i] + if ".jpg" in info: + caption, image_path = info[".txt"], info[".jpg"] + elif ".png" in info: + caption, image_path = info[".txt"], info[".png"] + elif ".webp" in info: + caption, image_path = info[".txt"], info[".webp"] + elif ".bmp" in info: + caption, image_path = info[".txt"], info[".bmp"] + elif ".tiff" in info: + caption, image_path = info[".txt"], info[".tiff"] + else: + print(info.keys()) + print(info) + raise KeyError + + caption = caption.replace("", "") + if isinstance(image_path, io.BytesIO): + image_path = Image.open(image_path).convert("RGB") + + if not isinstance(image_path, PIL.Image.Image): + print(image_path) + print(info.keys()) + print(type(image_path)) + raise NotImplementedError + + rand_prompt = "\n" + sources = [ + { + "image": image_path, + "conversations": [ + {"from": "human", "value": rand_prompt}, + {"from": "gpt", "value": caption}, + ], + } + ] + + # one example of sources + # [{'id': 'GCC_train_001738742', 'image': 'GCC_train_001738742.jpg', 'conversations': [{'from': 'human', 'value': 'Provide a brief description of the given image.\n'}, {'from': 'gpt', 'value': 'a sketch of an ostrich'}]}] + if "image" in sources[0]: + image = process_image(sources[0]["image"], self.data_args, image_folder=None) + image = torch.unsqueeze(image, dim=0) + # now random pick some context samples for training + if hasattr(self.data_args, "num_shots"): + if self.data_args.num_shots > 0: + raise NotImplementedError + else: + raise NotImplementedError + + data_dict = preprocess([sources[0]["conversations"]], self.tokenizer, has_image=True) + + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0]) + + # image exist in the data + if image is not None: + data_dict["image"] = image + else: + raise NotImplementedError + + return data_dict + + +from functools import lru_cache + + +@lru_cache(maxsize=16) +def lru_json_load(fpath): + with open(fpath) as fp: + return json.load(fp) + + +class LazyCoyoWebDataset(Dataset): + """Dataset for supervised fine-tuning. + This class is implemented by Ligeng Zhu.""" + + num_image_tokens = 576 + + def __init__( + self, + data_path: str, + image_folder: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + training_args: TrainingArguments, + # kentang-mit@: balance the total number of tokens for Coyo and MMC4. + n_samples_per_idx=4, + ): + super().__init__() + + from llava.data.simple_vila_webdataset import VILAWebDataset + + print("[DEBUG] ", osp.abspath(data_path)) + self.dataset = VILAWebDataset(data_path=osp.abspath(data_path), meta_path=data_args.meta_path) + + if data_args.start_idx >= 0 and data_args.end_idx >= 0: + # Ligeng: support slicing for ablate different subsets. 
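+ # start_idx / end_idx are fractions of the dataset length, e.g. 0.0-0.5 keeps the first half of the samples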
+ total = len(self.dataset) + start_idx = int(total * data_args.start_idx) + end_idx = int(total * data_args.end_idx) + print(f"loading subset from {start_idx} to {end_idx}, total {total}") + self.dataset = torch.utils.data.Subset(self.dataset, range(start_idx, end_idx)) + + # For caption choice, + # if None: use original caption + # if a folder path: use specified caption to override original one (choice1) + # if a folder path: use specified caption and concat with original one (choice2) + self.caption_choice = None + self.caption_choice_2 = None + self.data_path = data_path + + if data_args.caption_choice is not None: + self.caption_choice = data_args.caption_choice + print("[recap] Override coyo caption using ", self.caption_choice) + + if data_args.caption_choice_2 is not None: + self.caption_choice_2 = data_args.caption_choice_2 + print("[recapv2] Override coyo caption using ", self.caption_choice_2) + + print("total samples", len(self.dataset)) + PROCESS_GROUP_MANAGER = get_pg_manager() + if PROCESS_GROUP_MANAGER is not None: + import torch.distributed as dist + + sequence_parallel_size = training_args.seq_parallel_size + sequence_parallel_rank = PROCESS_GROUP_MANAGER.sp_rank + else: + sequence_parallel_size = 1 + print("sequence_parallel_size", sequence_parallel_size) + rank = ( + training_args.process_index // sequence_parallel_size if "RANK" in os.environ else 2 + ) # int(os.environ["RANK"]) + world_size = ( + training_args.world_size // sequence_parallel_size if "WORLD_SIZE" in os.environ else 32 + ) # int(os.environ["WORLD_SIZE"]) + print( + "rank", + rank, + "world_size", + world_size, + ) + + self.n_samples_per_idx = n_samples_per_idx + # self.n_samples = len(self.dataset) // n_samples_per_idx + self.tokenizer = tokenizer + self.data_args = data_args + + def __len__(self): + return len(self.dataset) // self.n_samples_per_idx + + @property + def modality_lengths(self): + # Estimate the number of tokens after tokenization, used for length-grouped sampling + length_list = [] + for samples in self.data_list: + cur_len = sum([len(conv["text" if "text" in conv else "caption"].split()) for conv in samples]) + # The unit of cur_len is "words". We assume 1 word = 2 tokens. 
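+ # e.g. a group of 4 captions with 20 words each gives 80 + 4 * 576 // 2 = 1232 "words", i.e. roughly 2464 tokens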
+ cur_len = cur_len + len(samples) * self.num_image_tokens // 2 + length_list.append(cur_len) + return length_list + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + CONCAT_SAMPLES = False + # info_list = self.dataset[i - self.idx_offset] + + begin_idx, end_idx = ( + i * self.n_samples_per_idx, + (i + 1) * self.n_samples_per_idx, + ) + end_idx = min(end_idx, len(self.dataset)) + + text_list = [] + image_list = [] + + for idx in range(begin_idx, end_idx): + info = self.dataset[idx] + if ".jpg" in info: + caption, image_path = info[".txt"], info[".jpg"] + elif ".png" in info: + caption, image_path = info[".txt"], info[".png"] + elif ".webp" in info: + caption, image_path = info[".txt"], info[".webp"] + elif ".bmp" in info: + caption, image_path = info[".txt"], info[".bmp"] + elif ".tiff" in info: + caption, image_path = info[".txt"], info[".tiff"] + else: + print(info.keys()) + print(info) + raise KeyError + + if self.caption_choice is not None: + # load new captions + shard = info["__shard__"] + url = info[".json"]["url"] + tar_name = osp.relpath(osp.realpath(shard), osp.realpath(self.data_path)) + # tar_name = osp.dirname(shard) + shard_json_path = osp.join(self.caption_choice, tar_name + ".json") + try: + shard_json = lru_json_load(shard_json_path) + try: + caption = shard_json[url]["output"] + except KeyError: + print(f"{url} not in caption. fallback to original caption temporarially") + except: + print(f"shard_json_path {shard_json_path} not found. fallback to original caption temporarially") + caption = caption.replace("", "") + text_list.append(DEFAULT_IMAGE_TOKEN + caption + self.tokenizer.eos_token) + + if isinstance(image_path, io.BytesIO): + image_path = Image.open(image_path).convert("RGB") + + if not isinstance(image_path, PIL.Image.Image): + print(image_path) + print(info.keys()) + print(type(image_path)) + raise NotImplementedError + + image_list.append(image_path) + + # image_list = torch.stack([process_image(image, self.data_args, image_folder=None) for image in image_list]) + # NOTE(fix by ligeng) + # now image_list should return a list of image tensor where each has a dimension of (1, c, h, w) + image_list = [process_image(image, self.data_args, image_folder=None).unsqueeze(0) for image in image_list] + + if CONCAT_SAMPLES: + # into capcap... 
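+ # when CONCAT_SAMPLES is enabled, all captions in the group are joined into one sequence and tokenized jointly; the else-branch below tokenizes each caption separately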
+ text_list = "".join(text_list) + + input_ids = self.tokenizer( + text_list, + return_tensors="pt", + padding="longest", + max_length=self.tokenizer.model_max_length, + truncation=True, + ).input_ids # 4, seq_len + + input_ids = input_ids[0] + else: + input_ids = [ + tokenizer_image_token( + prompt, + self.tokenizer, + return_tensors="pt", + ) + for prompt in text_list + ] + input_ids = [ + ( + torch.concat([torch.tensor([self.tokenizer.bos_token_id]), input_ids_i]) + if input_ids_i[0] != self.tokenizer.bos_token_id + else input_ids_i + ) + for input_ids_i in input_ids + ] + + targets = copy.deepcopy(input_ids) + for i in range(len(targets)): + targets[i][targets[i] == self.tokenizer.pad_token_id] = IGNORE_INDEX + + return dict(input_ids=input_ids, labels=targets, image=image_list) + + +class LazyVideoWebDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + data_path: str, + image_folder: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + training_args: TrainingArguments, + # cache_path: str, + # n_samples_per_idx=4, + ): + super().__init__() + + # from llava.data.simple_video_dataset import SimpleVideoDataset + + from llava.data.simple_vila_webdataset import VILAWebDataset + + print("[DEBUG] ", osp.abspath(data_path)) + self.dataset = VILAWebDataset( + data_path=osp.abspath(data_path), + meta_path=f"{osp.abspath(data_path)}/wids-meta.json", + # cache_dir=cache_path, + ) + + # None: use original caption + # Folder path: use original caption + self.caption_choice = None + self.data_path = data_path + + if data_args.caption_choice is not None: + self.caption_choice = data_args.caption_choice + print("[recap] Override LazyVideo caption using ", self.caption_choice) + + print("total samples", len(self.dataset)) + # InternVid: TODO + PROCESS_GROUP_MANAGER = get_pg_manager() + if PROCESS_GROUP_MANAGER is not None: + import torch.distributed as dist + + sequence_parallel_size = training_args.seq_parallel_size + sequence_parallel_rank = PROCESS_GROUP_MANAGER.sp_rank + else: + sequence_parallel_size = 1 + print("sequence_parallel_size", sequence_parallel_size) + rank = ( + training_args.process_index // sequence_parallel_size if "RANK" in os.environ else 2 + ) # int(os.environ["RANK"]) + world_size = ( + training_args.world_size // sequence_parallel_size if "WORLD_SIZE" in os.environ else 32 + ) # int(os.environ["WORLD_SIZE"]) + print( + "rank", + rank, + "world_size", + world_size, + ) + self.rank = rank + # rank = int(os.environ["RANK"]) if "RANK" in os.environ else 2 + # world_size = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 32 + + self.tokenizer = tokenizer + self.data_args = data_args + + self.missing_uids = set() + + def __len__(self): + return len(self.dataset) + + @property + def modality_lengths(self): + # Estimate the number of tokens after tokenization, used for length-grouped sampling + length_list = [] + for samples in self.data_list: + cur_len = sum([len(conv["text" if "text" in conv else "caption"].split()) for conv in samples]) + # The unit of cur_len is "words". We assume 1 word = 2 tokens. 
+ cur_len = cur_len + len(samples) * self.num_image_tokens // 2
+ length_list.append(cur_len)
+ return length_list
+
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+ ADD_TEXT_PROMPT = False
+ num_video_frames = self.data_args.num_video_frames if hasattr(self.data_args, "num_video_frames") else 8
+ loader_fps = self.data_args.fps if hasattr(self.data_args, "fps") else 0.0
+
+ info = self.dataset[i]
+
+ caption = ""
+ # print(info)
+ if ".mp4" in info:
+ caption, video_path = info[".txt"], info[".mp4"]
+ else:
+ video_path = None
+ caption = "Empty video."
+
+ images, frames_loaded, _ = LazySupervisedDataset._load_video(
+ video_path, num_video_frames, loader_fps, self.data_args
+ )
+
+ if frames_loaded == 0:
+ caption = "Empty video."
+
+ if self.caption_choice is not None:
+ shard = info["__shard__"]
+ uuid = osp.join(info["__shard__"], info["__key__"])
+ url = info["__key__"]
+ tar_name = osp.basename(info["__shard__"])
+
+ try:
+ shard_json_path = osp.join(self.caption_choice, tar_name.replace(".tar", ".json"))
+ shard_json = lru_json_load(shard_json_path)
+ caption = shard_json[url]["summary"]["output"]
+ except (KeyError, FileNotFoundError, json.decoder.JSONDecodeError):
+ if uuid not in self.missing_uids:
+ print("override caption not found for ", uuid)
+ self.missing_uids.add(uuid)
+
+ # print(f"[DEBUG {uuid}]", caption)
+
+ frames_loaded_successfully = len(images)
+ if caption is None:
+ caption = ""
+ prompt = "<image>\n" * frames_loaded_successfully + caption
+ image_tensor = torch.stack([process_image(image, self.data_args, None) for image in images])
+
+ input_ids = tokenizer_image_token(
+ prompt,
+ self.tokenizer,
+ return_tensors="pt",
+ )
+ targets = copy.deepcopy(input_ids)
+ data_dict = dict(input_ids=input_ids, labels=targets, image=image_tensor)
+
+ return data_dict
+
+
+class DataCollatorForSupervisedDatasetSeqParallel:
+ """Collate examples for supervised fine-tuning (audio version).
+ Adapted from the LLaVA sequence-packing collator to support audio inputs instead of images/videos.
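+ Each instance carries "input_ids" and "labels", plus a "sound" tensor of shape (n_audios, 128, 3000) when audio is present;
+ the number of audio clips per instance must match the number of audio tokens in its "input_ids".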
+ """ + + def __init__( + self, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments, + training_args: TrainingArguments, + sp_degree: int, + sp_rank: int, + ring_degree: int, + ring_type: str, + ): + self.tokenizer = tokenizer + self.data_args = data_args + self.training_args = training_args + self.sp_degree = sp_degree + self.sp_rank = sp_rank + self.ring_degree = ring_degree + self.ring_type = ring_type + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels, audios = [], [], [] + audio_token_id = self.tokenizer.media_token_ids["sound"] + + # --- Step 1: Collect all inputs --- + for instance in instances: + if not isinstance(instance["input_ids"], list): + input_ids.append(instance["input_ids"]) + else: + input_ids += instance["input_ids"] + + if not isinstance(instance["labels"], list): + labels.append(instance["labels"]) + else: + labels += instance["labels"] + + # Expect `instance["audio"]` tensor of shape (n_audios, 128, 3000) + if "sound" in instance: + cur_audio = instance["sound"] + assert len(cur_audio.shape) == 3, f"Expected (n_audios, 128, 3000), got {cur_audio.shape}" + if cur_audio.shape[0] == 0: + warnings.warn("Loaded one sample without audio.") + if not isinstance(instance["input_ids"], list): + audios.append(cur_audio) + else: + audios.extend(cur_audio.chunk(cur_audio.size(0), 0)) + else: + warnings.warn("Loaded one sample without audio.") + audios.append([]) + + # --- Step 2: Sanity checks --- + max_num_audios = max([len(_a) for _a in audios]) + for _a, _ids in zip(audios, input_ids): + assert ( + len(_a) == (_ids == audio_token_id).sum().item() + ), f"Mismatch between number of audio tensors and