huggingface · Cyrilvallez · Mar 31, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/docs/source/en/model_doc/falcon_mamba.md b/docs/source/en/model_doc/falcon_mamba.md
@@ -111,13 +111,6 @@ outputs = model.generate(**inputs, max_new_tokens=100)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```
 
-## FalconMambaCache
-
-[[autodoc]] FalconMambaCache
-    - update_conv_state
-    - update_ssm_state
-    - reset
-
 ## FalconMambaConfig
 
 [[autodoc]] FalconMambaConfig

diff --git a/docs/source/en/model_doc/mamba.md b/docs/source/en/model_doc/mamba.md
@@ -110,13 +110,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
   trainer.train()
    ```
 
-## MambaCache
-
-[[autodoc]] MambaCache
-    - update_conv_state
-    - update_ssm_state
-    - reset
-
 ## MambaConfig
 
 [[autodoc]] MambaConfig

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -634,7 +634,6 @@
     from .modeling_utils import AttentionInterface as AttentionInterface
     from .modeling_utils import PreTrainedModel as PreTrainedModel
     from .models import *
-    from .models.mamba.modeling_mamba import MambaCache as MambaCache
     from .models.timm_wrapper import TimmWrapperImageProcessor as TimmWrapperImageProcessor
 
     # Optimization

diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
@@ -68,6 +68,8 @@
     "attention",
     "sparse",
     "dense",
+    "hybrid",  # for layers that have both mamba and attention in zamba and zamba2
+    "moe",  # for nemotron_h, which uses either attention, mamba or moe
 )
 
 

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
@@ -1775,19 +1775,19 @@ def _prepare_static_cache(
     def _supports_default_dynamic_cache(cls: type["GenerativePreTrainedModel"]) -> bool:
         """
         Return `True` if current model can use a `DynamicCache` instance when initializing the `past_key_values`.
-        This adds exception for some models like `Mamba` models which use their own caches.
         """
         # NOTE: remove xlnet/reformer when the models are deprecated, non-standard model architecture/cache name
-        return not cls._is_stateful and all(
-            special_model_name not in cls.__name__.lower()
-            or "minimaxm2" in cls.__name__.lower()  # name clash between minimax and minimax m2
-            for special_model_name in [
-                "reformer",
-                "minimax",
-                "xlnet",
-                "lfm2",
-                "lfm2_vl",
-            ]
+        unsupported_model_names = (
+            "reformer",
+            "minimax",
+            "xlnet",
+            "olmohybrid",  # olmo_hybrid cannot use linear attention cache for now as it uses split k,q,v conv states
+            "rwkv",
+            "xlstm",
+        )
+        # name clash between minimax and minimax m2, so we add this "or"
+        return "minimaxm2" in cls.__name__.lower() or all(
+            unsupported_name not in cls.__name__.lower() for unsupported_name in unsupported_model_names
         )
 
     def _prepare_cache_for_generation(
@@ -1849,7 +1849,12 @@ def _prepare_cache_for_generation(
             generation_config.cache_implementation = "dynamic_full"
 
         dynamic_cache_kwargs = {}
-        if generation_config.cache_implementation != "dynamic_full":
+        # linear attention models always need to pass the config, otherwise it will use an Attention cache for the LinearAttention layers
+        is_linear_attention = any(
+            x in ("mamba", "conv", "linear_attention")
+            for x in getattr(self.config.get_text_config(decoder=True), "layer_types", [])
+        )
+        if generation_config.cache_implementation != "dynamic_full" or is_linear_attention:
             dynamic_cache_kwargs["config"] = self.config.get_text_config(decoder=True)
 
         if generation_config.cache_implementation == "offloaded":
@@ -1862,7 +1867,7 @@ def _prepare_cache_for_generation(
                     f"and will be removed in v5.13. Please only use one of {STATIC_CACHE_IMPLEMENTATIONS}, "
                     "and the layer structure will be inferred automatically."
                 )
-            model_kwargs["past_key_values"] = self._prepare_static_cache(
+            model_kwargs[cache_name] = self._prepare_static_cache(
                 cache_implementation=generation_config.cache_implementation,
                 batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
                 max_cache_len=max_cache_length,
@@ -1878,19 +1883,19 @@ def _prepare_cache_for_generation(
             cache_config = generation_config.cache_config if generation_config.cache_config is not None else {}
             cache_config.setdefault("config", self.config.get_text_config(decoder=True))
             backend = cache_config.pop("backend", "quanto")
-            model_kwargs["past_key_values"] = QuantizedCache(backend=backend, **cache_config)
+            model_kwargs[cache_name] = QuantizedCache(backend=backend, **cache_config)
         # i.e. `cache_implementation` in [None, "dynamic", "offloaded", "dynamic_full"]
         # TODO: prepare linear cache from a single API, instead of creating in modeling code
         else:
-            model_kwargs["past_key_values"] = DynamicCache(**dynamic_cache_kwargs)
+            model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs)
 
         if (
             self.config.is_encoder_decoder
-            and "past_key_values" in model_kwargs
-            and not isinstance(model_kwargs["past_key_values"], EncoderDecoderCache)
+            and cache_name in model_kwargs
+            and not isinstance(model_kwargs[cache_name], EncoderDecoderCache)
         ):
-            model_kwargs["past_key_values"] = EncoderDecoderCache(
-                model_kwargs["past_key_values"],  # self-attention cache
+            model_kwargs[cache_name] = EncoderDecoderCache(
+                model_kwargs[cache_name],  # self-attention cache
                 DynamicCache(**dynamic_cache_kwargs),  # cross-attention cache
             )
 
@@ -1990,13 +1995,15 @@ def _valid_auto_compile_criteria(
         if generation_config.disable_compile:
             return False
 
+        cache = model_kwargs.get("past_key_values", model_kwargs.get("cache_params"))
+
         # Base logic
         valid_hardware = self.device.type in ["cuda", "xpu"] or bool(
             generation_config.compile_config is not None and generation_config.compile_config._compile_all_devices
         )
-        using_compilable_cache = (
-            isinstance(model_kwargs.get("past_key_values"), Cache) and model_kwargs["past_key_values"].is_compileable
-        )
+        # Note: for some models that only use linear attention (e.g. Mamba), even a DynamicCache is compileable since all
+        # layers are, but we don't want to ALWAYS compile when calling `generate`, so we check the type
+        using_compilable_cache = cache is not None and cache.is_compileable and type(cache) is not DynamicCache
         can_compile = valid_hardware and using_compilable_cache
 
         # Exception 1: Some quantization methods do not support compilation
@@ -3467,10 +3474,9 @@ def _assisted_decoding(
         # The cache must be dynamic for assisted generation, and the check must happen AFTER preparing cache
         if not model_kwargs["use_cache"]:
             raise ValueError("assisted generate requires `use_cache=True`")
-        if generation_config.cache_implementation in ["static", "hybrid", "sliding_window"] or (
-            "past_key_values" in model_kwargs
-            and hasattr(model_kwargs["past_key_values"], "layers")
-            and any(getattr(l, "is_compileable", False) for l in model_kwargs["past_key_values"].layers)
+        if (
+            generation_config.cache_implementation in ["static", "hybrid", "sliding_window"]
+            or type(model_kwargs.get("past_key_values")) is StaticCache
         ):
             raise ValueError("assisted generate is not supported with Static cache classes`")
         # Get the candidate generator, given the parameterization

diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py
@@ -43,6 +43,7 @@ class BambaConfig(PreTrainedConfig):
     """
 
     model_type = "bamba"
+    attribute_map = {"layer_types": "layers_block_type"}
     keys_to_ignore_at_inference = ["past_key_values"]
 
     vocab_size: int = 128000