11 changes: 7 additions & 4 deletions src/transformers/models/qwen3_5/configuration_qwen3_5.py
@@ -105,10 +105,13 @@ def __post_init__(self, **kwargs):
         kwargs.setdefault("partial_rotary_factor", 0.25)  # assign default for BC
         if self.layer_types is None:
             interval_pattern = kwargs.pop("full_attention_interval", 4)
-            self.layer_types = [
-                "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
+            if interval_pattern <= 0:
+                self.layer_types = ["linear_attention"] * self.num_hidden_layers
+            else:
+                self.layer_types = [
+                    "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
+                    for i in range(self.num_hidden_layers)
+                ]
 
         super().__post_init__(**kwargs)
 
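For context (not part of the diff): a minimal runnable sketch of the guarded layer_types construction, with illustrative num_hidden_layers and interval values. The guard matters because an interval of 0 would make `(i + 1) % interval_pattern` raise ZeroDivisionError, and the patch treats any non-positive interval as "linear attention everywhere".

# Standalone sketch of the layer_types logic above; values are illustrative.
def build_layer_types(num_hidden_layers: int, interval_pattern: int) -> list[str]:
    if interval_pattern <= 0:
        # Non-positive interval: no full-attention layers at all.
        # This also avoids ZeroDivisionError from `% 0` below.
        return ["linear_attention"] * num_hidden_layers
    return [
        "linear_attention" if (i + 1) % interval_pattern else "full_attention"
        for i in range(num_hidden_layers)
    ]

print(build_layer_types(8, 4))
# Every 4th layer is full attention:
# ['linear_attention', 'linear_attention', 'linear_attention', 'full_attention',
#  'linear_attention', 'linear_attention', 'linear_attention', 'full_attention']
print(build_layer_types(8, 0))
# All linear: ['linear_attention', ..., 'linear_attention']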
8 changes: 6 additions & 2 deletions src/transformers/models/qwen3_5/modeling_qwen3_5.py
@@ -1243,9 +1243,12 @@ def forward(
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
 
+        contains_softmax_attention = self.config.layer_types.count("full_attention") > 0
+
         # the hard coded `4` is for text, temporal, height and width.
         if position_ids is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            past_seen_tokens = past_key_values.get_seq_length() if \
+                (past_key_values is not None and contains_softmax_attention) else 0
             position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
             position_ids = position_ids.view(1, 1, -1).expand(4, inputs_embeds.shape[0], -1)
         elif position_ids.ndim == 2:
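For illustration, a hedged standalone sketch of the gated position_ids path above; the batch and sequence sizes are made up, and the flag is set by hand rather than read from a real config. The gate presumably exists because get_seq_length() only reflects tokens tracked by softmax-attention KV caches, while a purely linear-attention stack keeps recurrent state instead, so the offset falls back to 0.

import torch

# Sketch of the position_ids construction above; sizes are illustrative.
batch_size, seq_len = 2, 5
inputs_embeds = torch.randn(batch_size, seq_len, 16)

contains_softmax_attention = True  # i.e. "full_attention" in config.layer_types
past_key_values = None             # stand-in for a DynamicCache

past_seen_tokens = (
    past_key_values.get_seq_length()
    if (past_key_values is not None and contains_softmax_attention)
    else 0
)
position_ids = torch.arange(seq_len, device=inputs_embeds.device) + past_seen_tokens
# The hard-coded 4 covers the text, temporal, height and width streams.
position_ids = position_ids.view(1, 1, -1).expand(4, batch_size, -1)
print(position_ids.shape)  # torch.Size([4, 2, 5])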
@@ -1263,7 +1266,8 @@
             attention_mask=attention_mask,
             past_key_values=past_key_values,
             position_ids=text_position_ids,
-        )
+        ) if contains_softmax_attention else None
+
         linear_attn_mask = self._update_linear_attn_mask(attention_mask, past_key_values)
 
         hidden_states = inputs_embeds
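The same flag gates mask construction above: when no layer uses softmax attention, the conditional expression yields None instead of building a causal mask. A hedged sketch of the pattern, where build_causal_mask is a hypothetical stand-in for the helper whose name is truncated by the diff context:

# Hedged sketch of the gated mask pattern; build_causal_mask is a hypothetical
# stand-in for the helper called in the hunk above.
def build_causal_mask(attention_mask, past_key_values, position_ids):
    ...  # would build the mask consumed by softmax-attention layers
    return "causal_mask"

attention_mask, past_key_values, text_position_ids = None, None, None
contains_softmax_attention = False  # e.g. full_attention_interval <= 0

causal_mask = (
    build_causal_mask(
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        position_ids=text_position_ids,
    )
    if contains_softmax_attention
    else None  # linear-attention-only stacks never consume a causal mask
)
print(causal_mask)  # None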