From ca12486c8399d1b7b9514dd92c705e5c8b32e8d2 Mon Sep 17 00:00:00 2001
From: Patrick Haller
Date: Tue, 31 Mar 2026 15:17:29 +0000
Subject: [PATCH 1/2] Allow all layers in the Qwen3.5 architecture to be Gated DeltaNet

With `full_attention_interval <= 0`, every layer is now assigned
"linear_attention" (Gated DeltaNet), and the forward pass skips the
softmax-attention mask and cache-offset bookkeeping when no layer
consumes them.

---
 .../models/qwen3_5/configuration_qwen3_5.py          | 11 +++++++----
 src/transformers/models/qwen3_5/modeling_qwen3_5.py |  8 ++++++--
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/qwen3_5/configuration_qwen3_5.py b/src/transformers/models/qwen3_5/configuration_qwen3_5.py
index b200b920b18e..1dac460af5fe 100644
--- a/src/transformers/models/qwen3_5/configuration_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/configuration_qwen3_5.py
@@ -105,10 +105,13 @@ def __post_init__(self, **kwargs):
         kwargs.setdefault("partial_rotary_factor", 0.25)  # assign default for BC
         if self.layer_types is None:
             interval_pattern = kwargs.pop("full_attention_interval", 4)
-            self.layer_types = [
-                "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
+            if interval_pattern <= 0:
+                self.layer_types = ["linear_attention"] * self.num_hidden_layers
+            else:
+                self.layer_types = [
+                    "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
+                    for i in range(self.num_hidden_layers)
+                ]

         super().__post_init__(**kwargs)

diff --git a/src/transformers/models/qwen3_5/modeling_qwen3_5.py b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
index eba3eec02fdd..c8b490f3c061 100644
--- a/src/transformers/models/qwen3_5/modeling_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
@@ -1243,9 +1243,12 @@ def forward(
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)

+        contains_softmax_attention = self.config.layer_types.count("softmax_attention") > 0
+
         # the hard coded `4` is for text, temporal, height and width.
         if position_ids is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            past_seen_tokens = (past_key_values.get_seq_length()
+                if past_key_values is not None and contains_softmax_attention else 0)
             position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
             position_ids = position_ids.view(1, 1, -1).expand(4, inputs_embeds.shape[0], -1)
         elif position_ids.ndim == 2:
@@ -1263,7 +1266,8 @@ def forward(
             attention_mask=attention_mask,
             past_key_values=past_key_values,
             position_ids=text_position_ids,
-        )
+        ) if contains_softmax_attention else None
+
         linear_attn_mask = self._update_linear_attn_mask(attention_mask, past_key_values)

         hidden_states = inputs_embeds

From a947401c7c58a291bd8699ac30a27154ff4cffef Mon Sep 17 00:00:00 2001
From: Patrick Haller
Date: Tue, 31 Mar 2026 17:22:38 +0200
Subject: [PATCH 2/2] Change attention type check from softmax to full attention

`layer_types` only ever contains "linear_attention" and "full_attention",
so counting "softmax_attention" always returned 0 and the gate never
enabled the softmax-attention path.

---
 src/transformers/models/qwen3_5/modeling_qwen3_5.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen3_5/modeling_qwen3_5.py b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
index c8b490f3c061..d747c7acc456 100644
--- a/src/transformers/models/qwen3_5/modeling_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
@@ -1243,7 +1243,7 @@ def forward(
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)

-        contains_softmax_attention = self.config.layer_types.count("softmax_attention") > 0
+        contains_softmax_attention = self.config.layer_types.count("full_attention") > 0

         # the hard coded `4` is for text, temporal, height and width.
         if position_ids is None:
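
Reviewer note: the sketch below replays the `__post_init__` branch from
PATCH 1/2 as a standalone function, to make the resulting layer pattern
easy to eyeball. `build_layer_types` is an illustrative name, not a
transformers API; the logic mirrors the diff above.

```python
def build_layer_types(num_hidden_layers: int, full_attention_interval: int = 4) -> list[str]:
    if full_attention_interval <= 0:
        # New behavior: a non-positive interval makes every layer Gated DeltaNet.
        return ["linear_attention"] * num_hidden_layers
    # Previous behavior: every `full_attention_interval`-th layer is full attention.
    return [
        "linear_attention" if (i + 1) % full_attention_interval else "full_attention"
        for i in range(num_hidden_layers)
    ]

print(build_layer_types(8))
# ['linear_attention', 'linear_attention', 'linear_attention', 'full_attention',
#  'linear_attention', 'linear_attention', 'linear_attention', 'full_attention']
print(build_layer_types(4, full_attention_interval=0))
# ['linear_attention', 'linear_attention', 'linear_attention', 'linear_attention']
```

A non-positive `full_attention_interval` is the new escape hatch for an
all-DeltaNet stack; any positive interval keeps the prior every-Nth-layer
behavior unchanged.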
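
And a minimal sketch of the forward-pass gate this series adds (and
PATCH 2/2 fixes): mask construction is skipped entirely when no layer is
"full_attention". `make_causal_mask` and `setup_masks` are deliberately
simplified stand-ins for the library's mask helpers, not the real API.

```python
import torch

def make_causal_mask(seq_len: int) -> torch.Tensor:
    # Simplified stand-in for the real mask helper: lower-triangular boolean mask.
    return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

def setup_masks(layer_types: list[str], seq_len: int):
    # Mirrors the patched gate: skip softmax-attention bookkeeping entirely
    # when the stack is pure Gated DeltaNet.
    contains_full_attention = layer_types.count("full_attention") > 0
    return make_causal_mask(seq_len) if contains_full_attention else None

print(setup_masks(["linear_attention"] * 4, seq_len=8))               # None
print(setup_masks(["linear_attention", "full_attention"], 8).shape)  # torch.Size([8, 8])
```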