From 08f02cf7a8cfb2d62a08d8c382c5160d5462fae6 Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi Date: Sun, 15 Feb 2026 12:56:24 +0530 Subject: [PATCH 01/13] Refactor ResNet output handling to decorators --- .../models/resnet/modeling_resnet.py | 46 ++++++------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 92a264014c6c..8a8b8531f6ef 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -28,7 +28,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_resnet import ResNetConfig @@ -248,6 +249,7 @@ class ResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["ResNetConvLayer", "ResNetShortCut"] + _can_record_outputs = {"hidden_states": ResNetStage} @torch.no_grad() def _init_weights(self, module): @@ -281,36 +283,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -333,13 +323,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -347,11 +337,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.resnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -360,10 +351,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -385,12 +372,12 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: Tensor, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BackboneOutput: r""" @@ -419,7 +406,6 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -435,12 +421,6 @@ def forward( if stage in self.out_features: feature_maps += (hidden_states[idx],) - if not return_dict: - output = (feature_maps,) - if output_hidden_states: - output += (outputs.hidden_states,) - return output - return BackboneOutput( feature_maps=feature_maps, hidden_states=outputs.hidden_states if output_hidden_states else None, From 1d5aa9c5e189b5496c7bd5526ef3872b9f5f9df2 Mon Sep 17 00:00:00 2001 From: gabrielfruet Date: Sun, 15 Feb 2026 10:43:25 -0300 Subject: [PATCH 02/13] refactor: tracing --- .../mobilenet_v2/modeling_mobilenet_v2.py | 53 ++++--------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index 7648658c3050..fd465e9c2de2 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -25,6 +25,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging +from ...utils.output_manager import can_return_tuple, capture_outputs from .configuration_mobilenet_v2 import MobileNetV2Config @@ -254,6 +255,7 @@ class MobileNetV2PreTrainedModel(PreTrainedModel): input_modalities = ("image",) supports_gradient_checkpointing = False _no_split_modules = [] + _can_record_outputs = {"hidden_states": MobileNetV2InvertedResidual} @auto_docstring @@ -323,31 +325,20 @@ def __init__(self, config: MobileNetV2Config, add_pooling_layer: bool = True): self.post_init() @auto_docstring + @capture_outputs def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: raise ValueError("You have to 
specify pixel_values") hidden_states = self.conv_stem(pixel_values) - all_hidden_states = () if output_hidden_states else None - for i, layer_module in enumerate(self.layer): hidden_states = layer_module(hidden_states) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - last_hidden_state = self.conv_1x1(hidden_states) if self.pooler is not None: @@ -355,13 +346,9 @@ def forward( else: pooled_output = None - if not return_dict: - return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None) - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=all_hidden_states, ) @@ -388,12 +375,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, labels: torch.Tensor | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -402,11 +388,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilenet_v2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.mobilenet_v2(pixel_values, **kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(self.dropout(pooled_output)) @@ -414,10 +398,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, @@ -517,12 +497,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | SemanticSegmenterOutput: r""" @@ -553,21 +532,16 @@ def forward( >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") outputs = self.mobilenet_v2( pixel_values, output_hidden_states=True, # we need the intermediate hidden states - return_dict=return_dict, + **kwargs, ) - encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + encoder_hidden_states = outputs.hidden_states logits = self.segmentation_head(encoder_hidden_states[-1]) @@ -580,17 +554,10 @@ def forward( loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) loss = loss_fct(upsampled_logits, labels) - if not return_dict: - if output_hidden_states: - output = (logits,) + outputs[1:] - else: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - 
return SemanticSegmenterOutput( loss=loss, logits=logits, - hidden_states=outputs.hidden_states if output_hidden_states else None, + hidden_states=outputs.hidden_states, attentions=None, ) From a1c15f9994ac0099a0c230ea7f526397ea94637e Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:02:29 +0530 Subject: [PATCH 03/13] add hooks to deberta_v2 --- .../models/deberta_v2/modeling_deberta_v2.py | 166 ++++++------------ 1 file changed, 53 insertions(+), 113 deletions(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 82ac99b93d7f..f1ba673c39e1 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -31,7 +31,9 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_deberta_v2 import DebertaV2Config @@ -272,8 +274,7 @@ def forward( ) new_context_layer_shape = context_layer.size()[:-2] + (-1,) context_layer = context_layer.view(new_context_layer_shape) - if not output_attentions: - return (context_layer, None) + return (context_layer, attention_probs) def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): @@ -431,8 +432,8 @@ def forward( relative_pos=None, rel_embeddings=None, output_attentions: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - attention_output, att_matrix = self.attention( + ) -> torch.Tensor: + attention_output, _ = self.attention( hidden_states, attention_mask, output_attentions=output_attentions, @@ -443,10 +444,7 @@ def forward( intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - if output_attentions: - return (layer_output, att_matrix) - else: - return (layer_output, None) + return layer_output class ConvLayer(nn.Module): @@ -634,11 +632,9 @@ def forward( self, hidden_states, attention_mask, - output_hidden_states=True, - output_attentions=False, query_states=None, relative_pos=None, - return_dict=True, + **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: input_mask = attention_mask @@ -647,30 +643,25 @@ def forward( attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - all_hidden_states: tuple[torch.Tensor] | None = (hidden_states,) if output_hidden_states else None - all_attentions = () if output_attentions else None + # Extract output_attentions from kwargs + output_attentions = kwargs.get("output_attentions", False) next_kv = hidden_states rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): - output_states, attn_weights = layer_module( + output_states = layer_module( next_kv, attention_mask, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, - output_attentions=output_attentions, + output_attentions=output_attentions, # Pass it through! 
) - if output_attentions: - all_attentions = all_attentions + (attn_weights,) - if i == 0 and self.conv is not None: output_states = self.conv(hidden_states, output_states, input_mask) - if output_hidden_states: - all_hidden_states = all_hidden_states + (output_states,) - if query_states is not None: query_states = output_states if isinstance(hidden_states, Sequence): @@ -678,11 +669,7 @@ def forward( else: next_kv = output_states - if not return_dict: - return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions - ) + return BaseModelOutput(last_hidden_state=output_states) @auto_docstring @@ -691,6 +678,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel): base_model_prefix = "deberta" _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": DebertaV2Layer, + "attentions": DisentangledSelfAttention, + } @torch.no_grad() def _init_weights(self, module): @@ -721,6 +712,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings + @capture_outputs @auto_docstring def forward( self, @@ -729,17 +721,8 @@ def forward( token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutput: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -768,38 +751,41 @@ def forward( encoder_outputs = self.encoder( embedding_output, attention_mask, - output_hidden_states=True, - output_attentions=output_attentions, - return_dict=return_dict, + query_states=None, + relative_pos=None, + **kwargs, ) - encoded_layers = encoder_outputs[1] + + sequence_output = encoder_outputs.last_hidden_state if self.z_steps > 1: - hidden_states = encoded_layers[-2] + # Get the second-to-last hidden state if available + if encoder_outputs.hidden_states and len(encoder_outputs.hidden_states) >= 2: + hidden_states = encoder_outputs.hidden_states[-2] + else: + hidden_states = sequence_output + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] - query_states = encoded_layers[-1] + query_states = sequence_output rel_embeddings = self.encoder.get_rel_embedding() - attention_mask = self.encoder.get_attention_mask(attention_mask) + attention_mask_encoded = self.encoder.get_attention_mask(attention_mask) rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: query_states = layer( hidden_states, - attention_mask, - output_attentions=False, + attention_mask_encoded, query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, + output_attentions=kwargs.get("output_attentions", False), # Pass it here too! 
) - encoded_layers.append(query_states) - - sequence_output = encoded_layers[-1] - if not return_dict: - return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + sequence_output = query_states return BaseModelOutput( last_hidden_state=sequence_output, - hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @@ -924,6 +910,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_predictions.lm_head.dense = new_embeddings self.lm_predictions.lm_head.bias = new_embeddings.bias + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM.forward with Deberta->DebertaV2 def forward( @@ -934,10 +921,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -946,17 +930,13 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -970,10 +950,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1036,6 +1012,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification.forward with Deberta->DebertaV2 def forward( @@ -1046,10 +1023,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1057,7 +1031,6 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.deberta( input_ids, @@ -1065,9 +1038,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1110,9 +1081,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions @@ -1133,6 +1101,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1142,16 +1111,12 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.deberta( input_ids, @@ -1159,9 +1124,7 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1174,10 +1137,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) @@ -1195,6 +1154,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward with Deberta->DebertaV2 def forward( @@ -1206,22 +1166,15 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1248,10 +1201,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, 
end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1286,6 +1235,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring def forward( self, @@ -1295,10 +1245,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1306,7 +1253,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1325,9 +1271,7 @@ def forward( token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1341,10 +1285,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, From 1361b6d2e39227af01eed56942cb111498d9f9f3 Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:19:28 +0530 Subject: [PATCH 04/13] fix ruff --- src/transformers/models/deberta_v2/modeling_deberta_v2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index f1ba673c39e1..e4754a5a9ead 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -656,7 +656,7 @@ def forward( query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, - output_attentions=output_attentions, # Pass it through! + output_attentions=output_attentions, ) if i == 0 and self.conv is not None: @@ -759,7 +759,6 @@ def forward( sequence_output = encoder_outputs.last_hidden_state if self.z_steps > 1: - # Get the second-to-last hidden state if available if encoder_outputs.hidden_states and len(encoder_outputs.hidden_states) >= 2: hidden_states = encoder_outputs.hidden_states[-2] else: @@ -778,7 +777,7 @@ def forward( query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, - output_attentions=kwargs.get("output_attentions", False), # Pass it here too! 
+ output_attentions=kwargs.get("output_attentions", False), ) sequence_output = query_states From c8f5b2a1669bee594c2e58214090fc4d4069a943 Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:29:52 +0530 Subject: [PATCH 05/13] fix output_attentions arg --- src/transformers/models/deberta_v2/modeling_deberta_v2.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index e4754a5a9ead..37e9874c7694 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -634,6 +634,7 @@ def forward( attention_mask, query_states=None, relative_pos=None, + output_attentions=None, **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: @@ -643,9 +644,6 @@ def forward( attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - # Extract output_attentions from kwargs - output_attentions = kwargs.get("output_attentions", False) - next_kv = hidden_states rel_embeddings = self.get_rel_embedding() From 75ff815408015a2d06f1dfe852304c480a672c50 Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:32:58 +0530 Subject: [PATCH 06/13] fix output_attentions arg --- src/transformers/models/deberta_v2/modeling_deberta_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 37e9874c7694..1762102d6b2a 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -632,9 +632,9 @@ def forward( self, hidden_states, attention_mask, + output_attentions=False, query_states=None, relative_pos=None, - output_attentions=None, **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: From e6be956abc00f8a9e07cc35d0019467efc26a165 Mon Sep 17 00:00:00 2001 From: Siddhartha Pradhan <102735487+Siddhartha7340@users.noreply.github.com> Date: Tue, 17 Feb 2026 12:18:57 +0530 Subject: [PATCH 07/13] refactor efficientnet output tracing with @capture_outputs and @can_return_tuple decorators --- .../efficientnet/modeling_efficientnet.py | 50 +++++-------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 5a058ea17883..30d235ddcb8d 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_efficientnet import EfficientNetConfig @@ -404,26 +405,16 @@ def round_repeats(repeats): def forward( self, hidden_states: torch.FloatTensor, - output_hidden_states: bool | None = False, - return_dict: bool | None = True, ) -> BaseModelOutputWithNoAttention: - all_hidden_states = (hidden_states,) if output_hidden_states else None - for block in self.blocks: hidden_states = block(hidden_states) - if output_hidden_states: - all_hidden_states += (hidden_states,) hidden_states = 
self.top_conv(hidden_states) hidden_states = self.top_bn(hidden_states) hidden_states = self.top_activation(hidden_states) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention( last_hidden_state=hidden_states, - hidden_states=all_hidden_states, ) @@ -434,6 +425,9 @@ class EfficientNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["EfficientNetBlock"] + _can_record_outputs = { + "hidden_states": EfficientNetBlock, + } @torch.no_grad() def _init_weights(self, module: nn.Module): @@ -467,42 +461,29 @@ def __init__(self, config: EfficientNetConfig): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") embedding_output = self.embeddings(pixel_values) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + encoder_outputs = self.encoder(embedding_output) + last_hidden_state = encoder_outputs.last_hidden_state + # Apply pooling - last_hidden_state = encoder_outputs[0] pooled_output = self.pooler(last_hidden_state) # Reshape (batch_size, 1280, 1 , 1) -> (batch_size, 1280) pooled_output = pooled_output.reshape(pooled_output.shape[:2]) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -525,13 +506,12 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -540,11 +520,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.efficientnet(pixel_values, **kwargs) - outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) - - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -552,10 +530,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, From 2f9a2dbd614105ecbe71f7824f6ef782db1a10d6 Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi <73585231+pdwi2020@users.noreply.github.com> Date: Thu, 19 Feb 2026 20:38:44 +0530 Subject: [PATCH 08/13] Fix repository copy checks after ResNet output refactor --- .../models/regnet/modeling_regnet.py | 33 +++++-------------- .../models/rt_detr/modeling_rt_detr_resnet.py | 1 + 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index f8db0b166b92..7ed6b6b66590 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -294,36 +294,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -347,13 +335,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -361,11 +349,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.regnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -374,10 +363,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py index a79afaf88b1e..89a45d8e21f1 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -302,6 +302,7 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"] + _can_record_outputs = {"hidden_states": RTDetrResNetStage} @torch.no_grad() def _init_weights(self, module): From 6e022f676d0bb605cf4dd82f2e836995b05d2f27 Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi <73585231+pdwi2020@users.noreply.github.com> Date: Thu, 19 Feb 2026 20:43:07 +0530 Subject: [PATCH 09/13] Fix missing RegNet decorator imports after copy sync --- src/transformers/models/regnet/modeling_regnet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index 7ed6b6b66590..5a291bdbaf50 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_regnet import RegNetConfig From 411c2c7ddc8784fb150be167623747beb2c91ede Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi <73585231+pdwi2020@users.noreply.github.com> Date: Thu, 19 Feb 2026 21:18:16 +0530 Subject: [PATCH 10/13] Enable hidden-state capture for RegNet outputs --- src/transformers/models/regnet/modeling_regnet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index 5a291bdbaf50..19f8e1bc07eb 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -262,6 +262,7 @@ class RegNetPreTrainedModel(PreTrainedModel): base_model_prefix = "regnet" main_input_name = "pixel_values" _no_split_modules = ["RegNetYLayer"] + _can_record_outputs = {"hidden_states": RegNetStage} @torch.no_grad() def _init_weights(self, module): From af28f254830c0bdf9b47350745730e7a9c44a38b Mon Sep 17 00:00:00 2001 From: chandan shah Date: Sun, 15 Mar 2026 21:18:01 +0545 Subject: [PATCH 11/13] Refactor gptj output tracing to use @capture_outputs and @can_return_tuple Migrate the GPT-J model to 
use the new standardized output collection decorators, replacing manual accumulation of hidden states and attention weights with hook-based capturing. Changes: - Add `_can_record_outputs` to `GPTJPreTrainedModel` mapping hidden_states to GPTJBlock and attentions to GPTJAttention - Add `@capture_outputs` and `@merge_with_config_defaults` to `GPTJModel.forward()` - Add `@can_return_tuple` to all task head models (ForCausalLM, ForSequenceClassification, ForQuestionAnswering) - Remove `output_attentions`, `output_hidden_states`, and `return_dict` parameters from all forward signatures - Remove manual accumulator loops and return_dict branching - Simplify GPTJBlock to return plain `torch.Tensor` instead of tuple - Update attention forward signatures to always return `(attn_output, attn_weights)` without conditional logic Resolves huggingface/transformers#43979 --- src/transformers/models/gptj/modeling_gptj.py | 146 +++++------------- 1 file changed, 38 insertions(+), 108 deletions(-) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 007cc6fd9822..e86219124887 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -33,7 +33,10 @@ SequenceClassifierOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.generic import merge_with_config_defaults +from ...utils.output_capturing import capture_outputs from .configuration_gptj import GPTJConfig @@ -172,13 +175,8 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, cache_position: torch.LongTensor | None = None, - ) -> ( - tuple[torch.Tensor, tuple[torch.Tensor]] - | tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]] - | None - ): + ) -> tuple[torch.Tensor, torch.Tensor]: query = self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) @@ -253,13 +251,8 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, cache_position: torch.LongTensor | None = None, - ) -> ( - tuple[torch.Tensor, tuple[torch.Tensor]] - | tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]] - | None - ): + ) -> tuple[torch.Tensor, torch.Tensor]: query = self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) @@ -406,24 +399,22 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, cache_position: torch.LongTensor | None = None, - ) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None: + ) -> torch.Tensor: residual = hidden_states hidden_states = self.ln_1(hidden_states) - attn_outputs, attn_weights = self.attn( + attn_outputs, _ = self.attn( hidden_states=hidden_states, layer_past=layer_past, attention_mask=attention_mask, position_ids=position_ids, use_cache=use_cache, - output_attentions=output_attentions, cache_position=cache_position, ) feed_forward_hidden_states = self.mlp(hidden_states) hidden_states = attn_outputs + feed_forward_hidden_states + 
residual - return hidden_states, attn_weights + return hidden_states @auto_docstring @@ -435,6 +426,10 @@ class GPTJPreTrainedModel(PreTrainedModel): _skip_keys_device_placement = "past_key_values" _supports_flash_attn = True _can_compile_fullgraph = True + _can_record_outputs = { + "hidden_states": GPTJBlock, + "attentions": GPTJAttention, + } def _init_weights(self, module): super()._init_weights(module) @@ -465,6 +460,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings + @merge_with_config_defaults + @capture_outputs @auto_docstring def forward( self, @@ -475,35 +472,18 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, - **kwargs, - ) -> tuple | BaseModelOutputWithPast: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - if inputs_embeds is None: inputs_embeds = self.wte(input_ids) @@ -539,43 +519,22 @@ def forward( hidden_states = self.drop(hidden_states) output_shape = (-1, seq_length, hidden_states.size(-1)) - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, block in enumerate(self.h): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - outputs = block( + for block in self.h: + hidden_states = block( hidden_states, layer_past=past_key_values, attention_mask=causal_mask, position_ids=position_ids, use_cache=use_cache, - output_attentions=output_attentions, cache_position=cache_position, ) - hidden_states = outputs[0] - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[1],) - hidden_states = self.ln_f(hidden_states) - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None - ) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=past_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attentions, ) @@ -595,6 +554,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -606,13 +566,10 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, - **kwargs, - ) -> tuple | CausalLMOutputWithPast: + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. 
This @@ -623,9 +580,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( + outputs: BaseModelOutputWithPast = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, @@ -633,13 +588,11 @@ def forward( position_ids=position_ids, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, cache_position=cache_position, + **kwargs, ) - hidden_states = transformer_outputs[0] + hidden_states = outputs.last_hidden_state # Only compute necessary logits, and do not upcast them to float if we are not computing the loss slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) @@ -648,16 +601,12 @@ def forward( if labels is not None: loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) - if not return_dict: - output = (logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - return CausalLMOutputWithPast( loss=loss, logits=logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, ) @@ -685,6 +634,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -696,11 +646,8 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, - ) -> tuple | SequenceClassifierOutputWithPast: + **kwargs: Unpack[TransformersKwargs], + ) -> SequenceClassifierOutputWithPast: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This @@ -711,8 +658,6 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, @@ -721,11 +666,9 @@ def forward( position_ids=position_ids, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - hidden_states = transformer_outputs[0] + hidden_states = transformer_outputs.last_hidden_state logits = self.score(hidden_states) if input_ids is not None: @@ -774,9 +717,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, @@ -798,6 +738,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -808,31 +749,24 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, - ) -> tuple | QuestionAnsweringModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> QuestionAnsweringModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.transformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -856,10 +790,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From b9756993ac16895095ad95929efdcb47ef460560 Mon Sep 17 00:00:00 2001 From: chandan shah Date: Mon, 16 Mar 2026 11:04:30 +0545 Subject: [PATCH 12/13] Sync CodeGenBlock copy with updated GPTJBlock The CodeGenBlock is a documented copy of GPTJBlock. This syncs it to match the updated signature after removing output_attentions parameter and simplifying the return type to plain torch.Tensor. Generated via `python utils/check_copies.py --fix_and_overwrite`. 
--- src/transformers/models/codegen/modeling_codegen.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index ccc540207789..faf10a317ea0 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -245,24 +245,22 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, cache_position: torch.LongTensor | None = None, - ) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None: + ) -> torch.Tensor: residual = hidden_states hidden_states = self.ln_1(hidden_states) - attn_outputs, attn_weights = self.attn( + attn_outputs, _ = self.attn( hidden_states=hidden_states, layer_past=layer_past, attention_mask=attention_mask, position_ids=position_ids, use_cache=use_cache, - output_attentions=output_attentions, cache_position=cache_position, ) feed_forward_hidden_states = self.mlp(hidden_states) hidden_states = attn_outputs + feed_forward_hidden_states + residual - return hidden_states, attn_weights + return hidden_states @auto_docstring From d12abc7fdaa35016d16d9ca25e59635c2f880aaf Mon Sep 17 00:00:00 2001 From: chandan shah Date: Mon, 16 Mar 2026 12:21:54 +0545 Subject: [PATCH 13/13] Fix CodeGenBlock copy sync: restore original forward signature The previous commit auto-synced CodeGenBlock.forward() with the refactored GPTJBlock, but CodeGenModel still passes output_attentions to CodeGenBlock and expects a tuple return. Since the CodeGen model has not been refactored to use the new decorators yet, restore CodeGenBlock's original forward() signature and remove the '# Copied from' directive to decouple it from GPTJBlock until CodeGen gets its own output tracing refactor. 
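An illustrative sketch of the contract mismatch (hypothetical stand-in modules, not
repository code): a caller written for the old two-value return cannot drive a block
that now returns a bare tensor and no longer accepts `output_attentions`.

import torch
import torch.nn as nn

class TupleReturningBlock(nn.Module):
    # pre-refactor CodeGen contract: forward(..., output_attentions=...) -> (hidden_states, attn_weights)
    def forward(self, hidden_states, output_attentions=False):
        attn_weights = torch.zeros(1) if output_attentions else None
        return hidden_states + 1.0, attn_weights

class TensorReturningBlock(nn.Module):
    # post-refactor GPT-J contract: forward(...) -> hidden_states only
    def forward(self, hidden_states):
        return hidden_states + 1.0

def caller_loop(block, hidden_states):
    # mirrors how CodeGenModel still consumes its blocks: tuple indexing plus an
    # output_attentions keyword, as described above
    outputs = block(hidden_states, output_attentions=True)
    return outputs[0]

x = torch.zeros(2, 4)
print(caller_loop(TupleReturningBlock(), x).sum().item())  # 8.0, works as before
try:
    caller_loop(TensorReturningBlock(), x)
except TypeError as err:
    print(f"synced signature breaks the existing caller: {err}")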
--- src/transformers/models/codegen/modeling_codegen.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index faf10a317ea0..419e1ed0f86a 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -228,9 +228,7 @@ def forward(self, hidden_states: torch.FloatTensor | None) -> torch.FloatTensor: return hidden_states -# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->CodeGen class CodeGenBlock(GradientCheckpointingLayer): - # Ignore copy def __init__(self, config, layer_idx=None): super().__init__() inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd @@ -245,22 +243,24 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, + output_attentions: bool | None = False, cache_position: torch.LongTensor | None = None, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None: residual = hidden_states hidden_states = self.ln_1(hidden_states) - attn_outputs, _ = self.attn( + attn_outputs, attn_weights = self.attn( hidden_states=hidden_states, layer_past=layer_past, attention_mask=attention_mask, position_ids=position_ids, use_cache=use_cache, + output_attentions=output_attentions, cache_position=cache_position, ) feed_forward_hidden_states = self.mlp(hidden_states) hidden_states = attn_outputs + feed_forward_hidden_states + residual - return hidden_states + return hidden_states, attn_weights @auto_docstring
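A minimal usage sketch of the interface this series converges on. It is illustrative
only and assumes the behaviour described in the commit messages: `@capture_outputs`
records one tensor per module listed in `_can_record_outputs` when
`output_hidden_states=True` is passed as a keyword argument, and `@can_return_tuple`
restores the legacy tuple form when `return_dict=False` is requested. The randomly
initialised default config is just for demonstration.

import torch
from transformers import ResNetConfig, ResNetForImageClassification, ResNetModel

config = ResNetConfig(num_labels=3)              # default architecture, random weights
model = ResNetModel(config).eval()
pixel_values = torch.randn(1, config.num_channels, 224, 224)

with torch.no_grad():
    # output_hidden_states is consumed by @capture_outputs, which records the output of
    # every module named in _can_record_outputs (each ResNetStage) via forward hooks
    outputs = model(pixel_values, output_hidden_states=True)

print(type(outputs).__name__)            # BaseModelOutputWithPoolingAndNoAttention
print(outputs.last_hidden_state.shape)   # torch.Size([1, 2048, 7, 7]) with the default config
print(len(outputs.hidden_states))        # one entry per recorded ResNetStage

# task heads keep the legacy tuple interface through @can_return_tuple
classifier = ResNetForImageClassification(config).eval()
with torch.no_grad():
    logits_and_states = classifier(pixel_values, return_dict=False)  # plain tuple, not a ModelOutput

The same keyword-driven convention applies to the text models touched later in the
series (DeBERTa-v2, GPT-J): their forwards now accept `**kwargs: Unpack[TransformersKwargs]`
in place of explicit `output_attentions` / `output_hidden_states` / `return_dict` parameters.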