From fc58a7bd3ec5a2de28cc0b747df6794e49118f32 Mon Sep 17 00:00:00 2001 From: Pranay Beela Date: Sat, 14 Feb 2026 12:40:34 -0500 Subject: [PATCH 01/12] Refactor FNet and CVT output tracing --- src/transformers/models/cvt/modeling_cvt.py | 56 ++++---- src/transformers/models/fnet/modeling_fnet.py | 122 ++++-------------- 2 files changed, 44 insertions(+), 134 deletions(-) diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 88de46f81591..bccc726e6e47 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -23,7 +23,7 @@ from ... import initialization as init from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging from .configuration_cvt import CvtConfig @@ -461,23 +461,15 @@ def __init__(self, config): for stage_idx in range(len(config.depth)): self.stages.append(CvtStage(config, stage_idx)) - def forward(self, pixel_values, output_hidden_states=False, return_dict=True): - all_hidden_states = () if output_hidden_states else None + def forward(self, pixel_values): hidden_state = pixel_values - cls_token = None - for _, (stage_module) in enumerate(self.stages): + for stage_module in self.stages: hidden_state, cls_token = stage_module(hidden_state) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_state,) - - if not return_dict: - return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) return BaseModelOutputWithCLSToken( last_hidden_state=hidden_state, cls_token_value=cls_token, - hidden_states=all_hidden_states, ) @@ -519,36 +511,42 @@ def __init__(self, config, add_pooling_layer=True): self.encoder = CvtEncoder(config) self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, output_hidden_states: bool | None = None, + output_attentions: bool | None = None, return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithCLSToken: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") - encoder_outputs = self.encoder( - pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] + # Manually collect hidden states from encoder stages + all_hidden_states = () if output_hidden_states else None + hidden_state = pixel_values + cls_token = None + + for stage_module in self.encoder.stages: + hidden_state, cls_token = stage_module(hidden_state) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) return BaseModelOutputWithCLSToken( - last_hidden_state=sequence_output, - cls_token_value=encoder_outputs.cls_token_value, - hidden_states=encoder_outputs.hidden_states, + last_hidden_state=hidden_state, + cls_token_value=cls_token, + hidden_states=all_hidden_states, ) @@ -573,13 +571,12 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -588,12 +585,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.cvt( - pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + outputs = self.cvt(pixel_values, **kwargs) sequence_output = outputs[0] cls_token = outputs[1] @@ -631,10 +623,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 2fe0823dd8fe..a99ff2b8a5c3 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -21,7 +21,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ... import initialization as init -from ...utils import auto_docstring, is_scipy_available +from ...utils import auto_docstring, can_return_tuple, is_scipy_available +from ...utils.output_capturing import capture_outputs if is_scipy_available(): @@ -265,24 +266,12 @@ def __init__(self, config): self.layer = nn.ModuleList([FNetLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - def forward(self, hidden_states, output_hidden_states=False, return_dict=True): - all_hidden_states = () if output_hidden_states else None - - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - + def forward(self, hidden_states): + for layer_module in self.layer: layer_outputs = layer_module(hidden_states) - hidden_states = layer_outputs[0] - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - - return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + return BaseModelOutput(last_hidden_state=hidden_states) # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->FNet @@ -371,6 +360,9 @@ class FNetPreTrainedModel(PreTrainedModel): config: FNetConfig base_model_prefix = "fnet" supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": FNetLayer, + } def _init_weights(self, module): super()._init_weights(module) @@ -434,6 +426,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple + @capture_outputs @auto_docstring def forward( self, @@ -441,15 +435,8 @@ def forward( token_type_ids: torch.LongTensor | None = None, position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutput: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -487,18 +474,11 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + encoder_outputs = self.encoder(embedding_output) sequence_output = encoder_outputs[0] pooler_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooler_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooler_output, @@ -534,6 +514,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @auto_docstring def forward( self, @@ -543,8 +524,6 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | FNetForPreTrainingOutput: r""" @@ -572,15 +551,12 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output, pooled_output = outputs[:2] @@ -593,10 +569,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return FNetForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -628,6 +600,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @auto_docstring def forward( self, @@ -636,8 +609,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | MaskedLMOutput: r""" @@ -646,15 +617,12 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -665,10 +633,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput(loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states) @@ -687,6 +651,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -695,8 +660,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | NextSentencePredictorOutput: r""" @@ -722,16 +685,11 @@ def forward( >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random ```""" - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) pooled_output = outputs[1] @@ -743,10 +701,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_scores,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, @@ -772,6 +726,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -780,8 +735,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | SequenceClassifierOutput: r""" @@ -790,15 +743,11 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) pooled_output = outputs[1] @@ -827,9 +776,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -846,6 +792,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -854,8 +801,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | MultipleChoiceModelOutput: r""" @@ -888,7 +833,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -905,8 +849,7 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) pooled_output = outputs[1] @@ -920,10 +863,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput(loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states) @@ -941,6 +880,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -949,23 +889,17 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) sequence_output = outputs[0] @@ -979,10 +913,6 @@ def forward( # Only keep active parts of the loss loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -999,6 +929,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1008,19 +939,14 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1047,10 +973,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states ) From 08f02cf7a8cfb2d62a08d8c382c5160d5462fae6 Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi Date: Sun, 15 Feb 2026 12:56:24 +0530 Subject: [PATCH 02/12] Refactor ResNet output handling to decorators --- .../models/resnet/modeling_resnet.py | 46 ++++++------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 92a264014c6c..8a8b8531f6ef 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -28,7 +28,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_resnet import ResNetConfig @@ -248,6 +249,7 @@ class ResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["ResNetConvLayer", "ResNetShortCut"] + _can_record_outputs = {"hidden_states": ResNetStage} @torch.no_grad() def _init_weights(self, module): @@ -281,36 +283,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -333,13 +323,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -347,11 +337,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.resnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -360,10 +351,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -385,12 +372,12 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: Tensor, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BackboneOutput: r""" @@ -419,7 +406,6 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -435,12 +421,6 @@ def forward( if stage in self.out_features: feature_maps += (hidden_states[idx],) - if not return_dict: - output = (feature_maps,) - if output_hidden_states: - output += (outputs.hidden_states,) - return output - return BackboneOutput( feature_maps=feature_maps, hidden_states=outputs.hidden_states if output_hidden_states else None, From 1d5aa9c5e189b5496c7bd5526ef3872b9f5f9df2 Mon Sep 17 00:00:00 2001 From: gabrielfruet Date: Sun, 15 Feb 2026 10:43:25 -0300 Subject: [PATCH 03/12] refactor: tracing --- .../mobilenet_v2/modeling_mobilenet_v2.py | 53 ++++--------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index 7648658c3050..fd465e9c2de2 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -25,6 +25,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging +from ...utils.output_manager import can_return_tuple, capture_outputs from .configuration_mobilenet_v2 import MobileNetV2Config @@ -254,6 +255,7 @@ class MobileNetV2PreTrainedModel(PreTrainedModel): input_modalities = ("image",) supports_gradient_checkpointing = False _no_split_modules = [] + _can_record_outputs = {"hidden_states": MobileNetV2InvertedResidual} @auto_docstring @@ -323,31 +325,20 @@ def __init__(self, config: MobileNetV2Config, add_pooling_layer: bool = True): self.post_init() @auto_docstring + @capture_outputs def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") hidden_states = self.conv_stem(pixel_values) - all_hidden_states = () if output_hidden_states else None - for i, layer_module in enumerate(self.layer): hidden_states = layer_module(hidden_states) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - last_hidden_state = self.conv_1x1(hidden_states) if self.pooler is not None: @@ -355,13 +346,9 @@ def forward( else: pooled_output = None - if not return_dict: - return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None) - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=all_hidden_states, ) @@ -388,12 +375,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, labels: torch.Tensor | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -402,11 +388,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilenet_v2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.mobilenet_v2(pixel_values, **kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(self.dropout(pooled_output)) @@ -414,10 +398,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, @@ -517,12 +497,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | SemanticSegmenterOutput: r""" @@ -553,21 +532,16 @@ def forward( >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") outputs = self.mobilenet_v2( pixel_values, output_hidden_states=True, # we need the intermediate hidden states - return_dict=return_dict, + **kwargs, ) - encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + encoder_hidden_states = outputs.hidden_states logits = self.segmentation_head(encoder_hidden_states[-1]) @@ -580,17 +554,10 @@ def forward( loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) loss = loss_fct(upsampled_logits, labels) - if not return_dict: - if output_hidden_states: - output = (logits,) + outputs[1:] - else: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SemanticSegmenterOutput( loss=loss, logits=logits, - hidden_states=outputs.hidden_states if output_hidden_states else None, + hidden_states=outputs.hidden_states, attentions=None, ) From a1c15f9994ac0099a0c230ea7f526397ea94637e Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:02:29 +0530 Subject: [PATCH 04/12] add hooks to deberta_v2 --- .../models/deberta_v2/modeling_deberta_v2.py | 166 ++++++------------ 1 file changed, 53 insertions(+), 113 deletions(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 82ac99b93d7f..f1ba673c39e1 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -31,7 +31,9 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_deberta_v2 import DebertaV2Config @@ -272,8 +274,7 @@ def forward( ) new_context_layer_shape = context_layer.size()[:-2] + (-1,) context_layer = context_layer.view(new_context_layer_shape) - if not output_attentions: - return (context_layer, None) + return (context_layer, attention_probs) def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): @@ -431,8 +432,8 @@ def forward( relative_pos=None, rel_embeddings=None, output_attentions: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - attention_output, att_matrix = self.attention( + ) -> torch.Tensor: + attention_output, _ = self.attention( hidden_states, attention_mask, output_attentions=output_attentions, @@ -443,10 +444,7 @@ def forward( intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - if output_attentions: - return (layer_output, att_matrix) - else: - return (layer_output, None) + return layer_output class ConvLayer(nn.Module): @@ -634,11 +632,9 @@ def forward( self, hidden_states, attention_mask, - output_hidden_states=True, - output_attentions=False, query_states=None, relative_pos=None, - return_dict=True, + **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: input_mask = attention_mask @@ -647,30 +643,25 @@ def forward( attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - all_hidden_states: tuple[torch.Tensor] | None = (hidden_states,) if output_hidden_states else None - all_attentions = () if output_attentions else None + # Extract output_attentions from kwargs + output_attentions = kwargs.get("output_attentions", False) next_kv = hidden_states rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): - output_states, attn_weights = layer_module( + output_states = layer_module( next_kv, attention_mask, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, - output_attentions=output_attentions, + output_attentions=output_attentions, # Pass it through! ) - if output_attentions: - all_attentions = all_attentions + (attn_weights,) - if i == 0 and self.conv is not None: output_states = self.conv(hidden_states, output_states, input_mask) - if output_hidden_states: - all_hidden_states = all_hidden_states + (output_states,) - if query_states is not None: query_states = output_states if isinstance(hidden_states, Sequence): @@ -678,11 +669,7 @@ def forward( else: next_kv = output_states - if not return_dict: - return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions - ) + return BaseModelOutput(last_hidden_state=output_states) @auto_docstring @@ -691,6 +678,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel): base_model_prefix = "deberta" _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": DebertaV2Layer, + "attentions": DisentangledSelfAttention, + } @torch.no_grad() def _init_weights(self, module): @@ -721,6 +712,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings + @capture_outputs @auto_docstring def forward( self, @@ -729,17 +721,8 @@ def forward( token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutput: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -768,38 +751,41 @@ def forward( encoder_outputs = self.encoder( embedding_output, attention_mask, - output_hidden_states=True, - output_attentions=output_attentions, - return_dict=return_dict, + query_states=None, + relative_pos=None, + **kwargs, ) - encoded_layers = encoder_outputs[1] + + sequence_output = encoder_outputs.last_hidden_state if self.z_steps > 1: - hidden_states = encoded_layers[-2] + # Get the second-to-last hidden state if available + if encoder_outputs.hidden_states and len(encoder_outputs.hidden_states) >= 2: + hidden_states = encoder_outputs.hidden_states[-2] + else: + hidden_states = sequence_output + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] - query_states = encoded_layers[-1] + query_states = sequence_output rel_embeddings = self.encoder.get_rel_embedding() - attention_mask = self.encoder.get_attention_mask(attention_mask) + attention_mask_encoded = self.encoder.get_attention_mask(attention_mask) rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: query_states = layer( hidden_states, - attention_mask, - output_attentions=False, + attention_mask_encoded, query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, + output_attentions=kwargs.get("output_attentions", False), # Pass it here too! ) - encoded_layers.append(query_states) - - sequence_output = encoded_layers[-1] - if not return_dict: - return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + sequence_output = query_states return BaseModelOutput( last_hidden_state=sequence_output, - hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @@ -924,6 +910,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_predictions.lm_head.dense = new_embeddings self.lm_predictions.lm_head.bias = new_embeddings.bias + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM.forward with Deberta->DebertaV2 def forward( @@ -934,10 +921,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -946,17 +930,13 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -970,10 +950,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1036,6 +1012,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification.forward with Deberta->DebertaV2 def forward( @@ -1046,10 +1023,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1057,7 +1031,6 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.deberta( input_ids, @@ -1065,9 +1038,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1110,9 +1081,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions @@ -1133,6 +1101,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1142,16 +1111,12 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.deberta( input_ids, @@ -1159,9 +1124,7 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1174,10 +1137,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) @@ -1195,6 +1154,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward with Deberta->DebertaV2 def forward( @@ -1206,22 +1166,15 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1248,10 +1201,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1286,6 +1235,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring def forward( self, @@ -1295,10 +1245,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1306,7 +1253,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1325,9 +1271,7 @@ def forward( token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1341,10 +1285,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, From 1361b6d2e39227af01eed56942cb111498d9f9f3 Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:19:28 +0530 Subject: [PATCH 05/12] fix ruff --- src/transformers/models/deberta_v2/modeling_deberta_v2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index f1ba673c39e1..e4754a5a9ead 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -656,7 +656,7 @@ def forward( query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, - output_attentions=output_attentions, # Pass it through! + output_attentions=output_attentions, ) if i == 0 and self.conv is not None: @@ -759,7 +759,6 @@ def forward( sequence_output = encoder_outputs.last_hidden_state if self.z_steps > 1: - # Get the second-to-last hidden state if available if encoder_outputs.hidden_states and len(encoder_outputs.hidden_states) >= 2: hidden_states = encoder_outputs.hidden_states[-2] else: @@ -778,7 +777,7 @@ def forward( query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, - output_attentions=kwargs.get("output_attentions", False), # Pass it here too! + output_attentions=kwargs.get("output_attentions", False), ) sequence_output = query_states From c8f5b2a1669bee594c2e58214090fc4d4069a943 Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:29:52 +0530 Subject: [PATCH 06/12] fix output_attentions arg --- src/transformers/models/deberta_v2/modeling_deberta_v2.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index e4754a5a9ead..37e9874c7694 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -634,6 +634,7 @@ def forward( attention_mask, query_states=None, relative_pos=None, + output_attentions=None, **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: @@ -643,9 +644,6 @@ def forward( attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - # Extract output_attentions from kwargs - output_attentions = kwargs.get("output_attentions", False) - next_kv = hidden_states rel_embeddings = self.get_rel_embedding() From 75ff815408015a2d06f1dfe852304c480a672c50 Mon Sep 17 00:00:00 2001 From: Arpit Rawat Date: Mon, 16 Feb 2026 22:32:58 +0530 Subject: [PATCH 07/12] fix output_attentions arg --- src/transformers/models/deberta_v2/modeling_deberta_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 37e9874c7694..1762102d6b2a 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -632,9 +632,9 @@ def forward( self, hidden_states, attention_mask, + output_attentions=False, query_states=None, relative_pos=None, - output_attentions=None, **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: From e6be956abc00f8a9e07cc35d0019467efc26a165 Mon Sep 17 00:00:00 2001 From: Siddhartha Pradhan <102735487+Siddhartha7340@users.noreply.github.com> Date: Tue, 17 Feb 2026 12:18:57 +0530 Subject: [PATCH 08/12] refactor efficientnet output tracing with @capture_outputs and @can_return_tuple decorators --- .../efficientnet/modeling_efficientnet.py | 50 +++++-------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 5a058ea17883..30d235ddcb8d 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_efficientnet import EfficientNetConfig @@ -404,26 +405,16 @@ def round_repeats(repeats): def forward( self, hidden_states: torch.FloatTensor, - output_hidden_states: bool | None = False, - return_dict: bool | None = True, ) -> BaseModelOutputWithNoAttention: - all_hidden_states = (hidden_states,) if output_hidden_states else None - for block in self.blocks: hidden_states = block(hidden_states) - if output_hidden_states: - all_hidden_states += (hidden_states,) hidden_states = self.top_conv(hidden_states) hidden_states = self.top_bn(hidden_states) hidden_states = self.top_activation(hidden_states) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention( last_hidden_state=hidden_states, - hidden_states=all_hidden_states, ) @@ -434,6 +425,9 @@ class EfficientNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["EfficientNetBlock"] + _can_record_outputs = { + "hidden_states": EfficientNetBlock, + } @torch.no_grad() def _init_weights(self, module: nn.Module): @@ -467,42 +461,29 @@ def __init__(self, config: EfficientNetConfig): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") embedding_output = self.embeddings(pixel_values) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + encoder_outputs = self.encoder(embedding_output) + last_hidden_state = encoder_outputs.last_hidden_state + # Apply pooling - last_hidden_state = encoder_outputs[0] pooled_output = self.pooler(last_hidden_state) # Reshape (batch_size, 1280, 1 , 1) -> (batch_size, 1280) pooled_output = pooled_output.reshape(pooled_output.shape[:2]) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -525,13 +506,12 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -540,11 +520,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.efficientnet(pixel_values, **kwargs) - outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) - - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -552,10 +530,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, From 2f9a2dbd614105ecbe71f7824f6ef782db1a10d6 Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi <73585231+pdwi2020@users.noreply.github.com> Date: Thu, 19 Feb 2026 20:38:44 +0530 Subject: [PATCH 09/12] Fix repository copy checks after ResNet output refactor --- .../models/regnet/modeling_regnet.py | 33 +++++-------------- .../models/rt_detr/modeling_rt_detr_resnet.py | 1 + 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index f8db0b166b92..7ed6b6b66590 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -294,36 +294,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -347,13 +335,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -361,11 +349,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.regnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -374,10 +363,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py index a79afaf88b1e..89a45d8e21f1 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -302,6 +302,7 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"] + _can_record_outputs = {"hidden_states": RTDetrResNetStage} @torch.no_grad() def _init_weights(self, module): From 6e022f676d0bb605cf4dd82f2e836995b05d2f27 Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi <73585231+pdwi2020@users.noreply.github.com> Date: Thu, 19 Feb 2026 20:43:07 +0530 Subject: [PATCH 10/12] Fix missing RegNet decorator imports after copy sync --- src/transformers/models/regnet/modeling_regnet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index 7ed6b6b66590..5a291bdbaf50 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_regnet import RegNetConfig From 411c2c7ddc8784fb150be167623747beb2c91ede Mon Sep 17 00:00:00 2001 From: Paritosh Dwivedi <73585231+pdwi2020@users.noreply.github.com> Date: Thu, 19 Feb 2026 21:18:16 +0530 Subject: [PATCH 11/12] Enable hidden-state capture for RegNet outputs --- src/transformers/models/regnet/modeling_regnet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index 5a291bdbaf50..19f8e1bc07eb 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -262,6 +262,7 @@ class RegNetPreTrainedModel(PreTrainedModel): base_model_prefix = "regnet" main_input_name = "pixel_values" _no_split_modules = ["RegNetYLayer"] + _can_record_outputs = {"hidden_states": RegNetStage} @torch.no_grad() def _init_weights(self, module): From 4f613b26faf2de1b7d64fc6b5e932aa383fd18e1 Mon Sep 17 00:00:00 2001 From: evalstate <1936278+evalstate@users.noreply.github.com> Date: Tue, 21 Apr 2026 18:34:03 +0100 Subject: [PATCH 12/12] Fix leftover conflict markers in MobileNetV2 merge --- .../models/mobilenet_v2/modeling_mobilenet_v2.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index ee7b8be9a4ce..0aee736d8186 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -331,14 +331,6 @@ def forward( pixel_values: torch.Tensor | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: -<<<<<<< HEAD - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - -======= ->>>>>>> pr-44013 if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -540,14 +532,6 @@ def forward( >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits ```""" -<<<<<<< HEAD - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - -======= ->>>>>>> pr-44013 if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one")