diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 3ceccb7d1dab..bccc726e6e47 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -23,7 +23,7 @@ from ... import initialization as init from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging from .configuration_cvt import CvtConfig @@ -461,23 +461,15 @@ def __init__(self, config): for stage_idx in range(len(config.depth)): self.stages.append(CvtStage(config, stage_idx)) - def forward(self, pixel_values, output_hidden_states=False, return_dict=True): - all_hidden_states = () if output_hidden_states else None + def forward(self, pixel_values): hidden_state = pixel_values - cls_token = None - for _, (stage_module) in enumerate(self.stages): + for stage_module in self.stages: hidden_state, cls_token = stage_module(hidden_state) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_state,) - - if not return_dict: - return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) return BaseModelOutputWithCLSToken( last_hidden_state=hidden_state, cls_token_value=cls_token, - hidden_states=all_hidden_states, ) @@ -519,36 +511,42 @@ def __init__(self, config, add_pooling_layer=True): self.encoder = CvtEncoder(config) self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, output_hidden_states: bool | None = None, + output_attentions: bool | None = None, return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithCLSToken: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") - encoder_outputs = self.encoder( - pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] + # Manually collect hidden states from encoder stages + all_hidden_states = () if output_hidden_states else None + hidden_state = pixel_values + cls_token = None + + for stage_module in self.encoder.stages: + hidden_state, cls_token = stage_module(hidden_state) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) return BaseModelOutputWithCLSToken( - last_hidden_state=sequence_output, - cls_token_value=encoder_outputs.cls_token_value, - hidden_states=encoder_outputs.hidden_states, + last_hidden_state=hidden_state, + cls_token_value=cls_token, + hidden_states=all_hidden_states, ) @@ -573,13 +571,12 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -588,12 +585,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.cvt( - pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + outputs = self.cvt(pixel_values, **kwargs) sequence_output = outputs[0] cls_token = outputs[1] @@ -631,10 +623,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index b30cbd6342dc..e0c0d87ab98d 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -31,7 +31,9 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_deberta_v2 import DebertaV2Config @@ -269,8 +271,7 @@ def forward( ) new_context_layer_shape = context_layer.size()[:-2] + (-1,) context_layer = context_layer.view(new_context_layer_shape) - if not output_attentions: - return (context_layer, None) + return (context_layer, attention_probs) def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): @@ -428,8 +429,8 @@ def forward( relative_pos=None, rel_embeddings=None, output_attentions: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - attention_output, att_matrix = self.attention( + ) -> torch.Tensor: + attention_output, _ = self.attention( hidden_states, attention_mask, output_attentions=output_attentions, @@ -440,10 +441,7 @@ def forward( intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - if output_attentions: - return (layer_output, att_matrix) - else: - return (layer_output, None) + return layer_output class ConvLayer(nn.Module): @@ -631,11 +629,10 @@ def forward( self, hidden_states, attention_mask, - output_hidden_states=True, output_attentions=False, query_states=None, relative_pos=None, - return_dict=True, + **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: input_mask = attention_mask @@ -644,13 +641,11 @@ def forward( attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - all_hidden_states: tuple[torch.Tensor] | None = (hidden_states,) if output_hidden_states else None - all_attentions = () if output_attentions else None - next_kv = hidden_states rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): - output_states, attn_weights = layer_module( + output_states = layer_module( next_kv, attention_mask, query_states=query_states, @@ -659,15 +654,9 @@ def forward( output_attentions=output_attentions, ) - if output_attentions: - all_attentions = all_attentions + (attn_weights,) - if i == 0 and self.conv is not None: output_states = self.conv(hidden_states, output_states, input_mask) - if output_hidden_states: - all_hidden_states = all_hidden_states + (output_states,) - if query_states is not None: query_states = output_states if isinstance(hidden_states, Sequence): @@ -675,11 +664,7 @@ def forward( else: next_kv = output_states - if not return_dict: - return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions - ) + return BaseModelOutput(last_hidden_state=output_states) @auto_docstring @@ -688,6 +673,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel): base_model_prefix = "deberta" _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": DebertaV2Layer, + "attentions": DisentangledSelfAttention, + } @torch.no_grad() def _init_weights(self, module): @@ -718,6 +707,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings + @capture_outputs @auto_docstring def forward( self, @@ -726,17 +716,8 @@ def forward( token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutput: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -765,38 +746,40 @@ def forward( encoder_outputs = self.encoder( embedding_output, attention_mask, - output_hidden_states=True, - output_attentions=output_attentions, - return_dict=return_dict, + query_states=None, + relative_pos=None, + **kwargs, ) - encoded_layers = encoder_outputs[1] + + sequence_output = encoder_outputs.last_hidden_state if self.z_steps > 1: - hidden_states = encoded_layers[-2] + if encoder_outputs.hidden_states and len(encoder_outputs.hidden_states) >= 2: + hidden_states = encoder_outputs.hidden_states[-2] + else: + hidden_states = sequence_output + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] - query_states = encoded_layers[-1] + query_states = sequence_output rel_embeddings = self.encoder.get_rel_embedding() - attention_mask = self.encoder.get_attention_mask(attention_mask) + attention_mask_encoded = self.encoder.get_attention_mask(attention_mask) rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: query_states = layer( hidden_states, - attention_mask, - output_attentions=False, + attention_mask_encoded, query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, + output_attentions=kwargs.get("output_attentions", False), ) - encoded_layers.append(query_states) - - sequence_output = encoded_layers[-1] - if not return_dict: - return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + sequence_output = query_states return BaseModelOutput( last_hidden_state=sequence_output, - hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @@ -921,6 +904,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_predictions.lm_head.dense = new_embeddings self.lm_predictions.lm_head.bias = new_embeddings.bias + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM.forward with Deberta->DebertaV2 def forward( @@ -931,10 +915,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -943,17 +924,13 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -967,10 +944,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1033,6 +1006,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification.forward with Deberta->DebertaV2 def forward( @@ -1043,10 +1017,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1054,7 +1025,6 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1062,9 +1032,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1107,9 +1075,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions @@ -1130,6 +1095,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1139,16 +1105,12 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1156,9 +1118,7 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1171,10 +1131,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) @@ -1192,6 +1148,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward with Deberta->DebertaV2 def forward( @@ -1203,22 +1160,15 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1245,10 +1195,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1283,6 +1229,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring def forward( self, @@ -1292,10 +1239,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1303,7 +1247,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1322,9 +1265,7 @@ def forward( token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1338,10 +1279,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 5a71c95205c5..30d235ddcb8d 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_efficientnet import EfficientNetConfig @@ -404,26 +405,16 @@ def round_repeats(repeats): def forward( self, hidden_states: torch.FloatTensor, - output_hidden_states: bool | None = False, - return_dict: bool | None = True, ) -> BaseModelOutputWithNoAttention: - all_hidden_states = (hidden_states,) if output_hidden_states else None - for block in self.blocks: hidden_states = block(hidden_states) - if output_hidden_states: - all_hidden_states += (hidden_states,) hidden_states = self.top_conv(hidden_states) hidden_states = self.top_bn(hidden_states) hidden_states = self.top_activation(hidden_states) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention( last_hidden_state=hidden_states, - hidden_states=all_hidden_states, ) @@ -434,6 +425,9 @@ class EfficientNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["EfficientNetBlock"] + _can_record_outputs = { + "hidden_states": EfficientNetBlock, + } @torch.no_grad() def _init_weights(self, module: nn.Module): @@ -467,42 +461,29 @@ def __init__(self, config: EfficientNetConfig): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") embedding_output = self.embeddings(pixel_values) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + encoder_outputs = self.encoder(embedding_output) + last_hidden_state = encoder_outputs.last_hidden_state + # Apply pooling - last_hidden_state = encoder_outputs[0] pooled_output = self.pooler(last_hidden_state) # Reshape (batch_size, 1280, 1 , 1) -> (batch_size, 1280) pooled_output = pooled_output.reshape(pooled_output.shape[:2]) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -525,13 +506,12 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -540,11 +520,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict + outputs = self.efficientnet(pixel_values, **kwargs) - outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) - - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -552,10 +530,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index c5b08bd94050..a99ff2b8a5c3 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -21,7 +21,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ... import initialization as init -from ...utils import auto_docstring, is_scipy_available +from ...utils import auto_docstring, can_return_tuple, is_scipy_available +from ...utils.output_capturing import capture_outputs if is_scipy_available(): @@ -265,24 +266,12 @@ def __init__(self, config): self.layer = nn.ModuleList([FNetLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - def forward(self, hidden_states, output_hidden_states=False, return_dict=True): - all_hidden_states = () if output_hidden_states else None - - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - + def forward(self, hidden_states): + for layer_module in self.layer: layer_outputs = layer_module(hidden_states) - hidden_states = layer_outputs[0] - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - - return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + return BaseModelOutput(last_hidden_state=hidden_states) # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->FNet @@ -371,6 +360,9 @@ class FNetPreTrainedModel(PreTrainedModel): config: FNetConfig base_model_prefix = "fnet" supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": FNetLayer, + } def _init_weights(self, module): super()._init_weights(module) @@ -434,6 +426,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple + @capture_outputs @auto_docstring def forward( self, @@ -441,15 +435,8 @@ def forward( token_type_ids: torch.LongTensor | None = None, position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutput: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -487,18 +474,11 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + encoder_outputs = self.encoder(embedding_output) sequence_output = encoder_outputs[0] pooler_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooler_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooler_output, @@ -534,6 +514,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @auto_docstring def forward( self, @@ -543,8 +524,6 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | FNetForPreTrainingOutput: r""" @@ -572,15 +551,12 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output, pooled_output = outputs[:2] @@ -593,10 +569,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return FNetForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -628,6 +600,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @auto_docstring def forward( self, @@ -636,8 +609,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | MaskedLMOutput: r""" @@ -646,15 +617,12 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -665,10 +633,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput(loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states) @@ -687,6 +651,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -695,8 +660,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | NextSentencePredictorOutput: r""" @@ -722,16 +685,11 @@ def forward( >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random ```""" - - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) pooled_output = outputs[1] @@ -743,10 +701,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_scores,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, @@ -772,6 +726,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -780,8 +735,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | SequenceClassifierOutput: r""" @@ -790,15 +743,11 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) pooled_output = outputs[1] @@ -827,9 +776,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -846,6 +792,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -854,8 +801,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | MultipleChoiceModelOutput: r""" @@ -888,7 +833,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -905,8 +849,7 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) pooled_output = outputs[1] @@ -920,10 +863,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput(loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states) @@ -941,6 +880,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -949,23 +889,17 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) sequence_output = outputs[0] @@ -979,10 +913,6 @@ def forward( # Only keep active parts of the loss loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -999,6 +929,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1008,19 +939,14 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1047,10 +973,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states ) diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index a9b8d92cb589..0aee736d8186 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -24,7 +24,8 @@ SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_mobilenet_v2 import MobileNetV2Config @@ -254,6 +255,7 @@ class MobileNetV2PreTrainedModel(PreTrainedModel): input_modalities = ("image",) supports_gradient_checkpointing = False _no_split_modules = [] + _can_record_outputs = {"hidden_states": MobileNetV2InvertedResidual} @auto_docstring @@ -323,31 +325,20 @@ def __init__(self, config: MobileNetV2Config, add_pooling_layer: bool = True): self.post_init() @auto_docstring + @capture_outputs def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") hidden_states = self.conv_stem(pixel_values) - all_hidden_states = () if output_hidden_states else None - for i, layer_module in enumerate(self.layer): hidden_states = layer_module(hidden_states) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - last_hidden_state = self.conv_1x1(hidden_states) if self.pooler is not None: @@ -355,13 +346,9 @@ def forward( else: pooled_output = None - if not return_dict: - return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None) - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=all_hidden_states, ) @@ -388,12 +375,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, labels: torch.Tensor | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -402,11 +388,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.mobilenet_v2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.mobilenet_v2(pixel_values, **kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(self.dropout(pooled_output)) @@ -414,10 +398,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, @@ -517,12 +497,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | SemanticSegmenterOutput: r""" @@ -553,21 +532,16 @@ def forward( >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") outputs = self.mobilenet_v2( pixel_values, output_hidden_states=True, # we need the intermediate hidden states - return_dict=return_dict, + **kwargs, ) - encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + encoder_hidden_states = outputs.hidden_states logits = self.segmentation_head(encoder_hidden_states[-1]) @@ -580,17 +554,10 @@ def forward( loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) loss = loss_fct(upsampled_logits, labels) - if not return_dict: - if output_hidden_states: - output = (logits,) + outputs[1:] - else: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SemanticSegmenterOutput( loss=loss, logits=logits, - hidden_states=outputs.hidden_states if output_hidden_states else None, + hidden_states=outputs.hidden_states, attentions=None, ) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index 6c43d79d7894..19f8e1bc07eb 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_regnet import RegNetConfig @@ -261,6 +262,7 @@ class RegNetPreTrainedModel(PreTrainedModel): base_model_prefix = "regnet" main_input_name = "pixel_values" _no_split_modules = ["RegNetYLayer"] + _can_record_outputs = {"hidden_states": RegNetStage} @torch.no_grad() def _init_weights(self, module): @@ -294,36 +296,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -347,13 +337,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -361,11 +351,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.regnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -374,10 +365,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 9e4ed8aecfa7..270940568f43 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -28,8 +28,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging -from ...utils.generic import can_return_tuple +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_resnet import ResNetConfig @@ -249,6 +249,7 @@ class ResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["ResNetConvLayer", "ResNetShortCut"] + _can_record_outputs = {"hidden_states": ResNetStage} @torch.no_grad() def _init_weights(self, module): @@ -282,36 +283,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -334,13 +323,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -348,11 +337,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.resnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -361,10 +351,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -393,7 +379,6 @@ def forward( self, pixel_values: Tensor, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BackboneOutput: r""" @@ -422,7 +407,6 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -438,12 +422,6 @@ def forward( if stage in self.out_features: feature_maps += (hidden_states[idx],) - if not return_dict: - output = (feature_maps,) - if output_hidden_states: - output += (outputs.hidden_states,) - return output - return BackboneOutput( feature_maps=feature_maps, hidden_states=outputs.hidden_states if output_hidden_states else None, diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py index e3b0ade23f8f..861c7d327edc 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -303,6 +303,7 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"] + _can_record_outputs = {"hidden_states": RTDetrResNetStage} @torch.no_grad() def _init_weights(self, module):