diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index d0fe74dd8714..e188c57af32d 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -238,23 +238,22 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, - **kwargs, - ) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None: + cache_position: torch.LongTensor | None = None, + ) -> torch.Tensor: residual = hidden_states hidden_states = self.ln_1(hidden_states) - attn_outputs, attn_weights = self.attn( + attn_outputs, _ = self.attn( hidden_states=hidden_states, layer_past=layer_past, attention_mask=attention_mask, position_ids=position_ids, use_cache=use_cache, - output_attentions=output_attentions, + cache_position=cache_position, ) feed_forward_hidden_states = self.mlp(hidden_states) hidden_states = attn_outputs + feed_forward_hidden_states + residual - return hidden_states, attn_weights + return hidden_states @auto_docstring diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 3ceccb7d1dab..c3009ba0422d 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -23,7 +23,7 @@ from ... import initialization as init from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging from .configuration_cvt import CvtConfig @@ -461,23 +461,15 @@ def __init__(self, config): for stage_idx in range(len(config.depth)): self.stages.append(CvtStage(config, stage_idx)) - def forward(self, pixel_values, output_hidden_states=False, return_dict=True): - all_hidden_states = () if output_hidden_states else None + def forward(self, pixel_values): hidden_state = pixel_values - cls_token = None - for _, (stage_module) in enumerate(self.stages): + for stage_module in self.stages: hidden_state, cls_token = stage_module(hidden_state) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_state,) - - if not return_dict: - return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) return BaseModelOutputWithCLSToken( last_hidden_state=hidden_state, cls_token_value=cls_token, - hidden_states=all_hidden_states, ) @@ -491,11 +483,11 @@ class CvtPreTrainedModel(PreTrainedModel): @torch.no_grad() def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): + if isinstance(module, nn.Linear | nn.Conv2d): init.trunc_normal_(module.weight, mean=0.0, std=self.config.initializer_range) if module.bias is not None: init.zeros_(module.bias) - elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): + elif isinstance(module, nn.LayerNorm | nn.BatchNorm2d): init.zeros_(module.bias) init.ones_(module.weight) if getattr(module, "running_mean", None) is not None: @@ -519,36 +511,42 @@ def __init__(self, config, add_pooling_layer=True): self.encoder = CvtEncoder(config) self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, output_hidden_states: bool | None = None, + output_attentions: bool | None = None, return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithCLSToken: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions return_dict = return_dict if return_dict is not None else self.config.return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") - encoder_outputs = self.encoder( - pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] + # Manually collect hidden states from encoder stages + all_hidden_states = () if output_hidden_states else None + hidden_state = pixel_values + cls_token = None + + for stage_module in self.encoder.stages: + hidden_state, cls_token = stage_module(hidden_state) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) return BaseModelOutputWithCLSToken( - last_hidden_state=sequence_output, - cls_token_value=encoder_outputs.cls_token_value, - hidden_states=encoder_outputs.hidden_states, + last_hidden_state=hidden_state, + cls_token_value=cls_token, + hidden_states=all_hidden_states, ) @@ -573,13 +571,12 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -588,12 +585,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.cvt( - pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + outputs = self.cvt(pixel_values, **kwargs) sequence_output = outputs[0] cls_token = outputs[1] @@ -631,10 +623,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index b30cbd6342dc..c8b33b7021c0 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -31,7 +31,9 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_deberta_v2 import DebertaV2Config @@ -269,8 +271,7 @@ def forward( ) new_context_layer_shape = context_layer.size()[:-2] + (-1,) context_layer = context_layer.view(new_context_layer_shape) - if not output_attentions: - return (context_layer, None) + return (context_layer, attention_probs) def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): @@ -428,8 +429,8 @@ def forward( relative_pos=None, rel_embeddings=None, output_attentions: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - attention_output, att_matrix = self.attention( + ) -> torch.Tensor: + attention_output, _ = self.attention( hidden_states, attention_mask, output_attentions=output_attentions, @@ -440,10 +441,7 @@ def forward( intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - if output_attentions: - return (layer_output, att_matrix) - else: - return (layer_output, None) + return layer_output class ConvLayer(nn.Module): @@ -631,11 +629,10 @@ def forward( self, hidden_states, attention_mask, - output_hidden_states=True, output_attentions=False, query_states=None, relative_pos=None, - return_dict=True, + **kwargs: Unpack[TransformersKwargs], ): if attention_mask.dim() <= 2: input_mask = attention_mask @@ -644,13 +641,11 @@ def forward( attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - all_hidden_states: tuple[torch.Tensor] | None = (hidden_states,) if output_hidden_states else None - all_attentions = () if output_attentions else None - next_kv = hidden_states rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): - output_states, attn_weights = layer_module( + output_states = layer_module( next_kv, attention_mask, query_states=query_states, @@ -659,15 +654,9 @@ def forward( output_attentions=output_attentions, ) - if output_attentions: - all_attentions = all_attentions + (attn_weights,) - if i == 0 and self.conv is not None: output_states = self.conv(hidden_states, output_states, input_mask) - if output_hidden_states: - all_hidden_states = all_hidden_states + (output_states,) - if query_states is not None: query_states = output_states if isinstance(hidden_states, Sequence): @@ -675,11 +664,7 @@ def forward( else: next_kv = output_states - if not return_dict: - return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions - ) + return BaseModelOutput(last_hidden_state=output_states) @auto_docstring @@ -688,12 +673,16 @@ class DebertaV2PreTrainedModel(PreTrainedModel): base_model_prefix = "deberta" _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": DebertaV2Layer, + "attentions": DisentangledSelfAttention, + } @torch.no_grad() def _init_weights(self, module): """Initialize the weights.""" super()._init_weights(module) - if isinstance(module, (LegacyDebertaV2LMPredictionHead, DebertaV2LMPredictionHead)): + if isinstance(module, LegacyDebertaV2LMPredictionHead | DebertaV2LMPredictionHead): init.zeros_(module.bias) elif isinstance(module, DebertaV2Embeddings): init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1))) @@ -718,6 +707,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings + @capture_outputs @auto_docstring def forward( self, @@ -726,17 +716,8 @@ def forward( token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutput: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -765,38 +746,40 @@ def forward( encoder_outputs = self.encoder( embedding_output, attention_mask, - output_hidden_states=True, - output_attentions=output_attentions, - return_dict=return_dict, + query_states=None, + relative_pos=None, + **kwargs, ) - encoded_layers = encoder_outputs[1] + + sequence_output = encoder_outputs.last_hidden_state if self.z_steps > 1: - hidden_states = encoded_layers[-2] + if encoder_outputs.hidden_states and len(encoder_outputs.hidden_states) >= 2: + hidden_states = encoder_outputs.hidden_states[-2] + else: + hidden_states = sequence_output + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] - query_states = encoded_layers[-1] + query_states = sequence_output rel_embeddings = self.encoder.get_rel_embedding() - attention_mask = self.encoder.get_attention_mask(attention_mask) + attention_mask_encoded = self.encoder.get_attention_mask(attention_mask) rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: query_states = layer( hidden_states, - attention_mask, - output_attentions=False, + attention_mask_encoded, query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, + output_attentions=kwargs.get("output_attentions", False), ) - encoded_layers.append(query_states) - - sequence_output = encoded_layers[-1] - if not return_dict: - return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + sequence_output = query_states return BaseModelOutput( last_hidden_state=sequence_output, - hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @@ -921,6 +904,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_predictions.lm_head.dense = new_embeddings self.lm_predictions.lm_head.bias = new_embeddings.bias + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM.forward with Deberta->DebertaV2 def forward( @@ -931,10 +915,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -943,17 +924,13 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -967,10 +944,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1033,6 +1006,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification.forward with Deberta->DebertaV2 def forward( @@ -1043,10 +1017,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1054,7 +1025,6 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1062,9 +1032,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1107,9 +1075,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions @@ -1130,6 +1095,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1139,16 +1105,12 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict outputs = self.deberta( input_ids, @@ -1156,9 +1118,7 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1171,10 +1131,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) @@ -1192,6 +1148,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward with Deberta->DebertaV2 def forward( @@ -1203,22 +1160,15 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1245,10 +1195,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1283,6 +1229,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring def forward( self, @@ -1292,10 +1239,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1303,7 +1247,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1322,9 +1265,7 @@ def forward( token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1338,10 +1279,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 5a71c95205c5..61299787ae4f 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_efficientnet import EfficientNetConfig @@ -404,26 +405,16 @@ def round_repeats(repeats): def forward( self, hidden_states: torch.FloatTensor, - output_hidden_states: bool | None = False, - return_dict: bool | None = True, ) -> BaseModelOutputWithNoAttention: - all_hidden_states = (hidden_states,) if output_hidden_states else None - for block in self.blocks: hidden_states = block(hidden_states) - if output_hidden_states: - all_hidden_states += (hidden_states,) hidden_states = self.top_conv(hidden_states) hidden_states = self.top_bn(hidden_states) hidden_states = self.top_activation(hidden_states) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention( last_hidden_state=hidden_states, - hidden_states=all_hidden_states, ) @@ -434,11 +425,14 @@ class EfficientNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["EfficientNetBlock"] + _can_record_outputs = { + "hidden_states": EfficientNetBlock, + } @torch.no_grad() def _init_weights(self, module: nn.Module): """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + if isinstance(module, nn.Linear | nn.Conv2d | nn.BatchNorm2d): init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) if module.bias is not None: init.zeros_(module.bias) @@ -467,42 +461,29 @@ def __init__(self, config: EfficientNetConfig): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") embedding_output = self.embeddings(pixel_values) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + encoder_outputs = self.encoder(embedding_output) + last_hidden_state = encoder_outputs.last_hidden_state + # Apply pooling - last_hidden_state = encoder_outputs[0] pooled_output = self.pooler(last_hidden_state) # Reshape (batch_size, 1280, 1 , 1) -> (batch_size, 1280) pooled_output = pooled_output.reshape(pooled_output.shape[:2]) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -525,13 +506,12 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -540,11 +520,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict + outputs = self.efficientnet(pixel_values, **kwargs) - outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) - - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -552,10 +530,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index c5b08bd94050..a99ff2b8a5c3 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -21,7 +21,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ... import initialization as init -from ...utils import auto_docstring, is_scipy_available +from ...utils import auto_docstring, can_return_tuple, is_scipy_available +from ...utils.output_capturing import capture_outputs if is_scipy_available(): @@ -265,24 +266,12 @@ def __init__(self, config): self.layer = nn.ModuleList([FNetLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - def forward(self, hidden_states, output_hidden_states=False, return_dict=True): - all_hidden_states = () if output_hidden_states else None - - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - + def forward(self, hidden_states): + for layer_module in self.layer: layer_outputs = layer_module(hidden_states) - hidden_states = layer_outputs[0] - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - - return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + return BaseModelOutput(last_hidden_state=hidden_states) # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->FNet @@ -371,6 +360,9 @@ class FNetPreTrainedModel(PreTrainedModel): config: FNetConfig base_model_prefix = "fnet" supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": FNetLayer, + } def _init_weights(self, module): super()._init_weights(module) @@ -434,6 +426,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple + @capture_outputs @auto_docstring def forward( self, @@ -441,15 +435,8 @@ def forward( token_type_ids: torch.LongTensor | None = None, position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutput: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -487,18 +474,11 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( - embedding_output, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + encoder_outputs = self.encoder(embedding_output) sequence_output = encoder_outputs[0] pooler_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooler_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooler_output, @@ -534,6 +514,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @auto_docstring def forward( self, @@ -543,8 +524,6 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | FNetForPreTrainingOutput: r""" @@ -572,15 +551,12 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output, pooled_output = outputs[:2] @@ -593,10 +569,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return FNetForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -628,6 +600,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @auto_docstring def forward( self, @@ -636,8 +609,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | MaskedLMOutput: r""" @@ -646,15 +617,12 @@ def forward( config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -665,10 +633,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput(loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states) @@ -687,6 +651,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -695,8 +660,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | NextSentencePredictorOutput: r""" @@ -722,16 +685,11 @@ def forward( >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random ```""" - - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) pooled_output = outputs[1] @@ -743,10 +701,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_scores,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, @@ -772,6 +726,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -780,8 +735,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | SequenceClassifierOutput: r""" @@ -790,15 +743,11 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) pooled_output = outputs[1] @@ -827,9 +776,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -846,6 +792,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -854,8 +801,6 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | MultipleChoiceModelOutput: r""" @@ -888,7 +833,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -905,8 +849,7 @@ def forward( token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) pooled_output = outputs[1] @@ -920,10 +863,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput(loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states) @@ -941,6 +880,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -949,23 +889,17 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) sequence_output = outputs[0] @@ -979,10 +913,6 @@ def forward( # Only keep active parts of the loss loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -999,6 +929,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1008,19 +939,14 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.fnet( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1047,10 +973,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states ) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index c92026f2f209..ad5b6d147f8e 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -33,7 +33,9 @@ SequenceClassifierOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.generic import merge_with_config_defaults +from ...utils.output_capturing import capture_outputs from .configuration_gptj import GPTJConfig @@ -172,13 +174,8 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, - **kwargs, - ) -> ( - tuple[torch.Tensor, tuple[torch.Tensor]] - | tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]] - | None - ): + cache_position: torch.LongTensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: query = self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) @@ -247,13 +244,8 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, - **kwargs, - ) -> ( - tuple[torch.Tensor, tuple[torch.Tensor]] - | tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]] - | None - ): + cache_position: torch.LongTensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: query = self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) @@ -394,23 +386,22 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, - **kwargs, - ) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None: + cache_position: torch.LongTensor | None = None, + ) -> torch.Tensor: residual = hidden_states hidden_states = self.ln_1(hidden_states) - attn_outputs, attn_weights = self.attn( + attn_outputs, _ = self.attn( hidden_states=hidden_states, layer_past=layer_past, attention_mask=attention_mask, position_ids=position_ids, use_cache=use_cache, - output_attentions=output_attentions, + cache_position=cache_position, ) feed_forward_hidden_states = self.mlp(hidden_states) hidden_states = attn_outputs + feed_forward_hidden_states + residual - return hidden_states, attn_weights + return hidden_states @auto_docstring @@ -422,6 +413,10 @@ class GPTJPreTrainedModel(PreTrainedModel): _skip_keys_device_placement = "past_key_values" _supports_flash_attn = True _can_compile_fullgraph = True + _can_record_outputs = { + "hidden_states": GPTJBlock, + "attentions": GPTJAttention, + } def _init_weights(self, module): super()._init_weights(module) @@ -452,6 +447,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings + @merge_with_config_defaults + @capture_outputs @auto_docstring def forward( self, @@ -462,34 +459,18 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, + cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.return_dict - if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - if inputs_embeds is None: inputs_embeds = self.wte(input_ids) @@ -520,42 +501,23 @@ def forward( hidden_states = self.drop(hidden_states) output_shape = (-1, seq_length, hidden_states.size(-1)) - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, block in enumerate(self.h): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - outputs = block( + for block in self.h: + hidden_states = block( hidden_states, layer_past=past_key_values, attention_mask=causal_mask, position_ids=position_ids, use_cache=use_cache, - output_attentions=output_attentions, + cache_position=cache_position, ) - hidden_states = outputs[0] - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[1],) - hidden_states = self.ln_f(hidden_states) hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None - ) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=past_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attentions, ) @@ -575,6 +537,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -586,12 +549,10 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, + cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This @@ -602,9 +563,7 @@ def forward( `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - transformer_outputs = self.transformer( + transformer_outputs: BaseModelOutputWithPast = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, @@ -612,12 +571,11 @@ def forward( position_ids=position_ids, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + cache_position=cache_position, + **kwargs, ) - hidden_states = transformer_outputs[0] + hidden_states = transformer_outputs.last_hidden_state # Only compute necessary logits, and do not upcast them to float if we are not computing the loss slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) @@ -626,10 +584,6 @@ def forward( if labels is not None: loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) - if not return_dict: - output = (logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - return CausalLMOutputWithPast( loss=loss, logits=logits, @@ -663,6 +617,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -674,11 +629,8 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, - ) -> tuple | SequenceClassifierOutputWithPast: + ) -> SequenceClassifierOutputWithPast: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This @@ -689,9 +641,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - transformer_outputs = self.transformer( + transformer_outputs: BaseModelOutputWithPast = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, @@ -699,11 +649,9 @@ def forward( position_ids=position_ids, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - hidden_states = transformer_outputs[0] + hidden_states = transformer_outputs.last_hidden_state logits = self.score(hidden_states) if input_ids is not None: @@ -752,9 +700,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, @@ -776,6 +721,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -786,31 +732,24 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.transformer( + outputs: BaseModelOutputWithPast = self.transformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -834,10 +773,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index a9b8d92cb589..fd465e9c2de2 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -25,6 +25,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging +from ...utils.output_manager import can_return_tuple, capture_outputs from .configuration_mobilenet_v2 import MobileNetV2Config @@ -254,6 +255,7 @@ class MobileNetV2PreTrainedModel(PreTrainedModel): input_modalities = ("image",) supports_gradient_checkpointing = False _no_split_modules = [] + _can_record_outputs = {"hidden_states": MobileNetV2InvertedResidual} @auto_docstring @@ -323,31 +325,20 @@ def __init__(self, config: MobileNetV2Config, add_pooling_layer: bool = True): self.post_init() @auto_docstring + @capture_outputs def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") hidden_states = self.conv_stem(pixel_values) - all_hidden_states = () if output_hidden_states else None - for i, layer_module in enumerate(self.layer): hidden_states = layer_module(hidden_states) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - last_hidden_state = self.conv_1x1(hidden_states) if self.pooler is not None: @@ -355,13 +346,9 @@ def forward( else: pooled_output = None - if not return_dict: - return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None) - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=all_hidden_states, ) @@ -388,12 +375,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, labels: torch.Tensor | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -402,11 +388,9 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.mobilenet_v2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.mobilenet_v2(pixel_values, **kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(self.dropout(pooled_output)) @@ -414,10 +398,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, @@ -517,12 +497,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | SemanticSegmenterOutput: r""" @@ -553,21 +532,16 @@ def forward( >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits ```""" - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if labels is not None and self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") outputs = self.mobilenet_v2( pixel_values, output_hidden_states=True, # we need the intermediate hidden states - return_dict=return_dict, + **kwargs, ) - encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + encoder_hidden_states = outputs.hidden_states logits = self.segmentation_head(encoder_hidden_states[-1]) @@ -580,17 +554,10 @@ def forward( loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) loss = loss_fct(upsampled_logits, labels) - if not return_dict: - if output_hidden_states: - output = (logits,) + outputs[1:] - else: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SemanticSegmenterOutput( loss=loss, logits=logits, - hidden_states=outputs.hidden_states if output_hidden_states else None, + hidden_states=outputs.hidden_states, attentions=None, ) diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index 6c43d79d7894..aabed08447e1 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_regnet import RegNetConfig @@ -261,6 +262,7 @@ class RegNetPreTrainedModel(PreTrainedModel): base_model_prefix = "regnet" main_input_name = "pixel_values" _no_split_modules = ["RegNetYLayer"] + _can_record_outputs = {"hidden_states": RegNetStage} @torch.no_grad() def _init_weights(self, module): @@ -273,7 +275,7 @@ def _init_weights(self, module): fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(module.weight) bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 init.uniform_(module.bias, -bound, bound) - elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + elif isinstance(module, nn.BatchNorm2d | nn.GroupNorm): init.constant_(module.weight, 1) init.constant_(module.bias, 0) if getattr(module, "running_mean", None) is not None: @@ -294,36 +296,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -347,13 +337,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -361,11 +351,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.regnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -374,10 +365,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 9e4ed8aecfa7..8a8b8531f6ef 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -20,7 +20,7 @@ from ... import initialization as init from ...activations import ACT2FN -from ...backbone_utils import BackboneMixin, filter_output_hidden_states +from ...backbone_utils import BackboneMixin from ...modeling_outputs import ( BackboneOutput, BaseModelOutputWithNoAttention, @@ -28,8 +28,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging -from ...utils.generic import can_return_tuple +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_resnet import ResNetConfig @@ -249,6 +249,7 @@ class ResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["ResNetConvLayer", "ResNetShortCut"] + _can_record_outputs = {"hidden_states": ResNetStage} @torch.no_grad() def _init_weights(self, module): @@ -282,36 +283,24 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @capture_outputs @auto_docstring def forward( self, pixel_values: Tensor, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - embedding_output = self.embedder(pixel_values) - encoder_outputs = self.encoder( - embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict - ) + encoder_outputs = self.encoder(embedding_output, return_dict=True) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, ) @@ -334,13 +323,13 @@ def __init__(self, config): # initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> ImageClassifierOutputWithNoAttention: r""" @@ -348,11 +337,12 @@ def forward( Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + model_kwargs = {} + if output_hidden_states is not None: + model_kwargs["output_hidden_states"] = output_hidden_states + outputs = self.resnet(pixel_values, **model_kwargs) - pooled_output = outputs.pooler_output if return_dict else outputs[1] + pooled_output = outputs.pooler_output logits = self.classifier(pooled_output) @@ -361,10 +351,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return (loss,) + output if loss is not None else output - return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) @@ -387,13 +373,11 @@ def __init__(self, config): self.post_init() @can_return_tuple - @filter_output_hidden_states @auto_docstring def forward( self, pixel_values: Tensor, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BackboneOutput: r""" @@ -422,7 +406,6 @@ def forward( >>> list(feature_maps[-1].shape) [1, 2048, 7, 7] ```""" - return_dict = return_dict if return_dict is not None else self.config.return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -438,12 +421,6 @@ def forward( if stage in self.out_features: feature_maps += (hidden_states[idx],) - if not return_dict: - output = (feature_maps,) - if output_hidden_states: - output += (outputs.hidden_states,) - return output - return BackboneOutput( feature_maps=feature_maps, hidden_states=outputs.hidden_states if output_hidden_states else None, diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py index e3b0ade23f8f..861c7d327edc 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -303,6 +303,7 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"] + _can_record_outputs = {"hidden_states": RTDetrResNetStage} @torch.no_grad() def _init_weights(self, module):