diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py
index d0fe74dd8714..d73f98d34e98 100644
--- a/src/transformers/models/codegen/modeling_codegen.py
+++ b/src/transformers/models/codegen/modeling_codegen.py
@@ -221,9 +221,7 @@ def forward(self, hidden_states: torch.FloatTensor | None) -> torch.FloatTensor:
         return hidden_states
 
 
-# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->CodeGen
 class CodeGenBlock(GradientCheckpointingLayer):
-    # Ignore copy
     def __init__(self, config, layer_idx=None):
         super().__init__()
         inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
index b30cbd6342dc..6881b67d8e8b 100644
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -31,7 +31,9 @@
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, logging
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.output_capturing import capture_outputs
 from .configuration_deberta_v2 import DebertaV2Config
@@ -269,8 +271,7 @@ def forward(
         )
         new_context_layer_shape = context_layer.size()[:-2] + (-1,)
         context_layer = context_layer.view(new_context_layer_shape)
-        if not output_attentions:
-            return (context_layer, None)
+        return (context_layer, attention_probs)
 
     def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
@@ -428,8 +429,8 @@ def forward(
         relative_pos=None,
         rel_embeddings=None,
         output_attentions: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor | None]:
-        attention_output, att_matrix = self.attention(
+    ) -> torch.Tensor:
+        attention_output, _ = self.attention(
             hidden_states,
             attention_mask,
             output_attentions=output_attentions,
@@ -440,10 +441,7 @@ def forward(
         )
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
-        if output_attentions:
-            return (layer_output, att_matrix)
-        else:
-            return (layer_output, None)
+        return layer_output
 
 
 class ConvLayer(nn.Module):
@@ -631,11 +629,10 @@ def forward(
         self,
         hidden_states,
         attention_mask,
-        output_hidden_states=True,
         output_attentions=False,
         query_states=None,
         relative_pos=None,
-        return_dict=True,
+        **kwargs: Unpack[TransformersKwargs],
     ):
         if attention_mask.dim() <= 2:
             input_mask = attention_mask
@@ -644,13 +641,11 @@ def forward(
         attention_mask = self.get_attention_mask(attention_mask)
         relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
 
-        all_hidden_states: tuple[torch.Tensor] | None = (hidden_states,) if output_hidden_states else None
-        all_attentions = () if output_attentions else None
-
         next_kv = hidden_states
         rel_embeddings = self.get_rel_embedding()
+
         for i, layer_module in enumerate(self.layer):
-            output_states, attn_weights = layer_module(
+            output_states = layer_module(
                 next_kv,
                 attention_mask,
                 query_states=query_states,
@@ -659,15 +654,9 @@ def forward(
                 output_attentions=output_attentions,
             )
 
-            if output_attentions:
-                all_attentions = all_attentions + (attn_weights,)
-
             if i == 0 and self.conv is not None:
                 output_states = self.conv(hidden_states, output_states, input_mask)
 
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (output_states,)
-
             if query_states is not None:
                 query_states = output_states
                 if isinstance(hidden_states, Sequence):
@@ -675,11 +664,7 @@ def forward(
             else:
                 next_kv = output_states
 
-        if not return_dict:
-            return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None)
-        return BaseModelOutput(
-            last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
-        )
+        return BaseModelOutput(last_hidden_state=output_states)
 
 
 @auto_docstring
@@ -688,6 +673,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
     base_model_prefix = "deberta"
     _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
     supports_gradient_checkpointing = True
+    _can_record_outputs = {
+        "hidden_states": DebertaV2Layer,
+        "attentions": DisentangledSelfAttention,
+    }
 
     @torch.no_grad()
     def _init_weights(self, module):
@@ -718,6 +707,7 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, new_embeddings):
         self.embeddings.word_embeddings = new_embeddings
 
+    @capture_outputs
     @auto_docstring
     def forward(
         self,
@@ -726,17 +716,8 @@ def forward(
         token_type_ids: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         inputs_embeds: torch.Tensor | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | BaseModelOutput:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
@@ -765,38 +746,40 @@ def forward(
         encoder_outputs = self.encoder(
             embedding_output,
             attention_mask,
-            output_hidden_states=True,
-            output_attentions=output_attentions,
-            return_dict=return_dict,
+            query_states=None,
+            relative_pos=None,
+            **kwargs,
         )
-        encoded_layers = encoder_outputs[1]
+
+        sequence_output = encoder_outputs.last_hidden_state
 
         if self.z_steps > 1:
-            hidden_states = encoded_layers[-2]
+            if encoder_outputs.hidden_states and len(encoder_outputs.hidden_states) >= 2:
+                hidden_states = encoder_outputs.hidden_states[-2]
+            else:
+                hidden_states = sequence_output
+
             layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
-            query_states = encoded_layers[-1]
+            query_states = sequence_output
             rel_embeddings = self.encoder.get_rel_embedding()
-            attention_mask = self.encoder.get_attention_mask(attention_mask)
+            attention_mask_encoded = self.encoder.get_attention_mask(attention_mask)
             rel_pos = self.encoder.get_rel_pos(embedding_output)
+
             for layer in layers[1:]:
                 query_states = layer(
                     hidden_states,
-                    attention_mask,
-                    output_attentions=False,
+                    attention_mask_encoded,
                     query_states=query_states,
                     relative_pos=rel_pos,
                     rel_embeddings=rel_embeddings,
+                    output_attentions=kwargs.get("output_attentions", False),
                 )
-                encoded_layers.append(query_states)
-
-        sequence_output = encoded_layers[-1]
-
-        if not return_dict:
-            return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :]
+            sequence_output = query_states
 
         return BaseModelOutput(
             last_hidden_state=sequence_output,
-            hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
+            hidden_states=encoder_outputs.hidden_states,
             attentions=encoder_outputs.attentions,
         )
 
 
@@ -921,6 +904,7 @@ def set_output_embeddings(self, new_embeddings):
         self.lm_predictions.lm_head.dense = new_embeddings
         self.lm_predictions.lm_head.bias = new_embeddings.bias
 
+    @can_return_tuple
     @auto_docstring
     # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM.forward with Deberta->DebertaV2
     def forward(
@@ -931,10 +915,7 @@ def forward(
         position_ids: torch.Tensor | None = None,
         inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -943,17 +924,13 @@ def forward(
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
         """
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
         outputs = self.deberta(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            **kwargs,
         )
 
         sequence_output = outputs[0]
@@ -967,10 +944,6 @@ def forward(
             loss_fct = CrossEntropyLoss()  # -100 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
 
-        if not return_dict:
-            output = (prediction_scores,) + outputs[1:]
-            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-
         return MaskedLMOutput(
             loss=masked_lm_loss,
             logits=prediction_scores,
@@ -1033,6 +1006,7 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, new_embeddings):
         self.deberta.set_input_embeddings(new_embeddings)
 
+    @can_return_tuple
     @auto_docstring
     # Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification.forward with Deberta->DebertaV2
     def forward(
@@ -1043,10 +1017,7 @@ def forward(
         position_ids: torch.Tensor | None = None,
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1054,17 +1025,13 @@ def forward(
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
             If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.deberta( input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1107,9 +1074,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions @@ -1130,6 +1094,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -1139,26 +1104,19 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1171,10 +1129,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) @@ -1192,6 +1146,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring # Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward with Deberta->DebertaV2 def forward( @@ -1203,22 +1158,15 @@ def forward( inputs_embeds: torch.Tensor | None = None, start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | QuestionAnsweringModelOutput: - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.deberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) sequence_output = outputs[0] @@ -1245,10 +1193,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + 
outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1283,6 +1227,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.deberta.set_input_embeddings(new_embeddings) + @can_return_tuple @auto_docstring def forward( self, @@ -1292,10 +1237,7 @@ def forward( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1303,7 +1245,6 @@ def forward( num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1322,9 +1263,7 @@ def forward( token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) encoder_layer = outputs[0] @@ -1338,10 +1277,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 5a71c95205c5..0d3fbb24381a 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -26,7 +26,8 @@ ImageClassifierOutputWithNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...utils import auto_docstring, can_return_tuple, logging +from ...utils.output_capturing import capture_outputs from .configuration_efficientnet import EfficientNetConfig @@ -404,26 +405,16 @@ def round_repeats(repeats): def forward( self, hidden_states: torch.FloatTensor, - output_hidden_states: bool | None = False, - return_dict: bool | None = True, ) -> BaseModelOutputWithNoAttention: - all_hidden_states = (hidden_states,) if output_hidden_states else None - for block in self.blocks: hidden_states = block(hidden_states) - if output_hidden_states: - all_hidden_states += (hidden_states,) hidden_states = self.top_conv(hidden_states) hidden_states = self.top_bn(hidden_states) hidden_states = self.top_activation(hidden_states) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention( last_hidden_state=hidden_states, - hidden_states=all_hidden_states, ) @@ -434,6 +425,9 @@ class EfficientNetPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["EfficientNetBlock"] + _can_record_outputs = { + "hidden_states": EfficientNetBlock, + } @torch.no_grad() def _init_weights(self, module: nn.Module): 
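Reviewer note on the pattern above: `_can_record_outputs` plus the `@capture_outputs` decorator replace the old per-layer tuple accumulation (`all_hidden_states += (...)`) with hook-based recording on the listed submodule classes. The sketch below is a minimal, hypothetical approximation of that mechanism using plain PyTorch forward hooks; the helper name `record_submodule_outputs` is illustrative and not a transformers API.

```python
# Minimal sketch (assumption: hook-based recording, mirroring `_can_record_outputs`).
import torch
from torch import nn


def record_submodule_outputs(model: nn.Module, spec: dict[str, type]):
    """Collect the output of every submodule whose class is listed in `spec`.

    Returns (records, handles); call handle.remove() on each handle when done.
    """
    records = {name: [] for name in spec}
    handles = []
    for name, module_cls in spec.items():
        for submodule in model.modules():
            if isinstance(submodule, module_cls):

                def hook(mod, args, output, _name=name):
                    # Keep only the first tensor when a module returns a tuple.
                    records[_name].append(output[0] if isinstance(output, tuple) else output)

                handles.append(submodule.register_forward_hook(hook))
    return records, handles


# Usage sketch:
#   records, handles = record_submodule_outputs(model, {"hidden_states": EfficientNetBlock})
#   model(pixel_values)                      # hooks fire during the forward pass
#   hidden_states = tuple(records["hidden_states"])
#   for h in handles: h.remove()
```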
@@ -467,42 +461,29 @@ def __init__(self, config: EfficientNetConfig):
         # Initialize weights and apply final processing
         self.post_init()
 
+    @capture_outputs
     @auto_docstring
     def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> tuple | BaseModelOutputWithPoolingAndNoAttention:
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
 
         embedding_output = self.embeddings(pixel_values)
-        encoder_outputs = self.encoder(
-            embedding_output,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
+        encoder_outputs = self.encoder(embedding_output)
+        last_hidden_state = encoder_outputs.last_hidden_state
+
         # Apply pooling
-        last_hidden_state = encoder_outputs[0]
         pooled_output = self.pooler(last_hidden_state)
         # Reshape (batch_size, 1280, 1 , 1) -> (batch_size, 1280)
         pooled_output = pooled_output.reshape(pooled_output.shape[:2])
 
-        if not return_dict:
-            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
         return BaseModelOutputWithPoolingAndNoAttention(
             last_hidden_state=last_hidden_state,
             pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
         )
 
 
@@ -525,13 +506,12 @@ def __init__(self, config):
         # Initialize weights and apply final processing
         self.post_init()
 
+    @can_return_tuple
     @auto_docstring
     def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> tuple | ImageClassifierOutputWithNoAttention:
         r"""
@@ -540,11 +520,8 @@ def forward(
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
             If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" - return_dict = return_dict if return_dict is not None else self.config.return_dict - - outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) - - pooled_output = outputs.pooler_output if return_dict else outputs[1] + outputs = self.efficientnet(pixel_values, **kwargs) + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -552,10 +529,6 @@ def forward( if labels is not None: loss = self.loss_function(labels, logits, self.config) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return ImageClassifierOutputWithNoAttention( loss=loss, logits=logits, diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index c92026f2f209..2fb2b5c40e26 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -33,7 +33,10 @@ SequenceClassifierOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, logging +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.generic import merge_with_config_defaults +from ...utils.output_capturing import capture_outputs from .configuration_gptj import GPTJConfig @@ -172,13 +175,9 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, - **kwargs, - ) -> ( - tuple[torch.Tensor, tuple[torch.Tensor]] - | tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]] - | None - ): + cache_position: torch.LongTensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + del cache_position query = self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) @@ -247,13 +246,9 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, - **kwargs, - ) -> ( - tuple[torch.Tensor, tuple[torch.Tensor]] - | tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]] - | None - ): + cache_position: torch.LongTensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + del cache_position query = self.q_proj(hidden_states) key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) @@ -394,23 +389,22 @@ def forward( attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, use_cache: bool | None = False, - output_attentions: bool | None = False, - **kwargs, - ) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None: + cache_position: torch.LongTensor | None = None, + ) -> torch.Tensor: residual = hidden_states hidden_states = self.ln_1(hidden_states) - attn_outputs, attn_weights = self.attn( + attn_outputs, _ = self.attn( hidden_states=hidden_states, layer_past=layer_past, attention_mask=attention_mask, position_ids=position_ids, use_cache=use_cache, - output_attentions=output_attentions, + cache_position=cache_position, ) feed_forward_hidden_states = self.mlp(hidden_states) hidden_states = attn_outputs + feed_forward_hidden_states + residual - return hidden_states, attn_weights + return hidden_states @auto_docstring @@ -422,6 +416,10 @@ class GPTJPreTrainedModel(PreTrainedModel): 
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn = True
     _can_compile_fullgraph = True
+    _can_record_outputs = {
+        "hidden_states": GPTJBlock,
+        "attentions": GPTJAttention,
+    }
 
     def _init_weights(self, module):
         super()._init_weights(module)
@@ -452,6 +450,8 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, new_embeddings):
         self.wte = new_embeddings
 
+    @merge_with_config_defaults
+    @capture_outputs
     @auto_docstring
     def forward(
         self,
@@ -462,34 +462,19 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        **kwargs,
-    ) -> tuple | BaseModelOutputWithPast:
+        cache_position: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
             This is useful if you want more control over how to convert *input_ids* indices into associated vectors
             than the model's internal embedding lookup matrix.
         """
+        del kwargs
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
         if inputs_embeds is None:
             inputs_embeds = self.wte(input_ids)
@@ -497,15 +482,20 @@ def forward(
             past_key_values = DynamicCache(config=self.config)
 
         seq_length = inputs_embeds.shape[1]
-        if position_ids is None:
+        if cache_position is None:
             past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
-            position_ids = torch.arange(seq_length, device=inputs_embeds.device) + past_key_values_length
-            position_ids = position_ids.unsqueeze(0)
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
 
         causal_mask = create_causal_mask(
             config=self.config,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
+            cache_position=cache_position,
             past_key_values=past_key_values,
             position_ids=position_ids,
         )
@@ -520,42 +510,22 @@ def forward(
         hidden_states = self.drop(hidden_states)
         output_shape = (-1, seq_length, hidden_states.size(-1))
 
-        all_self_attentions = () if output_attentions else None
-        all_hidden_states = () if output_hidden_states else None
-        for i, block in enumerate(self.h):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            outputs = block(
+        for block in self.h:
+            hidden_states = block(
                 hidden_states,
                 layer_past=past_key_values,
                 attention_mask=causal_mask,
                 position_ids=position_ids,
                 use_cache=use_cache,
-                output_attentions=output_attentions,
+                cache_position=cache_position,
             )
 
-            hidden_states = outputs[0]
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (outputs[1],)
-
         hidden_states = self.ln_f(hidden_states)
         hidden_states = hidden_states.view(output_shape)
-        # Add last hidden state
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(
-                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
-            )
 
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=past_key_values,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
         )
 
 
@@ -575,6 +545,7 @@ def __init__(self, config):
         # Initialize weights and apply final processing
         self.post_init()
 
+    @can_return_tuple
     @auto_docstring
     def forward(
         self,
@@ -586,12 +557,10 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
-        **kwargs,
-    ) -> tuple | CausalLMOutputWithPast:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
@@ -602,9 +571,7 @@ def forward(
             `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
             are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
         """
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
-        transformer_outputs = self.transformer(
+        outputs: BaseModelOutputWithPast = self.transformer(
             input_ids,
             past_key_values=past_key_values,
             attention_mask=attention_mask,
@@ -612,12 +579,11 @@ def forward(
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            cache_position=cache_position,
+            **kwargs,
         )
-        hidden_states = transformer_outputs[0]
+        hidden_states = outputs.last_hidden_state
 
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
         logits = self.lm_head(hidden_states[:, slice_indices, :])
@@ -626,16 +592,12 @@ def forward(
         if labels is not None:
             loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
 
-        if not return_dict:
-            output = (logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 
 
@@ -663,6 +625,7 @@ def __init__(self, config):
         # Initialize weights and apply final processing
         self.post_init()
 
+    @can_return_tuple
     @auto_docstring
     def forward(
         self,
@@ -674,11 +637,8 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        **kwargs,
-    ) -> tuple | SequenceClassifierOutputWithPast:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> SequenceClassifierOutputWithPast:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
@@ -689,8 +649,6 @@ def forward(
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
             If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" - return_dict = return_dict if return_dict is not None else self.config.return_dict - transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, @@ -699,11 +657,9 @@ def forward( position_ids=position_ids, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - hidden_states = transformer_outputs[0] + hidden_states = transformer_outputs.last_hidden_state logits = self.score(hidden_states) if input_ids is not None: @@ -752,9 +708,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, @@ -776,6 +729,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @auto_docstring def forward( self, @@ -786,31 +740,24 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, - ) -> tuple | QuestionAnsweringModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> QuestionAnsweringModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. 
""" - return_dict = return_dict if return_dict is not None else self.config.return_dict - outputs = self.transformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -834,10 +781,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index a9b8d92cb589..f2a64e2c7548 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -25,6 +25,8 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging +from ...utils.generic import can_return_tuple +from ...utils.output_capturing import capture_outputs from .configuration_mobilenet_v2 import MobileNetV2Config @@ -254,6 +256,7 @@ class MobileNetV2PreTrainedModel(PreTrainedModel): input_modalities = ("image",) supports_gradient_checkpointing = False _no_split_modules = [] + _can_record_outputs = {"hidden_states": MobileNetV2InvertedResidual} @auto_docstring @@ -323,31 +326,20 @@ def __init__(self, config: MobileNetV2Config, add_pooling_layer: bool = True): self.post_init() @auto_docstring + @capture_outputs def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - if pixel_values is None: raise ValueError("You have to specify pixel_values") hidden_states = self.conv_stem(pixel_values) - all_hidden_states = () if output_hidden_states else None - for i, layer_module in enumerate(self.layer): hidden_states = layer_module(hidden_states) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - last_hidden_state = self.conv_1x1(hidden_states) if self.pooler is not None: @@ -355,13 +347,9 @@ def forward( else: pooled_output = None - if not return_dict: - return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None) - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=all_hidden_states, ) @@ -388,12 +376,11 @@ def __init__(self, config: MobileNetV2Config) -> None: self.post_init() @auto_docstring + @can_return_tuple def forward( self, pixel_values: torch.Tensor | None = None, - output_hidden_states: bool | None = None, labels: torch.Tensor | None = None, - return_dict: bool | None = None, **kwargs, ) -> tuple | ImageClassifierOutputWithNoAttention: r""" @@ -402,11 +389,9 @@ def forward( config.num_labels - 1]`. 
             If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
-        outputs = self.mobilenet_v2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+        outputs = self.mobilenet_v2(pixel_values, **kwargs)
 
-        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+        pooled_output = outputs.pooler_output
 
         logits = self.classifier(self.dropout(pooled_output))
 
@@ -414,10 +399,6 @@ def forward(
         if labels is not None:
             loss = self.loss_function(labels, logits, self.config)
 
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
         return ImageClassifierOutputWithNoAttention(
             loss=loss,
             logits=logits,
@@ -517,12 +498,11 @@ def __init__(self, config: MobileNetV2Config) -> None:
         self.post_init()
 
     @auto_docstring
+    @can_return_tuple
     def forward(
         self,
         pixel_values: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> tuple | SemanticSegmenterOutput:
         r"""
@@ -553,21 +533,16 @@ def forward(
         >>> # logits are of shape (batch_size, num_labels, height, width)
         >>> logits = outputs.logits
         ```"""
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
         if labels is not None and self.config.num_labels == 1:
             raise ValueError("The number of labels should be greater than one")
 
         outputs = self.mobilenet_v2(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
-            return_dict=return_dict,
+            **kwargs,
         )
 
-        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
+        encoder_hidden_states = outputs.hidden_states
 
         logits = self.segmentation_head(encoder_hidden_states[-1])
 
@@ -580,17 +555,10 @@ def forward(
             loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
             loss = loss_fct(upsampled_logits, labels)
 
-        if not return_dict:
-            if output_hidden_states:
-                output = (logits,) + outputs[1:]
-            else:
-                output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
         return SemanticSegmenterOutput(
             loss=loss,
             logits=logits,
-            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            hidden_states=outputs.hidden_states,
             attentions=None,
         )
diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py
index 6c43d79d7894..19f8e1bc07eb 100644
--- a/src/transformers/models/regnet/modeling_regnet.py
+++ b/src/transformers/models/regnet/modeling_regnet.py
@@ -26,7 +26,8 @@
     ImageClassifierOutputWithNoAttention,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils.output_capturing import capture_outputs
 from .configuration_regnet import RegNetConfig
@@ -261,6 +262,7 @@ class RegNetPreTrainedModel(PreTrainedModel):
     base_model_prefix = "regnet"
     main_input_name = "pixel_values"
     _no_split_modules = ["RegNetYLayer"]
+    _can_record_outputs = {"hidden_states": RegNetStage}
 
     @torch.no_grad()
     def _init_weights(self, module):
@@ -294,36 +296,24 @@ def __init__(self, config):
         # Initialize weights and apply final processing
         self.post_init()
 
+    @capture_outputs
     @auto_docstring
     def forward(
         self,
         pixel_values: Tensor,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> BaseModelOutputWithPoolingAndNoAttention:
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
         embedding_output = self.embedder(pixel_values)
 
-        encoder_outputs = self.encoder(
-            embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
-        )
+        encoder_outputs = self.encoder(embedding_output, return_dict=True)
 
-        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = encoder_outputs.last_hidden_state
 
         pooled_output = self.pooler(last_hidden_state)
 
-        if not return_dict:
-            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
         return BaseModelOutputWithPoolingAndNoAttention(
             last_hidden_state=last_hidden_state,
             pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
         )
 
 
@@ -347,13 +337,13 @@ def __init__(self, config):
         # initialize weights and apply final processing
         self.post_init()
 
+    @can_return_tuple
     @auto_docstring
     def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> ImageClassifierOutputWithNoAttention:
         r"""
@@ -361,11 +351,12 @@ def forward(
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
-        outputs = self.regnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+        model_kwargs = {}
+        if output_hidden_states is not None:
+            model_kwargs["output_hidden_states"] = output_hidden_states
+        outputs = self.regnet(pixel_values, **model_kwargs)
 
-        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+        pooled_output = outputs.pooler_output
 
         logits = self.classifier(pooled_output)
 
@@ -374,10 +365,6 @@ def forward(
         if labels is not None:
             loss = self.loss_function(labels, logits, self.config)
 
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return (loss,) + output if loss is not None else output
-
         return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py
index 9e4ed8aecfa7..fb93f1944ead 100644
--- a/src/transformers/models/resnet/modeling_resnet.py
+++ b/src/transformers/models/resnet/modeling_resnet.py
@@ -30,6 +30,7 @@
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, logging
 from ...utils.generic import can_return_tuple
+from ...utils.output_capturing import capture_outputs
 from .configuration_resnet import ResNetConfig
@@ -249,6 +250,7 @@ class ResNetPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = ("image",)
     _no_split_modules = ["ResNetConvLayer", "ResNetShortCut"]
+    _can_record_outputs = {"hidden_states": ResNetStage}
 
     @torch.no_grad()
     def _init_weights(self, module):
@@ -282,36 +284,24 @@ def __init__(self, config):
         # Initialize weights and apply final processing
         self.post_init()
 
+    @capture_outputs
     @auto_docstring
     def forward(
         self,
         pixel_values: Tensor,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> BaseModelOutputWithPoolingAndNoAttention:
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
         embedding_output = self.embedder(pixel_values)
 
-        encoder_outputs = self.encoder(
-            embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
-        )
+        encoder_outputs = self.encoder(embedding_output, return_dict=True)
 
-        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = encoder_outputs.last_hidden_state
 
         pooled_output = self.pooler(last_hidden_state)
 
-        if not return_dict:
-            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
         return BaseModelOutputWithPoolingAndNoAttention(
             last_hidden_state=last_hidden_state,
             pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
         )
 
 
@@ -334,13 +324,13 @@ def __init__(self, config):
         # initialize weights and apply final processing
         self.post_init()
 
+    @can_return_tuple
     @auto_docstring
     def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> ImageClassifierOutputWithNoAttention:
         r"""
@@ -348,11 +338,12 @@ def forward(
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
-
-        outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+        model_kwargs = {}
+        if output_hidden_states is not None:
+            model_kwargs["output_hidden_states"] = output_hidden_states
+        outputs = self.resnet(pixel_values, **model_kwargs)
 
-        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+        pooled_output = outputs.pooler_output
 
         logits = self.classifier(pooled_output)
 
@@ -361,10 +352,6 @@ def forward(
         if labels is not None:
             loss = self.loss_function(labels, logits, self.config)
 
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return (loss,) + output if loss is not None else output
-
         return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
 
 
@@ -393,7 +380,6 @@ def forward(
         self,
         pixel_values: Tensor,
         output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         **kwargs,
     ) -> BackboneOutput:
         r"""
@@ -422,7 +408,6 @@ def forward(
         >>> list(feature_maps[-1].shape)
         [1, 2048, 7, 7]
         ```"""
-        return_dict = return_dict if return_dict is not None else self.config.return_dict
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
@@ -438,12 +423,6 @@ def forward(
             if stage in self.out_features:
                 feature_maps += (hidden_states[idx],)
 
-        if not return_dict:
-            output = (feature_maps,)
-            if output_hidden_states:
-                output += (outputs.hidden_states,)
-            return output
-
         return BackboneOutput(
             feature_maps=feature_maps,
             hidden_states=outputs.hidden_states if output_hidden_states else None,
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py
index e3b0ade23f8f..861c7d327edc 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr_resnet.py
@@ -303,6 +303,7 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = ("image",)
     _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"]
+    _can_record_outputs = {"hidden_states": RTDetrResNetStage}
 
     @torch.no_grad()
     def _init_weights(self, module):
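End-to-end sanity check for reviewers: after this refactor the `output_hidden_states` / `output_attentions` flags travel through `**kwargs: Unpack[TransformersKwargs]` and are materialized by the hooks declared in `_can_record_outputs`, while `@can_return_tuple` keeps the legacy `return_dict=False` path available on the task heads. A minimal usage sketch, assuming any DebertaV2-family checkpoint (the name below is only an example, not prescribed by this diff):

```python
# Hedged usage sketch; the checkpoint name is illustrative.
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "microsoft/deberta-v3-base"  # any DebertaV2 checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    # The output_* flags are now plain kwargs; no return_dict plumbing needed.
    outputs = model(**inputs, output_hidden_states=True, output_attentions=True)

print(type(outputs).__name__)       # BaseModelOutput
print(len(outputs.hidden_states))   # one entry per recorded DebertaV2Layer
print(outputs.attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)

# Legacy tuple form remains reachable on the task heads via @can_return_tuple,
# e.g. model_for_task(**inputs, return_dict=False).
```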