Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit. Hold Shift + click to select a range.
fc58a7b
Refactor FNet and CVT output tracing
beelapranay Feb 14, 2026
08f02cf
Refactor ResNet output handling to decorators
Feb 15, 2026
1d5aa9c
refactor: tracing
gabrielfruet Feb 15, 2026
a1c15f9
add hooks to deberta_v2
rwtarpit Feb 16, 2026
1361b6d
fix ruff
rwtarpit Feb 16, 2026
c8f5b2a
fix output_attentions arg
rwtarpit Feb 16, 2026
75ff815
fix output_attentions arg
rwtarpit Feb 16, 2026
1c10f8b
Refactor GPT-J to use standardized output tracing (#43979)
jayavelubalaji-ai Feb 17, 2026
e6be956
refactor efficientnet output tracing with @capture_outputs and @can_r…
Siddhartha7340 Feb 17, 2026
ceca4be
Merge branch 'main' into 43979/refactor-gptj-output-tracing
jayavelubalaji-ai Feb 17, 2026
091bc17
Merge branch 'main' into 43979/refactor-gptj-output-tracing
jayavelubalaji-ai Feb 18, 2026
2f9a2db
Fix repository copy checks after ResNet output refactor
pdwi2020 Feb 19, 2026
6e022f6
Fix missing RegNet decorator imports after copy sync
pdwi2020 Feb 19, 2026
411c2c7
Enable hidden-state capture for RegNet outputs
pdwi2020 Feb 19, 2026
138f19d
Merge commit 'refs/pr/44013' into merge-cluster-cluster-43979-11-2026…
evalstate Apr 24, 2026
fad4e1a
Merge commit 'refs/pr/43996' into merge-cluster-cluster-43979-11-2026…
evalstate Apr 24, 2026
85aefb3
Merge commit 'refs/pr/44007' into merge-cluster-cluster-43979-11-2026…
evalstate Apr 24, 2026
db3d5f3
Merge commit 'refs/pr/44044' into merge-cluster-cluster-43979-11-2026…
evalstate Apr 24, 2026
f2126c2
Merge commit 'refs/pr/44066' into merge-cluster-cluster-43979-11-2026…
evalstate Apr 24, 2026
385e015
Merge commit 'refs/pr/44072' into merge-cluster-cluster-43979-11-2026…
evalstate Apr 24, 2026
e1a2c28
Resolve style after output tracing merges
evalstate Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/transformers/models/codegen/modeling_codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,23 +238,22 @@ def forward(
attention_mask: torch.FloatTensor | None = None,
position_ids: torch.LongTensor | None = None,
use_cache: bool | None = False,
output_attentions: bool | None = False,
**kwargs,
) -> tuple[torch.Tensor] | tuple[torch.Tensor, tuple[torch.FloatTensor, ...]] | None:
cache_position: torch.LongTensor | None = None,
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs, attn_weights = self.attn(
attn_outputs, _ = self.attn(
hidden_states=hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=use_cache,
output_attentions=output_attentions,
cache_position=cache_position,
)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = attn_outputs + feed_forward_hidden_states + residual

return hidden_states, attn_weights
return hidden_states


@auto_docstring
Expand Down
58 changes: 23 additions & 35 deletions src/transformers/models/cvt/modeling_cvt.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from ... import initialization as init
from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_cvt import CvtConfig


Expand Down Expand Up @@ -461,23 +461,15 @@ def __init__(self, config):
for stage_idx in range(len(config.depth)):
self.stages.append(CvtStage(config, stage_idx))

def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
all_hidden_states = () if output_hidden_states else None
def forward(self, pixel_values):
hidden_state = pixel_values

cls_token = None
for _, (stage_module) in enumerate(self.stages):
for stage_module in self.stages:
hidden_state, cls_token = stage_module(hidden_state)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)

if not return_dict:
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)

return BaseModelOutputWithCLSToken(
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)


Expand All @@ -491,11 +483,11 @@ class CvtPreTrainedModel(PreTrainedModel):
@torch.no_grad()
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
if isinstance(module, nn.Linear | nn.Conv2d):
init.trunc_normal_(module.weight, mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
init.zeros_(module.bias)
elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
elif isinstance(module, nn.LayerNorm | nn.BatchNorm2d):
init.zeros_(module.bias)
init.ones_(module.weight)
if getattr(module, "running_mean", None) is not None:
Expand All @@ -519,36 +511,42 @@ def __init__(self, config, add_pooling_layer=True):
self.encoder = CvtEncoder(config)
self.post_init()

@can_return_tuple
@auto_docstring
def forward(
self,
pixel_values: torch.Tensor | None = None,
output_hidden_states: bool | None = None,
output_attentions: bool | None = None,
return_dict: bool | None = None,
**kwargs,
) -> tuple | BaseModelOutputWithCLSToken:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
return_dict = return_dict if return_dict is not None else self.config.return_dict

if pixel_values is None:
raise ValueError("You have to specify pixel_values")

encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
# Manually collect hidden states from encoder stages
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
cls_token = None

for stage_module in self.encoder.stages:
hidden_state, cls_token = stage_module(hidden_state)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)

if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)

return BaseModelOutputWithCLSToken(
last_hidden_state=sequence_output,
cls_token_value=encoder_outputs.cls_token_value,
hidden_states=encoder_outputs.hidden_states,
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)


Expand All @@ -573,13 +571,12 @@ def __init__(self, config):
# Initialize weights and apply final processing
self.post_init()

@can_return_tuple
@auto_docstring
def forward(
self,
pixel_values: torch.Tensor | None = None,
labels: torch.Tensor | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
**kwargs,
) -> tuple | ImageClassifierOutputWithNoAttention:
r"""
Expand All @@ -588,12 +585,7 @@ def forward(
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.return_dict
outputs = self.cvt(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
outputs = self.cvt(pixel_values, **kwargs)

sequence_output = outputs[0]
cls_token = outputs[1]
Expand Down Expand Up @@ -631,10 +623,6 @@ def forward(
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)

if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output

return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)


Expand Down
Loading