cortex/model/branch/_conv1d_branch.py (1 addition, 1 deletion)
@@ -91,7 +91,7 @@ def forward(
padding_mask = trunk_outputs.padding_mask

branch_features, branch_mask = self.encoder((trunk_features, padding_mask.to(trunk_features)))
- pooled_features = self.pooling_op(branch_features, branch_mask)
+ pooled_features = self.pooling_op((branch_features, branch_mask))

branch_outputs = Conv1dBranchOutput(
branch_features=branch_features.contiguous(),
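With this change every pooling op in the branch modules is called with a single `(features, mask)` tuple instead of two positional arguments. The sketch below shows a pooling module conforming to that contract; `MaskedMaxPooling` is illustrative only and not part of this PR.

```python
import torch
from torch import Tensor, nn


class MaskedMaxPooling(nn.Module):
    """Illustrative pooling op following the new (features, mask) tuple contract."""

    def forward(self, inputs: tuple[Tensor, Tensor]) -> Tensor:
        x, padding_mask = inputs  # x: (..., seq_len, dim), padding_mask: (..., seq_len), nonzero = valid
        # Exclude padding positions, then reduce over the sequence dimension.
        x = x.masked_fill(~padding_mask.bool()[..., None], float("-inf"))
        return x.max(dim=-2).values


pooled = MaskedMaxPooling()((torch.randn(2, 5, 8), torch.ones(2, 5)))
print(pooled.shape)  # torch.Size([2, 8])
```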
cortex/model/branch/_transformer_branch.py (5 additions, 2 deletions)
@@ -76,7 +76,10 @@ def __init__(
elif pooling_type == "weighted_mean":
self.pooling_op = WeightedMeanPooling(out_dim)
elif pooling_type == "attention":
- self.pooling_op = PoolingSelfAttention(num_heads=num_heads, embed_dim=out_dim, dropout_p=dropout_prob)
+ self.pooling_op = nn.Sequential(
+     Apply(nn.LayerNorm(out_dim, bias=False)),
+     PoolingSelfAttention(num_heads=num_heads, embed_dim=out_dim, dropout_p=dropout_prob),
+ )
else:
raise NotImplementedError

@@ -94,7 +97,7 @@ def forward(
padding_mask = trunk_outputs.padding_mask

branch_features, branch_mask = self.encoder((trunk_features, padding_mask.to(trunk_features)))
- pooled_features = self.pooling_op(branch_features, branch_mask)
+ pooled_features = self.pooling_op((branch_features, branch_mask))

branch_outputs = TransformerBranchOutput(
branch_features=branch_features.contiguous(),
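The tuple calling convention is what makes the new `"attention"` pooling path possible: `nn.Sequential` can only thread a single argument through its children, so wrapping the pre-pooling `nn.LayerNorm` in `Apply` lets the `(features, mask)` pair flow through the whole stack. The sketch below assumes `Apply` simply runs its module on the features and passes the mask through unchanged; the stand-in pooling stage replaces `PoolingSelfAttention` only to keep the example self-contained.

```python
import torch
from torch import Tensor, nn


class Apply(nn.Module):
    """Assumed behavior of cortex's Apply wrapper: transform the features, keep the mask."""

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module

    def forward(self, inputs: tuple[Tensor, Tensor]) -> tuple[Tensor, Tensor]:
        x, mask = inputs
        return self.module(x), mask


class PoolingStub(nn.Module):
    """Stand-in for PoolingSelfAttention: consumes (features, mask), returns pooled features."""

    def forward(self, inputs: tuple[Tensor, Tensor]) -> Tensor:
        x, mask = inputs
        mask = mask.bool()
        return (x * mask[..., None]).sum(-2) / mask.sum(-1, keepdim=True)


out_dim = 64
pooling_op = nn.Sequential(Apply(nn.LayerNorm(out_dim, bias=False)), PoolingStub())

features, mask = torch.randn(2, 10, out_dim), torch.ones(2, 10)
pooled = pooling_op((features, mask))  # one tuple argument threads through the Sequential
print(pooled.shape)  # torch.Size([2, 64])
```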
cortex/model/elemental/_bidirectional_self_attention.py (2 additions)
@@ -8,6 +8,7 @@ def __init__(self, num_heads: int = 4, embed_dim: int = 32, dropout_p: float = 0
raise ValueError("num_heads must evenly divide embed_dim")

self.c_attn = nn.Linear(embed_dim, embed_dim * 3, bias=bias)
+ self.c_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.dropout = nn.Dropout(dropout_p)
self.dropout_p = dropout_p
self.head_dim = embed_dim // num_heads
@@ -35,4 +36,5 @@ def forward(self, inputs: tuple[Tensor, Tensor]) -> tuple[Tensor, Tensor]:
)

res = res.transpose(-2, -3).contiguous().flatten(start_dim=-2)
+ res = self.c_proj(res)
return self.dropout(res), padding_mask
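`c_proj` is the standard output projection found in GPT-style attention blocks: it mixes the concatenated per-head outputs before dropout, which this block previously skipped. The sketch below shows where it sits in the forward pass; the padding-mask handling and the use of `F.scaled_dot_product_attention` are assumptions, since those lines are outside the hunk shown above.

```python
import torch
import torch.nn.functional as F
from torch import Tensor, nn


class BidirectionalSelfAttentionSketch(nn.Module):
    """Sketch of the updated block; mask construction is assumed, not taken from the diff."""

    def __init__(self, num_heads: int = 4, embed_dim: int = 32, dropout_p: float = 0.0, bias: bool = True):
        super().__init__()
        self.c_attn = nn.Linear(embed_dim, embed_dim * 3, bias=bias)
        self.c_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # new output projection
        self.dropout = nn.Dropout(dropout_p)
        self.num_heads, self.head_dim = num_heads, embed_dim // num_heads

    def forward(self, inputs: tuple[Tensor, Tensor]) -> tuple[Tensor, Tensor]:
        x, padding_mask = inputs

        def split_heads(t: Tensor) -> Tensor:
            # (..., seq, dim) -> (..., heads, seq, head_dim)
            return t.unflatten(-1, (self.num_heads, self.head_dim)).transpose(-2, -3)

        q, k, v = map(split_heads, self.c_attn(x).chunk(3, dim=-1))
        attn_mask = padding_mask.bool()[..., None, None, :]  # assumed: block attention to pad keys
        res = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
        # Merge heads, then apply the new output projection before dropout.
        res = res.transpose(-2, -3).contiguous().flatten(start_dim=-2)
        res = self.c_proj(res)
        return self.dropout(res), padding_mask


block = BidirectionalSelfAttentionSketch()
out, mask = block((torch.randn(2, 7, 32), torch.ones(2, 7)))
print(out.shape)  # torch.Size([2, 7, 32])
```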
cortex/model/elemental/_causal_self_attention.py (2 additions)
@@ -8,6 +8,7 @@ def __init__(self, num_heads: int = 4, embed_dim: int = 32, dropout_p: float = 0
raise ValueError("num_heads must evenly divide embed_dim")

self.c_attn = nn.Linear(embed_dim, embed_dim * 3, bias=bias)
+ self.c_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.dropout = nn.Dropout(dropout_p)
self.dropout_p = dropout_p
self.head_dim = embed_dim // num_heads
@@ -32,4 +33,5 @@ def forward(self, inputs: tuple[Tensor, Tensor]) -> tuple[Tensor, Tensor]:
)

res = res.transpose(-2, -3).flatten(start_dim=-2)
+ res = self.c_proj(res)
return self.dropout(res), padding_mask
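The causal module gets the identical addition, so the remaining difference from the bidirectional block is the attention mask (presumably causal masking inside the attention call, which sits outside the shown hunk). Cost-wise, each block gains one `embed_dim x embed_dim` weight matrix plus an optional bias; at the default `embed_dim=32` that is on the order of a thousand parameters per block (the `bias` default is not visible in this hunk):

```python
embed_dim = 32  # default shown in the __init__ signatures above
added_weights = embed_dim * embed_dim             # c_proj weight matrix
added_bias = embed_dim                            # only if bias=True
print(added_weights, added_weights + added_bias)  # 1024 without bias, 1056 with bias
```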
cortex/model/elemental/_mean_pooling.py (4 additions, 2 deletions)
@@ -7,7 +7,8 @@ class MeanPooling(nn.Module):
Average pooling over the sequence dimension excluding padding token positions.
"""

- def forward(self, x, padding_mask):
+ def forward(self, inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+     x, padding_mask = inputs
weights = torch.where(padding_mask.bool(), 0.0, float("-inf"))
weights = weights.softmax(dim=-1).to(x)
pooled_x = (x * weights[..., None]).sum(-2)
@@ -24,7 +25,8 @@ def __init__(self, in_dim):
super().__init__()
self.encoder = nn.Linear(in_dim, in_dim)

- def forward(self, x, padding_mask):
+ def forward(self, inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+     x, padding_mask = inputs
weights = self.encoder(x)
weights = torch.where(padding_mask.bool().unsqueeze(-1), weights, float("-inf"))
weights = weights.softmax(dim=-2).to(x)
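The `0.0` / `-inf` weighting in `MeanPooling` is just a masked uniform average: the softmax assigns `1 / n_valid` to every non-padding position and exactly zero to padding. A quick numerical check of the same arithmetic, written against the new tuple-style inputs:

```python
import torch

x = torch.arange(12.0).reshape(1, 4, 3)      # (batch, seq_len, dim)
padding_mask = torch.tensor([[1, 1, 1, 0]])  # nonzero = valid token, last position is padding

# Same weighting scheme as MeanPooling.forward.
weights = torch.where(padding_mask.bool(), 0.0, float("-inf")).softmax(dim=-1).to(x)
pooled = (x * weights[..., None]).sum(-2)

print(weights)                                        # tensor([[0.3333, 0.3333, 0.3333, 0.0000]])
print(torch.allclose(pooled, x[:, :3].mean(dim=-2)))  # True: a plain mean over the valid positions
```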
cortex/model/elemental/_pooling_self_attention.py (4 additions, 1 deletion)
@@ -8,12 +8,14 @@ def __init__(self, num_heads: int = 4, embed_dim: int = 32, dropout_p: float = 0
raise ValueError("num_heads must evenly divide embed_dim")

self.c_attn = nn.Linear(embed_dim, embed_dim * 3, bias=bias)
+ self.c_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.dropout = nn.Dropout(dropout_p)
self.dropout_p = dropout_p
self.head_dim = embed_dim // num_heads
self.num_heads = num_heads

- def forward(self, x: Tensor, padding_mask: Tensor) -> tuple[Tensor, Tensor]:
+ def forward(self, inputs: tuple[Tensor, Tensor]) -> Tensor:
+     x, padding_mask = inputs
seq_len = x.size(-2)
queries, keys, values = self.c_attn(x).chunk(3, dim=-1)

@@ -38,5 +40,6 @@ def forward(self, x: Tensor, padding_mask: Tensor) -> tuple[Tensor, Tensor]:
)

res = res.transpose(-2, -3).contiguous().flatten(start_dim=-2)
+ res = self.c_proj(res)
res = self.dropout(res)[..., 0, :] # drop 1D query dim
return res
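After this change `PoolingSelfAttention` matches the other pooling ops: one `(features, mask)` tuple in, one pooled tensor out, with the trailing `[..., 0, :]` dropping the single query position so no mask is returned. A shape check under that reading of the diff; the import path is an assumption:

```python
import torch

from cortex.model.elemental import PoolingSelfAttention  # assumed export location

batch, seq_len, embed_dim = 2, 16, 32
features, mask = torch.randn(batch, seq_len, embed_dim), torch.ones(batch, seq_len)

pooling = PoolingSelfAttention(num_heads=4, embed_dim=embed_dim, dropout_p=0.0)
pooled = pooling((features, mask))  # new: single tuple argument
print(pooled.shape)                 # expected: torch.Size([2, 32]), sequence dim pooled away
```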
cortex/model/leaf/_autoregressive_lm_leaf.py (2 additions, 1 deletion)
@@ -38,6 +38,7 @@ def __init__(
*args,
corruption_process: Optional[CorruptionProcess] = None,
corruption_rate: float = 0.1,
+ layernorm: bool = True,
**kwargs,
):
"""
@@ -49,7 +50,7 @@ def __init__(
*args: Additional positional arguments to pass to the parent class
**kwargs: Additional keyword arguments to pass to the parent class
"""
- super().__init__(*args, **kwargs)
+ super().__init__(*args, layernorm=layernorm, **kwargs)
self.corruption_process = corruption_process
self.corruption_rate = corruption_rate

cortex/model/leaf/_classifier_leaf.py (2 additions, 1 deletion)
@@ -75,6 +75,7 @@ def __init__(
last_layer_bias: bool = True,
label_smoothing: Union[float, str] = 0.0,
root_key: Optional[str] = None,
+ layernorm: bool = False,
) -> None:
super().__init__()
self.in_dim = in_dim
@@ -83,7 +84,7 @@ def __init__(
self.root_key = root_key

# testing out normalizing the penultimate activations
- encoder_modules = [nn.LayerNorm(in_dim, bias=False)]
+ encoder_modules = [nn.LayerNorm(in_dim, bias=False)] if layernorm else []
if num_layers >= 1:
for _ in range(num_layers):
encoder_modules.extend(
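For `ClassifierLeaf` the penultimate `nn.LayerNorm` therefore becomes opt-in: with the new default `layernorm=False` the encoder list starts empty and only the hidden layers from the `num_layers` loop are added, whereas the old code always prepended the LayerNorm. A small sketch of the conditional-prefix pattern; the contents of the hidden-layer loop are not shown in the hunk, so `Linear + GELU` below is only a placeholder:

```python
from torch import nn


def build_head(in_dim: int, num_layers: int, layernorm: bool) -> nn.Sequential:
    # Mirrors the changed line: prepend the LayerNorm only when requested.
    encoder_modules = [nn.LayerNorm(in_dim, bias=False)] if layernorm else []
    for _ in range(num_layers):
        encoder_modules.extend([nn.Linear(in_dim, in_dim), nn.GELU()])  # placeholder layer contents
    return nn.Sequential(*encoder_modules)


print(build_head(16, 1, layernorm=True))   # LayerNorm, Linear, GELU
print(build_head(16, 1, layernorm=False))  # Linear, GELU only (new default behavior)
```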
cortex/model/leaf/_denoising_lm_leaf.py (2 additions, 1 deletion)
@@ -38,6 +38,7 @@ def __init__(
*args,
corruption_process: Optional[CorruptionProcess] = None,
corruption_rate: float = 0.1,
+ layernorm: bool = True,
**kwargs,
):
"""
@@ -49,7 +50,7 @@ def __init__(
*args: Additional positional arguments to pass to the parent class
**kwargs: Additional keyword arguments to pass to the parent class
"""
- super().__init__(*args, **kwargs)
+ super().__init__(*args, layernorm=layernorm, **kwargs)
self.corruption_process = corruption_process
self.corruption_rate = corruption_rate

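Both language-model leaves surface the new keyword with `layernorm=True`, so their heads keep the penultimate LayerNorm by default while forwarding the choice to the shared parent constructor; `ClassifierLeaf` itself now defaults to `False`. The hypothetical classes below only illustrate that forwarding pattern; they are not the actual cortex class hierarchy:

```python
from torch import nn


class HeadLeaf(nn.Module):
    """Stand-in for the classifier-style parent that now accepts `layernorm`."""

    def __init__(self, in_dim: int = 8, layernorm: bool = False):
        super().__init__()
        modules = [nn.LayerNorm(in_dim, bias=False)] if layernorm else []
        modules.append(nn.Linear(in_dim, in_dim))
        self.encoder = nn.Sequential(*modules)


class LMLeaf(HeadLeaf):
    """Stand-in for the autoregressive/denoising leaves: same pattern as the diffs above."""

    def __init__(self, *args, layernorm: bool = True, **kwargs):
        super().__init__(*args, layernorm=layernorm, **kwargs)


print(len(LMLeaf().encoder))                 # 2: LayerNorm kept by default
print(len(HeadLeaf().encoder))               # 1: LayerNorm is now opt-in
print(len(LMLeaf(layernorm=False).encoder))  # 1: callers can still disable it
```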