Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion colossalai/nn/layer/parallel_sequence/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ class _Linear(nn.Module):
keep_master_weight_for_test: This was added for testing and should be
set to False. It returns the master weights
used for initialization.
skip_bias_add: This was added to enable performance optimations where bias
skip_bias_add: This was added to enable performance optimizations where bias
can be fused with other elementwise operations. we skip
adding bias but instead return it.
"""
Expand Down
6 changes: 3 additions & 3 deletions colossalai/nn/loss/loss_1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def forward(ctx, vocab_parallel_logits, targets, process_group):
# Subtract the maximum value.
vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))

# Get the partition's vocab indecies
# Get the partition's vocab indices
partition_vocab_size = vocab_parallel_logits.size()[-1]
rank = dist.get_rank(process_group)
vocab_start_index = partition_vocab_size * rank
Expand Down Expand Up @@ -61,10 +61,10 @@ def forward(ctx, vocab_parallel_logits, targets, process_group):
@custom_bwd
def backward(ctx, grad_output):

# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target_1d = ctx.saved_tensors

# All the inputs have softmax as thier gradient.
# All the inputs have softmax as their gradient.
grad_input = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/loss/loss_2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def forward(ctx, logits, targets):
@staticmethod
@custom_bwd
def backward(ctx, output_grad):
# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors

# All the inputs have softmax as their gradient.
Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/loss/loss_2p5d.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def forward(ctx, logits, targets):
@staticmethod
@custom_bwd
def backward(ctx, output_grad):
# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors

# All the inputs have softmax as their gradient.
Expand Down
4 changes: 2 additions & 2 deletions colossalai/nn/loss/loss_3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ def forward(ctx, logits, targets, output_parallel_mode):
@staticmethod
@custom_bwd
def backward(ctx, output_grad):
# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors

# All the inputs have softmax as thier gradient.
# All the inputs have softmax as their gradient.
input_grad = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/cpu_adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer):

`CPUAdam` requires CUDA extensions which can be built during installation or runtime.

This version of CPU Adam accelates parameters updating on CPU with SIMD.
This version of CPU Adam accelerates parameters updating on CPU with SIMD.
Support of AVX2 or AVX512 is required.

The GPU part is implemented in an naive way.
Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/lamb.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def step(self, closure=None):
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')
raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')

state = self.state[p]

Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/nvme_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(self,
self.offloader = None
self.is_on_nvme: Dict[Parameter, bool] = {}
self.offloaded_numel: int = 0
# As param may be not materialized here, these attributes are initalized when the first step
# As param may be not materialized here, these attributes are initialized when the first step
self.total_numel: Optional[int] = None
self.can_offload_numel: Optional[int] = None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag):

Cached Embedding. Apply a GPU-based software cache approaches to dynamically manage the embedding table in the CPU and GPU memory space.
It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`.
You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.

Args:
num_embeddings (int): size of the dictionary of embeddings
embedding_dim (int): the size of each embedding vector
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2..
norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False.
sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max".. Defaults to False.
_weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None.
_weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in a embedding bag as a single one. Defaults to None.
mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'.
include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None.
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
Comment thread
FrankLeeeee marked this conversation as resolved.
buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
pin_weight (bool, optional): pin the cpu weight. Defaults to False.
Expand Down Expand Up @@ -145,7 +145,7 @@ def num_write_back_history(self):
def swap_in_bandwidth(self):
if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
self.cache_weight_mgr._cpu_to_cuda_elpase
self.cache_weight_mgr._cpu_to_cuda_elapse
else:
return 0

Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/parallel/layers/cache_embedding/copyer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(self, size: int) -> None:
def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
"""copy
src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
The valid rows in the src tensor are continous, while rows in tgt tensor is scattered.
The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.

Args:
dim (int): dimension along which to index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def forward(self, indices: torch.Tensor, offsets: torch.Tensor = None, per_sampl

# get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim))
local_output = torch.cat(local_output_list, 1)
# then concatenate those local_output on the second demension.
# then concatenate those local_output on the second dimension.
# use all_to_all
remains = batch_size % self.world_size
scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)]
Expand Down