hpcaitech · FrankLeeeee · Jun 5, 2023 · May 17, 2023 · May 19, 2023 · May 19, 2023
@@ -195,7 +195,7 @@ class _Linear(nn.Module):
         keep_master_weight_for_test: This was added for testing and should be
                                      set to False. It returns the master weights
                                      used for initialization.
-        skip_bias_add: This was added to enable performance optimations where bias
+        skip_bias_add: This was added to enable performance optimizations where bias
                        can be fused with other elementwise operations. we skip
                        adding bias but instead return it.
     """

@@ -21,7 +21,7 @@ def forward(ctx, vocab_parallel_logits, targets, process_group):
         # Subtract the maximum value.
         vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
 
-        # Get the partition's vocab indecies
+        # Get the partition's vocab indices
         partition_vocab_size = vocab_parallel_logits.size()[-1]
         rank = dist.get_rank(process_group)
         vocab_start_index = partition_vocab_size * rank
@@ -61,10 +61,10 @@ def forward(ctx, vocab_parallel_logits, targets, process_group):
     @custom_bwd
     def backward(ctx, grad_output):
 
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target_1d = ctx.saved_tensors
 
-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         grad_input = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]

@@ -106,7 +106,7 @@ def forward(ctx, logits, targets):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors
 
         # All the inputs have softmax as their gradient.

@@ -100,7 +100,7 @@ def forward(ctx, logits, targets):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors
 
         # All the inputs have softmax as their gradient.

@@ -99,10 +99,10 @@ def forward(ctx, logits, targets, output_parallel_mode):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors
 
-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         input_grad = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]

@@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer):
 
     `CPUAdam` requires CUDA extensions which can be built during installation or runtime.
 
-    This version of CPU Adam accelates parameters updating on CPU with SIMD.
+    This version of CPU Adam accelerates parameters updating on CPU with SIMD.
     Support of AVX2 or AVX512 is required.
 
     The GPU part is implemented in an naive way.

@@ -59,7 +59,7 @@ def step(self, closure=None):
                     continue
                 grad = p.grad.data
                 if grad.is_sparse:
-                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')
+                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
 
                 state = self.state[p]
 

@@ -43,7 +43,7 @@ def __init__(self,
             self.offloader = None
         self.is_on_nvme: Dict[Parameter, bool] = {}
         self.offloaded_numel: int = 0
-        # As param may be not materialized here, these attributes are initalized when the first step
+        # As param may be not materialized here, these attributes are initialized when the first step
         self.total_numel: Optional[int] = None
         self.can_offload_numel: Optional[int] = None
 

@@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
 
     Cached Embedding. Apply a GPU-based software cache approaches to dynamically manage the embedding table in the CPU and GPU memory space.
     It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`.
-    You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
+    You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
 
     Args:
         num_embeddings (int): size of the dictionary of embeddings
         embedding_dim (int):  the size of each embedding vector
         padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
         max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
-        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2..
+        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
         scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False.
         sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max".. Defaults to False.
-        _weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None.
+        _weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in a embedding bag as a single one. Defaults to None.
         mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'.
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
         cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row 
-        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None.
+        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
         pin_weight (bool, optional): pin the cpu weight. Defaults to False.
@@ -145,7 +145,7 @@ def num_write_back_history(self):
     def swap_in_bandwidth(self):
         if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
             return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
-                   self.cache_weight_mgr._cpu_to_cuda_elpase
+                   self.cache_weight_mgr._cpu_to_cuda_elapse
         else:
             return 0
 

@@ -17,7 +17,7 @@ def __init__(self, size: int) -> None:
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
         """copy 
         src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
-        The valid rows in the src tensor are continous, while rows in tgt tensor is scattered.
+        The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
 
         Args:
             dim (int):  dimension along which to index

@@ -114,7 +114,7 @@ def forward(self, indices: torch.Tensor, offsets: torch.Tensor = None, per_sampl
 
         # get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim))
         local_output = torch.cat(local_output_list, 1)
-        # then concatenate those local_output on the second demension.
+        # then concatenate those local_output on the second dimension.
         # use all_to_all
         remains = batch_size % self.world_size
         scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)]