2 changes: 1 addition & 1 deletion colossalai/_analyzer/fx/codegen.py
@@ -138,7 +138,7 @@ def emit_ckpt_func(body,
delete_unused_value_func,
ckpt_level=0,
in_ckpt=False):
"""Emit ckpt fuction in nested way
"""Emit ckpt function in nested way

Args:
body: forward code - in recursive calls, this part will be checkpoint
2 changes: 1 addition & 1 deletion colossalai/auto_parallel/offload/region.py
@@ -111,7 +111,7 @@ def copy_grad_to_region_slice(self, param: torch.nn.Parameter, data_slice: torch
Copy data slice to the memory space indexed by the input tensor in the region.

Args:
- param (torch.nn.Parameter): the param used to retrive meta information
+ param (torch.nn.Parameter): the param used to retrieve meta information
data_slice (torch.Tensor): the tensor to be copied to the region
"""

2 changes: 1 addition & 1 deletion colossalai/auto_parallel/offload/training_simulator.py
@@ -22,7 +22,7 @@ class TrainingSimulator(ABC):

Args:
region_list (List[Region]): represents the linearized DNN computing graph.
- comp_power (float): the NVIDIA GPU FP16 compuing power.
+ comp_power (float): the NVIDIA GPU FP16 computing power.
link_to_bw (Dict[str, Dict[float, float]]): communication links and the corresponding bandwidth.
"""

4 changes: 2 additions & 2 deletions colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -149,7 +149,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh

def _extract_target_dim(node):
'''
- A helper function to etract the target dimension from size node.
+ A helper function to extract the target dimension from size node.
There are two usages of torch.Tensor.size:
1. tensor.size()
2. tensor.size(dim)
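Both call patterns are easy to check in isolation; a minimal, runnable illustration in plain PyTorch:

    import torch

    x = torch.randn(2, 3)
    print(x.size())   # usage 1: tensor.size() -> torch.Size([2, 3]); no target dim
    print(x.size(1))  # usage 2: tensor.size(dim) -> 3; the target dim is explicit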
@@ -427,7 +427,7 @@ def _shard_param(param, target_sharding_spec):
if target_sharding_spec.dim_partition_dict != {}:
origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {})
setattr(param, 'sharding_spec', origin_sharding_spec)
- # TODO: build a ColoParamter class to manager the distributed parameters
+ # TODO: build a ColoParameter class to manager the distributed parameters
# we could use .data here, because all the operations just happen before the real training
# loop, so we don't need to track these operations in the autograd graph.
param = torch.nn.Parameter(
2 changes: 1 addition & 1 deletion colossalai/autochunk/autochunk_codegen.py
@@ -287,7 +287,7 @@ def emit_code_with_chunk(body: List[str],
body = _replace_new_tensor_like_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
# new tensor
body = _replace_new_tensor_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
- # reassgin reshape size
+ # reassign reshape size
body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"])
body[-1] = " " + body[-1]
delete_unused_value_func(node, body, chunk_inputs_names)
2 changes: 1 addition & 1 deletion colossalai/autochunk/estimate_memory.py
@@ -153,7 +153,7 @@ def estimate_chunk_inference_mem(self, node_list: List, chunk_infos: Dict = None

Returns:
act_memory_peak_log (List): peak memory of every node
- act_memory_after_node_log (List): memory after excuting every node
+ act_memory_after_node_log (List): memory after executing every node
active_node_list_log (List): active nodes of every node. active nodes refer to
nodes generated but not deleted.
"""
6 changes: 3 additions & 3 deletions colossalai/autochunk/search_chunk.py
@@ -16,7 +16,7 @@ class SearchChunk(object):
This is the core class for AutoChunk.

It defines the framework of the strategy of AutoChunk.
- Chunks will be selected one by one utill search stops.
+ Chunks will be selected one by one until search stops.

The chunk search is as follows:
1. find the peak memory node
@@ -73,7 +73,7 @@ def _init_trace(self) -> None:

def _find_peak_region(self, mem_peak: List) -> int:
"""
- find peak node, along with its neighbour nodes exceeds max mem
+ find peak node, along with its neighbor nodes exceeds max mem
"""
max_value = max(mem_peak)
max_idx = mem_peak.index(max_value)
@@ -118,7 +118,7 @@ def _search_max_chunk_region(self, active_node: List, peak_region: int, chunk_re
chunk_region_start (int)
chunk_region_end (int)
"""
- # check if peak node already in chunkinfo
+ # check if peak node already in chunk info
if chunk_regions is not None:
for i in chunk_regions:
if i["region"][0] < peak_region[0] <= i["region"][1] or \
2 changes: 1 addition & 1 deletion colossalai/autochunk/trace_flow.py
@@ -479,7 +479,7 @@ def check_region_start_end(self, start_node: Node, start_dim: int, start_idx: in
# check index source align
if not self.check_index_source(start_dim, start_node, start_idx, end_dim, end_node):
return False
- # check index copmute
+ # check index compute
if not self.check_index_compute(start_idx, end_dim, end_node, end_idx):
return False
return True
12 changes: 6 additions & 6 deletions colossalai/autochunk/trace_indice.py
@@ -8,7 +8,7 @@

class TraceIndice(object):
"""
- Trace all indice infomation for every node.
+ Trace all indice information for every node.

Indice is a logical concept. Equal dims can been treated as one indice.
eg. dim(x1) = [a, b, c]
@@ -153,7 +153,7 @@ def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None:

def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None:
"""
- inheirt indice from node without init
+ inherit indice from node without init
"""
if exclude == None:
exclude = []
@@ -301,7 +301,7 @@ def _assign_linear_indice(self, node: Node, node_idx: int) -> None:
def _assign_linear_indice(self, node: Node, node_idx: int) -> None:
"""
Assign indice for linear op.
- 1. copy trace from input node and change last indice accroding to weight
+ 1. copy trace from input node and change last indice according to weight
2. mark equal for input node last indice, weight first dim and bias dim.
3. inherit input's computation, mark computation for last dim.

@@ -360,7 +360,7 @@ def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
"""
Assign indice for matmul op.
- 1. copy trace from matmul_left and change last indice accroding to matmul_right. (assert they have same length)
+ 1. copy trace from matmul_left and change last indice according to matmul_right. (assert they have same length)
2. mark equal for input matmul_left -1 indice and matmul_right -2 dim.
3. inherit matmul_left and matmul_right computation, mark computation for last dim.

@@ -720,11 +720,11 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
Assign indice for view and reshape op.
1. get origin shape and target shape by meta info.
2. compute the real value of -1 in target shape.
- 3. determine changed dim, and assgin indice for generated dim.
+ 3. determine changed dim, and assign indice for generated dim.
4. log changed dim and generated dim for restore
5. inherit computation.
6. look into view list to see whether the view is associated with other,
- if so assgin equal dim according to previous view.
+ if so assign equal dim according to previous view.

Args:
node (node)
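As a side note, step 2 of the view/reshape docstring above (computing the real value of -1 in the target shape) is plain size arithmetic; a stock PyTorch illustration:

    import torch

    x = torch.randn(4, 6)  # origin shape holds 4 * 6 = 24 elements
    y = x.view(2, -1)      # -1 resolves to 24 // 2 = 12
    print(y.shape)         # torch.Size([2, 12])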
2 changes: 1 addition & 1 deletion colossalai/booster/booster.py
@@ -20,7 +20,7 @@
class Booster:
"""
Booster is a high-level API for training neural networks. It provides a unified interface for
- training with different precisio, accelerator, and plugin.
+ training with different precision, accelerator, and plugin.

Examples:
>>> colossalai.launch(...)
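The Booster example in this docstring is truncated in the diff; for orientation, a rough usage sketch (the plugin choice and the boost() return tuple are assumptions from typical usage and may differ by version; model, optimizer, criterion, dataloader, lr_scheduler stand for your usual torch training objects):

    import colossalai
    from colossalai.booster import Booster
    from colossalai.booster.plugin import TorchDDPPlugin

    colossalai.launch_from_torch(config={})
    booster = Booster(plugin=TorchDDPPlugin())
    # boost() wraps the raw training objects for the chosen precision/accelerator/plugin
    model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(
        model, optimizer, criterion, dataloader, lr_scheduler)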
8 changes: 4 additions & 4 deletions colossalai/checkpoint_io/checkpoint_io_base.py
@@ -71,7 +71,7 @@ def load_model(self,

Args:
model (nn.Module): model to be loaded.
- checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the
+ checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the
mainstream model zoos such as Hugging Face and TIMM. The checkpoint path can be:
1. a file path, e.g. 'model.pt'
2. a path to a json file which defines the index to the sharded checkpoint
@@ -127,7 +127,7 @@ def save_model(self,
1. a file path, e.g. 'model.pt'
2. a directory path to save the sharded checkpoint, e.g. './checkpoints/' when shard = True.
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
- multiple files. The model shards will be specificed by a `model.index.json` file. When shard = True, please ensure
+ multiple files. The model shards will be specified by a `model.index.json` file. When shard = True, please ensure
that the checkpoint path is a directory path instead of a file path.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
variant (str): If specified, weights are saved in the format pytorch_model.<variant>.bin. Default: None.
@@ -149,7 +149,7 @@ def load_optimizer(self, optimizer: Optimizer, checkpoint: str):

Args:
optimizer (Optimizer): optimizer to be loaded.
- checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the
+ checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the
"""
index_file_exists, index_file_path = has_index_file(checkpoint)

@@ -180,7 +180,7 @@ def save_optimizer(self,
2. a path to a json file which defines the index to the sharded checkpoint for the optimizer
3. a path to a folder containing a unique .index.json file for sharded checkpoint
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
- multiple files. The optimizer shards will be specificed by a `optimizer.index.json` file.
+ multiple files. The optimizer shards will be specified by a `optimizer.index.json` file.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
prefix (str): prefix for the optimizer checkpoint when shard = True. Default: None.
size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard is set to True.
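To make the shard semantics concrete, a hedged sketch of the save/load round trip described above (using GeneralCheckpointIO as the concrete implementation is an assumption, paths are hypothetical, and the exact keyword set may vary by version):

    from colossalai.checkpoint_io import GeneralCheckpointIO

    ckpt_io = GeneralCheckpointIO()
    # shard=True expects a directory path and writes multiple weight files
    # plus a model.index.json that maps weights to shards
    ckpt_io.save_model(model, './checkpoints/', shard=True, size_per_shard=1024)
    # load accepts a single file, an index json, or a folder containing one
    ckpt_io.load_model(model, './checkpoints/model.index.json')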
4 changes: 2 additions & 2 deletions colossalai/cli/check/check_installation.py
@@ -76,7 +76,7 @@ def check_installation():
click.echo("")
click.echo(f"Note:")
click.echo(
f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment varialbe CUDA_EXT=1 is set"
f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set"
)
click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime")

@@ -88,7 +88,7 @@ def check_installation():
click.echo(f"Note:")
click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment")
click.echo(
f" - PyTorch version mistach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
f" - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
)
click.echo(
f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch"
8 changes: 4 additions & 4 deletions colossalai/communication/p2p.py
@@ -103,10 +103,10 @@ def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = Non
previous rank.
recv_next (bool): boolean for whether tensor should be received from
next rank.
- recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defualts to None.
- recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defualts to None.
- prev_rank (int): the rank of the previous pipeline stage, defualts to None,
- next_rank (int): the rank of the next pipeline stage, defualts to None,
+ recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defaults to None.
+ recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defaults to None.
+ prev_rank (int): the rank of the previous pipeline stage, defaults to None,
+ next_rank (int): the rank of the next pipeline stage, defaults to None,
dtype (torch.dtype): data type of intermediate buffers, defaults to None
scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False

2 changes: 1 addition & 1 deletion colossalai/communication/p2p_v2.py
@@ -230,7 +230,7 @@ def recv_backward(next_rank: int = None) -> Any:
next_rank (int, optional): The rank of the source of the tensor.

Returns:
- Any: The input gradient tensor or gradident tensor list.
+ Any: The input gradient tensor or gradient tensor list.
"""
if gpc.is_pipeline_last_stage():
output_tensor_grad = None
2 changes: 1 addition & 1 deletion colossalai/context/moe_context.py
@@ -64,7 +64,7 @@ def setup(self, seed: int, use_kernel_optim: bool = True):
from colossalai.core import global_context as gpc
self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
assert self.world_size % self.max_ep_size == 0, \
"Maximum epxert parallel size must be a factor of the number of GPUs"
"Maximum expert parallel size must be a factor of the number of GPUs"
self.min_dp_size = self.world_size // self.max_ep_size

# Enabling kernel optimization may raise error in some cases
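The assertion whose message is fixed here encodes a simple divisibility rule; a toy check with hypothetical numbers:

    world_size = 8                    # total number of GPUs
    for max_ep_size in (1, 2, 4, 8):  # each is a factor of 8, so the assert passes
        assert world_size % max_ep_size == 0, \
            "Maximum expert parallel size must be a factor of the number of GPUs"
    # max_ep_size = 3 would trip the assertion: 8 % 3 != 0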
4 changes: 2 additions & 2 deletions colossalai/context/parallel_context.py
@@ -44,7 +44,7 @@ def __init__(self):
# load config from file
self._config = None

- # default 3D parallel args, will be overwritten during process group intialization
+ # default 3D parallel args, will be overwritten during process group initialization
self.world_size = 1
self.data_parallel_size = 1
self.pipeline_parallel_size = 1
@@ -264,7 +264,7 @@ def _add_world_size(self, parallel_mode: ParallelMode, world_size: int):
"""Adds world size for `parallel_mode`.

Args:
- parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode correponding to the process group
+ parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode corresponding to the process group
world_size (int): The world size to be added

Raises:
8 changes: 4 additions & 4 deletions colossalai/context/random/seed_manager.py
@@ -59,23 +59,23 @@ def set_mode(self, parallel_mode: ParallelMode):
self._current_mode = parallel_mode
torch.cuda.set_rng_state(self._seed_states[parallel_mode])

- def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
+ def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.

Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
seed (int): The seed to be added.
- overwrtie (bool, optional): Whether allows to overwrite the seed that has been set already
+ overwrite (bool, optional): Whether allows to overwrite the seed that has been set already

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode`
or the seed for `parallel_mode` has been added.
"""
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
- if overwrtie is False:
+ if overwrite is False:
assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added'
elif parallel_mode in self._seed_states:
print(f"Warnning: {parallel_mode} seed has been overwritten.", flush=True)
print(f"Warning: {parallel_mode} seed has been overwritten.", flush=True)

current_state = torch.cuda.get_rng_state()
torch.cuda.manual_seed(seed)
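Unlike the pure docstring fixes elsewhere in this PR, this hunk renames a public keyword argument, so callers that passed it by name need updating as well; a minimal sketch (assuming SeedManager() takes no constructor arguments and a CUDA device is available):

    import torch
    from colossalai.context import ParallelMode
    from colossalai.context.random.seed_manager import SeedManager

    mgr = SeedManager()
    # the keyword is now `overwrite`; it was previously spelled `overwrtie`
    mgr.add_seed(ParallelMode.DATA, seed=1024, overwrite=True)
    mgr.set_mode(ParallelMode.DATA)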
2 changes: 1 addition & 1 deletion colossalai/fx/codegen/activation_checkpoint_codegen.py
@@ -305,7 +305,7 @@ def emit_ckpt_func(body,
delete_unused_value_func,
level=0,
in_ckpt=False):
"""Emit ckpt fuction in nested way
"""Emit ckpt function in nested way
Args:
body: forward code, in recursive calls, this part will be checkpoint
functions code
4 changes: 2 additions & 2 deletions colossalai/fx/passes/split_module.py
@@ -155,7 +155,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node
use_partition = partitions[use_partition_name]
use_partition.outputs.setdefault(def_node.name)

- # split nodes into parititons
+ # split nodes into partitions
for node in m.graph.nodes:
orig_nodes[node.name] = node

@@ -198,7 +198,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node
if len(sorted_partitions) != len(partitions):
raise RuntimeError("cycle exists between partitions!")

- # add placeholders to parititons
+ # add placeholders to partitions
for partition_name in sorted_partitions:
partition = partitions[partition_name]
for input in partition.inputs:
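The cycle check in the second hunk relies on a standard property of Kahn's algorithm: a topological sort of a DAG emits every node, so a shorter output implies a cycle. A self-contained sketch of that idea (illustrative only, not the actual ColossalAI code):

    from collections import deque

    def topo_sort(deps):
        # deps maps each partition name to the set of partitions it depends on
        indegree = {n: len(d) for n, d in deps.items()}
        users = {n: [m for m, d in deps.items() if n in d] for n in deps}
        queue = deque(n for n, deg in indegree.items() if deg == 0)
        order = []
        while queue:
            n = queue.popleft()
            order.append(n)
            for m in users[n]:
                indegree[m] -= 1
                if indegree[m] == 0:
                    queue.append(m)
        if len(order) != len(deps):
            raise RuntimeError("cycle exists between partitions!")
        return order

    print(topo_sort({"p0": set(), "p1": {"p0"}, "p2": {"p1"}}))  # ['p0', 'p1', 'p2']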
2 changes: 1 addition & 1 deletion colossalai/kernel/cuda_native/multihead_attention.py
@@ -111,7 +111,7 @@ class MultiHeadAttention(nn.Module):
Arguments:
hidden_size: Total dimension of hidden_size.
nhead: Number of parallel attention heads.
- batch_size: Batch Size for one foward
+ batch_size: Batch Size for one forward
max_seq_len: Max length of input sequence
dropout: Dropout probability
norm_first: perform LayerNorms before attention
2 changes: 1 addition & 1 deletion colossalai/nn/_ops/embedding_bag.py
@@ -88,7 +88,7 @@ def colo_embedding_bag(input_tensor: GeneralTensor,
assert isinstance(weight, ColoTensor)
input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group())

- # Handle differen parallel actions.
+ # Handle different parallel actions.

if not weight.has_compute_spec(): # No Model Parallel Applied
assert weight.is_replicate(), 'Invalid weight spec for native embedding op'