From 39f1da31dee9ccc7b962f0f0a7e2adafbd0948bb Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Wed, 17 May 2023 13:56:54 +0800
Subject: [PATCH 1/3] fix typo colossalai/autochunk auto_parallel amp

---
 colossalai/amp/torch_amp/_grad_scaler.py                  | 2 +-
 .../auto_parallel/meta_profiler/meta_registry/linear.py   | 2 +-
 colossalai/auto_parallel/passes/runtime_apply_pass.py     | 2 +-
 .../auto_parallel/passes/runtime_preparation_pass.py      | 4 ++--
 colossalai/autochunk/trace_flow.py                        | 6 +++---
 colossalai/autochunk/trace_indice.py                      | 8 ++++----
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/colossalai/amp/torch_amp/_grad_scaler.py b/colossalai/amp/torch_amp/_grad_scaler.py
index 7b78998fb8c2..ed4b8e484436 100644
--- a/colossalai/amp/torch_amp/_grad_scaler.py
+++ b/colossalai/amp/torch_amp/_grad_scaler.py
@@ -240,7 +240,7 @@ def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
                 for grads in per_dtype_grads.values():
                     torch._amp_foreach_non_finite_check_and_unscale_(grads, per_device_found_inf.get(device),
                                                                      per_device_inv_scale.get(device))
-        # For tensor parallel paramters it should be all-reduced over tensor parallel process group
+        # For tensor parallel parameters it should be all-reduced over tensor parallel process group
         if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1:
             vals = [val for val in per_device_found_inf._per_device_tensors.values()]
             coalesced = _flatten_dense_tensors(vals)
diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
index 7697fc6c383d..94dd9143e0ae 100644
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
@@ -325,7 +325,7 @@ def matmul_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
         else:
             _is_batch_dims_same = False
 
-        # retireve dimensions
+        # retrieve dimensions
         input_dim_00 = input_tensors[0].shape[-2]
         input_dim_01 = input_tensors[0].shape[-1]
         input_dim_10 = input_tensors[1].shape[-2]
diff --git a/colossalai/auto_parallel/passes/runtime_apply_pass.py b/colossalai/auto_parallel/passes/runtime_apply_pass.py
index a473bb6e973d..2049a06187d2 100644
--- a/colossalai/auto_parallel/passes/runtime_apply_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_apply_pass.py
@@ -219,7 +219,7 @@ def _comm_spec_apply(gm: torch.fx.GraphModule):
     return gm
 
 
-def _act_annotataion_pass(gm: torch.fx.GraphModule):
+def _act_annotation_pass(gm: torch.fx.GraphModule):
     """
     This pass is used to add the act annotation to the new inserted nodes.
     """
diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index 177f3765f5a0..9a2314826448 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -54,7 +54,7 @@ def size_processing(size: Union[int, torch.Size],
     return size
 
 
-def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int],
+def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int],
                                strategies_constructor: StrategiesConstructor):
     """
     This method is used to stick the solution strategy to the nodes and add the information
@@ -496,7 +496,7 @@ def runtime_preparation_pass(gm: torch.fx.GraphModule,
                              device_mesh: DeviceMesh,
                              strategies_constructor: StrategiesConstructor,
                              overlap=False):
-    gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotatation_pass(
+    gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotation_pass(
         gm, solution, strategies_constructor)
     gm = size_value_converting_pass(gm, device_mesh)
     gm = node_args_converting_pass(gm, device_mesh)
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index db25267e9b42..11a7e62ff37c 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -64,7 +64,7 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx):
             return False
         return True
 
-    def _assgin_single_node_flow(
+    def _assign_single_node_flow(
         self,
         arg_node: Node,
         start_idx: int,
@@ -177,7 +177,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx):
                     if get_node_shape(arg) is None:
                         continue
                     arg_list.append(arg)
-                    flow_flag = self._assgin_single_node_flow(
+                    flow_flag = self._assign_single_node_flow(
                         arg,
                         start_idx,
                         end_idx,
@@ -315,7 +315,7 @@ def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int,
         chunk_info["args"]["prepose_nodes"] = prepose_nodes
 
     def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx):
-        # we need to log input nodes to avoid deleteing them in the loop
+        # we need to log input nodes to avoid deleting them in the loop
         chunk_node_list = self.node_mgr.get_node_slice_by_idx(start_idx, end_idx + 1)
         # also need to get some prepose node's arg out of non_chunk_inputs
         for n in chunk_info["args"]["prepose_nodes"]:
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index d56bf843f18d..8e6cd3e29bea 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -461,7 +461,7 @@ def _assign_elementwise_indice(self, node, idx):
                 nodes_in.append(node_in)
                 self._inherit_more_indice_from_node_with_exclude(node_in, node)
 
-    def _assgin_no_change_indice(self, node, idx):
+    def _assign_no_change_indice(self, node, idx):
         self._assign_indice_as_input(node, idx)
         for node_in in node.args:
             if type(node_in) == type(node):
@@ -792,7 +792,7 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
             self._add_dim(node_idx, i)
         dim_from.reverse()
 
-        # inheirt indice from current node
+        # inherit indice from current node
         if len(dim_from) != 0 and len(dim_to) != 0:
             if dim_diff == 1:
                 if origin_shape[dim_from[0]] == 1:
@@ -852,7 +852,7 @@ def trace_indice(self) -> None:
                 elif "split" == node_name:
                     self._assign_split_indice(node, idx)
                 elif any(i == node_name for i in ["to", "contiguous", "clone", "type", "float"]):
-                    self._assgin_no_change_indice(node, idx)
+                    self._assign_no_change_indice(node, idx)
                 elif "new_ones" == node_name:
                     self._assign_all_indice(node, idx)
                 elif "flatten" == node_name:
@@ -914,7 +914,7 @@ def trace_indice(self) -> None:
                 elif "conv2d" == node_name:
                     self._assign_conv2d_indice(node, idx)
                 elif "identity" == node_name:
-                    self._assgin_no_change_indice(node, idx)
+                    self._assign_no_change_indice(node, idx)
                 elif any(n == node_name for n in ["sigmoid", "dropout", "relu", "silu", "gelu"]):
                     self._assign_elementwise_indice(node, idx)
                 else:

From dce5d3d469b34da1dbd26f949c1eecce3e99dfc2 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Fri, 19 May 2023 14:18:14 +0800
Subject: [PATCH 2/3] fix typo colossalai/auto_parallel nn utils etc.

---
 applications/Chat/coati/dataset/reward_dataset.py    |  2 +-
 .../tensor_shard/node_handler/embedding_handler.py   |  4 ++--
 .../tensor_shard/node_handler/linear_handler.py      |  6 +++---
 .../tensor_shard/node_handler/matmul_handler.py      | 10 +++++-----
 .../tensor_shard/node_handler/node_handler.py        |  2 +-
 .../auto_parallel/tensor_shard/utils/factory.py      |  2 +-
 .../auto_parallel/tensor_shard/utils/reshape.py      | 12 ++++++------
 colossalai/nn/optimizer/cpu_adam.py                  |  2 +-
 colossalai/nn/optimizer/hybrid_adam.py               |  8 ++++----
 .../nn/parallel/layers/cache_embedding/cache_mgr.py  |  6 +++---
 colossalai/utils/common.py                           |  2 +-
 colossalai/utils/tensor_detector/readme.md           | 12 ++++++------
 colossalai/utils/tensor_detector/tensor_detector.py  |  8 ++++----
 colossalai/zero/gemini/chunk/manager.py              | 12 ++++++------
 colossalai/zero/gemini/chunk/search_utils.py         |  2 +-
 colossalai/zero/gemini/memory_tracer/memory_stats.py |  2 +-
 16 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/applications/Chat/coati/dataset/reward_dataset.py b/applications/Chat/coati/dataset/reward_dataset.py
index faa1c94d2728..5dacf7e81464 100644
--- a/applications/Chat/coati/dataset/reward_dataset.py
+++ b/applications/Chat/coati/dataset/reward_dataset.py
@@ -6,7 +6,7 @@
 from .utils import is_rank_0
 
 
-# Dahaos/rm-static
+# Dahoas/rm-static
 class RmStaticDataset(Dataset):
     """
     Dataset for reward model
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py
index e154105b672d..112ee194b4ec 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py
@@ -155,7 +155,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li
         Convert the sharding spec from the logical shape to the physical shape.
         """
         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy,
                                                                                        input_name=str(
@@ -221,7 +221,7 @@ def post_process(self, strategy: ShardingStrategy):
         Convert the sharding spec from the logical shape to the physical shape.
         """
         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy,
                                                                                        input_name=str(
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py
index 59091dab519f..ea541e434009 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py
@@ -23,7 +23,7 @@ def _update_sharding_spec_for_transposed_weight_for_linear(strategy: ShardingStr
                                                            weight_name: str) -> ShardingStrategy:
     """
     This function is a helper function used by both module node handler and function node handler. This function will
-    convert the sharding spec for the transposed weight to the correct partititon spec.
+    convert the sharding spec for the transposed weight to the correct partition spec.
 
     Args:
         strategy (ShardingStrategy): the strategy generated by the strategy generator.
@@ -197,7 +197,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li
         strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name='weight')
 
         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at dim 0 to one of the first few dimensions of the input
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy,
                                                                                     input_name=str(self.node.args[0]),
@@ -267,7 +267,7 @@ def post_process(self, strategy: ShardingStrategy):
         strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy,
                                                                           weight_name=str(self.node.args[1]))
         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at dim 0 to one of the first few dimensions of the input
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy,
                                                                                     input_name=str(self.node.args[0]),
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
index f3c9d0cbf826..bfebc3f59d0c 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py
@@ -48,8 +48,8 @@ def get_matmul_type(input_dim: int, other_dim: int):
     Determine which type of matmul operation should be executed for the given tensor dimensions.
 
     Args:
-        input_dim (int): the number of dimensions for the input tenosr
-        other_dim (int): the number of dimensions for the other tenosr
+        input_dim (int): the number of dimensions for the input tensor
+        other_dim (int): the number of dimensions for the other tensor
     """
     if input_dim == 1 and other_dim == 1:
         matmul_type = MatMulType.DOT
@@ -268,13 +268,13 @@ def _update_sharding_spec(key, strategy, physical_batch_dim):
             dim_partition_dict = sharding_spec.dim_partition_dict
             entire_shape = sharding_spec.entire_shape
 
-            # upddate the dimension index for the matrix dimensions
+            # update the dimension index for the matrix dimensions
             if 2 in dim_partition_dict:
                 dim_partition_dict[len(self.batch_dims_before_view) + 1] = dim_partition_dict.pop(2)
             if 1 in dim_partition_dict:
                 dim_partition_dict[len(self.batch_dims_before_view)] = dim_partition_dict.pop(1)
 
-            # map the logical batch dim to phyiscal batch dim
+            # map the logical batch dim to physical batch dim
             if 0 in dim_partition_dict:
                 batch_dim_shard = dim_partition_dict.pop(0)
                 dim_partition_dict[physical_batch_dim] = batch_dim_shard
@@ -414,7 +414,7 @@ def _get_logical_shape_for_dot(self):
 
     def _get_logical_shape_for_mm(self):
         """
-        We need to handle the input tensor for a matrix-matrix multiplcation as the input
+        We need to handle the input tensor for a matrix-matrix multiplication as the input
         tensor can be a 1D or 2D tensor. If it is a 1D tensor, 1 will be prepended to its shape
         (e.g. [4] -> [1, 4]).
         """
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
index d3d09a9dcf65..4262d76173e4 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
@@ -212,7 +212,7 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
         return self.strategies_vector
 
     def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, List[ShardingStrategy]]:
-        # tranform the strategy generated
+        # transform the strategy generated
         # e.g. to process the sharding strategy for the transposed weights
         return strategy
 
diff --git a/colossalai/auto_parallel/tensor_shard/utils/factory.py b/colossalai/auto_parallel/tensor_shard/utils/factory.py
index 05331e560001..347c10aa102d 100644
--- a/colossalai/auto_parallel/tensor_shard/utils/factory.py
+++ b/colossalai/auto_parallel/tensor_shard/utils/factory.py
@@ -30,7 +30,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
     """
 
     if isinstance(input_, Node):
-        assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data'
+        assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data'
         meta_tensor = input_._meta_data
         assert meta_tensor is not None, "The given node's _meta_data attribute is None"
         shape = meta_tensor.shape
diff --git a/colossalai/auto_parallel/tensor_shard/utils/reshape.py b/colossalai/auto_parallel/tensor_shard/utils/reshape.py
index a32a14bf7d57..d0ebbd7e8b1b 100644
--- a/colossalai/auto_parallel/tensor_shard/utils/reshape.py
+++ b/colossalai/auto_parallel/tensor_shard/utils/reshape.py
@@ -6,12 +6,12 @@
 
 class PreviousStatus(Enum):
     """
-    This class shows the status of previous comparision.
+    This class shows the status of previous comparison.
     """
     RESET = 0
-    # ORIGIN means the dimension size of original tensor is larger in the previous comparision.
+    # ORIGIN means the dimension size of original tensor is larger in the previous comparison.
     ORIGIN = 1
-    # TGT means the dimension size of target tensor is larger in the previous comparision.
+    # TGT means the dimension size of target tensor is larger in the previous comparison.
     TGT = 2
 
 
@@ -91,7 +91,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D
             tgt_index += 1
 
             if previous_label == PreviousStatus.TGT:
-                # if the target dimension size is larger in the previous comparision, which means
+                # if the target dimension size is larger in the previous comparison, which means
                 # the origin dimension size has already accumulated larger than target dimension size, so
                 # we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
                 reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
@@ -111,7 +111,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D
             origin_index += 1
 
             if previous_label == PreviousStatus.ORIGIN:
-                # if the origin element is larger in the previous comparision, which means
+                # if the origin element is larger in the previous comparison, which means
                 # the target element has already accumulated larger than origin element, so
                 # we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
                 reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
@@ -139,7 +139,7 @@ def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
     Rule:
         For a sharded dimension of input tensor, if it is not the minimum element of the input tuple,
         the function will return false.
-        To illustrate this issue, there are two cases to analyse:
+        To illustrate this issue, there are two cases to analyze:
         1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal
         operation without distributed tensor.
         2. sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape
diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 54036973e1e3..bb561a106515 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -13,7 +13,7 @@
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.
 
-    Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
       * Parameters on CPU and gradients on CPU is allowed.
       * Parameters on GPU and gradients on GPU is allowed.
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 1d0fb92de499..be6311c6c29f 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -13,19 +13,19 @@
 class HybridAdam(NVMeOptimizer):
     """Implements Adam algorithm.
 
-    Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
       * Parameters on CPU and gradients on CPU is allowed.
       * Parameters on GPU and gradients on GPU is allowed.
       * Parameters on GPU and gradients on CPU is **not** allowed.
 
-    `HybriadAdam` requires CUDA extensions which can be built during installation or runtime.
+    `HybridAdam` requires CUDA extensions which can be built during installation or runtime.
 
     This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.
 
     * For parameters updating on CPU, it uses CPUAdam.
     * For parameters updating on GPU, it uses FusedAdam.
-    * Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.
+    * Hybrid precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.
 
     :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
     or ``torch.optim.Adam`` with ``adamw_mode=False``
@@ -131,7 +131,7 @@ def step(self, closure=None, div_scale: float = -1):
                     assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda"
                     assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda"
 
-                    # record the state by gruop and update at once
+                    # record the state by group and update at once
                     g_l.append(p.grad.data)
                     p_l.append(p.data)
                     m_l.append(state['exp_avg'])
diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py
index da043df368ae..a6159856dcce 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py
+++ b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py
@@ -20,8 +20,8 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
         return
     torch.cuda.current_stream().wait_stream(stream)
     # As mentioned in https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html,
-    # PyTorch uses the "caching allocator" for memroy allocation for tensors. When a tensor is
-    # freed, its memory is likely to be reused by newly constructed tenosrs.  By default,
+    # PyTorch uses the "caching allocator" for memory allocation for tensors. When a tensor is
+    # freed, its memory is likely to be reused by newly constructed tensors.  By default,
     # this allocator traces whether a tensor is still in use by only the CUDA stream where it
     # was created.   When a tensor is used by additional CUDA streams, we need to call record_stream
     # to tell the allocator about all these streams.  Otherwise, the allocator might free the
@@ -294,7 +294,7 @@ def print_comm_stats(self):
             print(
                 f"CPU->CUDA BWD {self._cpu_to_cuda_numel * self.elem_size_in_byte / 1e6 / elapsed} MB/s {self._cpu_to_cuda_numel / 1e6} M elem"
             )
-            print(f'cpu_to_cuda_elpase {elapsed} sec')
+            print(f'cpu_to_cuda_elapse {elapsed} sec')
 
         for k, v in self._elapsed_dict.items():
             print(f'{k}: {v}')
diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index 95b3b8014af1..8022e84dc24b 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -324,7 +324,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
     norm_type = float(norm_type)
 
     # Parameters can be on CPU or CUDA
-    # If parameters are on CPU, disable CUDA kernerls
+    # If parameters are on CPU, disable CUDA kernels
 
     # Calculate norm.
     if norm_type == inf:
diff --git a/colossalai/utils/tensor_detector/readme.md b/colossalai/utils/tensor_detector/readme.md
index 840dc8f4eca6..d6852ea55b54 100644
--- a/colossalai/utils/tensor_detector/readme.md
+++ b/colossalai/utils/tensor_detector/readme.md
@@ -46,7 +46,7 @@ detector.detect()
 
 I have made some comments on the right of the output for your understanding.
 
-Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memery Allocated`.  PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.
+Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memory Allocated`.  PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.
 
 **The order of print is not equal to the order the tensor creates, but they are really close.**
 
@@ -61,7 +61,7 @@ Note that the total `Mem` of all the tensors and parameters is not equal to `Tot
 +  mlp.2.bias                        cuda:0               (32,)      True       torch.float32          128 B
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 27
-Totle GPU Memery Allocated on cuda:0 is 4.5 KB
+Total GPU Memory Allocated on cuda:0 is 4.5 KB
 ------------------------------------------------------------------------------------------------------------
 
 
@@ -72,7 +72,7 @@ Totle GPU Memery Allocated on cuda:0 is 4.5 KB
 +  Tensor                            cuda:0               (32,)      True       torch.float32          128 B    # output
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 30
-Totle GPU Memery Allocated on cuda:0 is 5.5 KB
+Total GPU Memory Allocated on cuda:0 is 5.5 KB
 ------------------------------------------------------------------------------------------------------------
 
 
@@ -82,7 +82,7 @@ Totle GPU Memery Allocated on cuda:0 is 5.5 KB
 +  Tensor                            cuda:0                  ()      True       torch.float32            4 B    # loss
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 32
-Totle GPU Memery Allocated on cuda:0 is 6.0 KB
+Total GPU Memory Allocated on cuda:0 is 6.0 KB
 ------------------------------------------------------------------------------------------------------------
 
 
@@ -103,7 +103,7 @@ Totle GPU Memery Allocated on cuda:0 is 6.0 KB
 -  Tensor                            cuda:0                (8,)      True       torch.float32           32 B    # deleted activation
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 34
-Totle GPU Memery Allocated on cuda:0 is 10.0 KB
+Total GPU Memory Allocated on cuda:0 is 10.0 KB
 ------------------------------------------------------------------------------------------------------------
 
 
@@ -117,7 +117,7 @@ Totle GPU Memery Allocated on cuda:0 is 10.0 KB
 +  Tensor                            cuda:0               (32,)     False       torch.float32          128 B
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 36
-Totle GPU Memery Allocated on cuda:0 is 14.0 KB
+Total GPU Memory Allocated on cuda:0 is 14.0 KB
 ------------------------------------------------------------------------------------------------------------
 ```
 
diff --git a/colossalai/utils/tensor_detector/tensor_detector.py b/colossalai/utils/tensor_detector/tensor_detector.py
index a8186f76834c..cfcd4e47b4cb 100644
--- a/colossalai/utils/tensor_detector/tensor_detector.py
+++ b/colossalai/utils/tensor_detector/tensor_detector.py
@@ -55,7 +55,7 @@ def get_tensor_mem(self, tensor):
         return self.mem_format(memory_size)
 
     def mem_format(self, real_memory_size):
-        # format the tensor memory into a reasonal magnitude
+        # format the tensor memory into a reasonable magnitude
         if real_memory_size >= 2**30:
             return str(real_memory_size / (2**30)) + ' GB'
         if real_memory_size >= 2**20:
@@ -71,7 +71,7 @@ def collect_tensors_state(self):
                 if (not self.include_cpu) and obj.device == torch.device('cpu'):
                     continue
                 self.detected.append(id(obj))
-                # skip paramters we had added in __init__ when module is an instance of nn.Module for the first epoch
+                # skip parameters we had added in __init__ when module is an instance of nn.Module for the first epoch
                 if id(obj) not in self.tensor_info:
 
                     name = type(obj).__name__
@@ -84,7 +84,7 @@ def collect_tensors_state(self):
                                     name = par_name + ' (with grad)'
                         else:
                             # with no grad attached
-                            # there will be no new paramters created during running
+                            # there will be no new parameters created during running
                             # so it must be in saved_tensor_info
                             continue
                     # we can also marked common tensors as tensor(with grad)
@@ -155,7 +155,7 @@ def print_tensors_state(self):
             if device == torch.device('cpu'):
                 continue
             gpu_mem_alloc = self.mem_format(torch.cuda.memory_allocated(device))
-            self.info += f"Totle GPU Memery Allocated on {device} is {gpu_mem_alloc}\n"
+            self.info += f"Total GPU Memory Allocated on {device} is {gpu_mem_alloc}\n"
         self.info += LINE
         self.info += '\n\n'
         if self.show_info:
diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py
index d85df0b00476..77368d06d255 100644
--- a/colossalai/zero/gemini/chunk/manager.py
+++ b/colossalai/zero/gemini/chunk/manager.py
@@ -102,7 +102,7 @@ def access_chunk(self, chunk: Chunk) -> None:
         """
         if chunk in self.accessed_chunks:
             return
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         if chunk.device_type == 'cpu':
             chunk.shard_move(get_current_device())
         self.__add_accessed_chunk(chunk)
@@ -114,7 +114,7 @@ def release_chunk(self, chunk: Chunk) -> None:
         if chunk not in self.accessed_chunks:
             return
         if chunk.can_release:
-            self.__sub_memroy_usage(chunk.memory_usage)
+            self.__sub_memory_usage(chunk.memory_usage)
             self.__sub_accessed_chunk(chunk)
             self.__add_memory_usage(chunk.memory_usage)
 
@@ -123,7 +123,7 @@ def move_chunk(self, chunk: Chunk, device: torch.device, force_copy: bool = Fals
         """
         if not chunk.can_move or chunk.device_type == device.type:
             return
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.shard_move(device, force_copy)
         self.__add_memory_usage(chunk.memory_usage)
 
@@ -138,7 +138,7 @@ def reduce_chunk(self, chunk: Chunk) -> bool:
         """
         if not chunk.can_reduce:
             return False
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.reduce()
         self.__sub_accessed_chunk(chunk)
         self.__add_memory_usage(chunk.memory_usage)
@@ -228,11 +228,11 @@ def __get_chunk_group(self, group_name: str) -> Deque:
         return self.chunk_groups[group_name]
 
     def __close_one_chunk(self, chunk: Chunk):
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.close_chunk()
         self.__add_memory_usage(chunk.memory_usage)
 
-    def __sub_memroy_usage(self, usage: Dict[str, int]):
+    def __sub_memory_usage(self, usage: Dict[str, int]):
         for k, v in usage.items():
             self.total_mem[k] -= v
 
diff --git a/colossalai/zero/gemini/chunk/search_utils.py b/colossalai/zero/gemini/chunk/search_utils.py
index da58e038c879..881ceb0b3b97 100644
--- a/colossalai/zero/gemini/chunk/search_utils.py
+++ b/colossalai/zero/gemini/chunk/search_utils.py
@@ -85,7 +85,7 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
     Classify the parameters by their dp degree
 
     Args:
-        param_order (OrderedParamGenerator): the order of param be visied
+        param_order (OrderedParamGenerator): the order in which the params are visited
         strict_ddp_flag (bool, optional): whether to enable the strict ddp mode. Defaults to False.
 
     Returns:
diff --git a/colossalai/zero/gemini/memory_tracer/memory_stats.py b/colossalai/zero/gemini/memory_tracer/memory_stats.py
index 9a45034ee27e..41d7e5754e96 100644
--- a/colossalai/zero/gemini/memory_tracer/memory_stats.py
+++ b/colossalai/zero/gemini/memory_tracer/memory_stats.py
@@ -59,7 +59,7 @@ def increase_preop_step(self, param_list: List[torch.nn.Parameter]):
         time step.
 
         Args:
-            param_list (List[torch.nn.Parameter]): a list of torch paramters.
+            param_list (List[torch.nn.Parameter]): a list of torch parameters.
         """
         for p in param_list:
             if p not in self._param_step_dict:

From 86ad586f425dc994a358d426c62482fbfe2be19b Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Tue, 23 May 2023 15:56:31 +0800
Subject: [PATCH 3/3] fix typo colossalai/auto_parallel autochunk fx/passes 
 etc.

---
 .github/workflows/README.md                               | 4 ++--
 colossalai/auto_parallel/passes/meta_info_prop.py         | 2 +-
 .../node_handler/strategy/batch_norm_generator.py         | 2 +-
 .../node_handler/strategy/conv_strategy_generator.py      | 4 ++--
 .../node_handler/strategy/layer_norm_generator.py         | 4 ++--
 .../node_handler/strategy/normal_pooling_generator.py     | 6 +++---
 colossalai/autochunk/trace_flow.py                        | 8 ++++----
 colossalai/autochunk/trace_indice.py                      | 4 ++--
 colossalai/booster/plugin/gemini_plugin.py                | 2 +-
 colossalai/cluster/dist_coordinator.py                    | 2 +-
 colossalai/device/alpha_beta_profiler.py                  | 2 +-
 colossalai/engine/schedule/_pipeline_schedule.py          | 4 ++--
 colossalai/engine/schedule/_pipeline_schedule_v2.py       | 2 +-
 colossalai/fx/codegen/activation_checkpoint_codegen.py    | 2 +-
 colossalai/fx/passes/adding_split_node_pass.py            | 2 +-
 .../passes/experimental/adding_shape_consistency_pass.py  | 2 +-
 colossalai/fx/passes/meta_info_prop.py                    | 2 +-
 colossalai/fx/passes/passes_for_gpt2_test.py              | 4 ++--
 colossalai/fx/passes/split_module.py                      | 4 ++--
 19 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 8fc14e0d531a..f40f4cc86d1b 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -14,7 +14,7 @@
       - [Compatibility Test on Dispatch](#compatibility-test-on-dispatch)
     - [Release](#release)
     - [User Friendliness](#user-friendliness)
-    - [Commmunity](#commmunity)
+    - [Community](#community)
   - [Configuration](#configuration)
   - [Progress Log](#progress-log)
 
@@ -97,7 +97,7 @@ This workflow is triggered by manually dispatching the workflow. It has the foll
 | `Synchronize submodule` | `submodule.yml`         | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers.                 |
 | `Close inactive issues` | `close_inactive.yml`    | This workflow will close issues which are stale for 14 days.                                                                           |
 
-### Commmunity
+### Community
 
 | Workflow Name                                | File name                        | Description                                                                      |
 | -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- |
diff --git a/colossalai/auto_parallel/passes/meta_info_prop.py b/colossalai/auto_parallel/passes/meta_info_prop.py
index bc0960483980..0673b767de7b 100644
--- a/colossalai/auto_parallel/passes/meta_info_prop.py
+++ b/colossalai/auto_parallel/passes/meta_info_prop.py
@@ -148,7 +148,7 @@ def node_handler(self, node: Node) -> None:
         graph_info.fwd_tmp = buffer_tensors
         graph_info.fwd_out = output_tensors
 
-        # fetch other memory informations
+        # fetch other memory information
         memory_cost = meta_info.memory_cost
         graph_info.fwd_mem_tmp = memory_cost.fwd.temp
         graph_info.fwd_mem_out = memory_cost.fwd.activation
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
index 79b69acb25b3..416dc9c29cad 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
@@ -44,7 +44,7 @@ def update_compute_cost(self, strategy: ShardingStrategy):
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
         # TODO: a constant coefficient need to be added.
         # 1D: (L) * N * Cin
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py
index c2154b3104d3..e605a68a326b 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py
@@ -38,9 +38,9 @@ def update_compute_cost(self, strategy: ShardingStrategy):
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         # 1D: (L) * N * Cout * Cin * kernel
         # 2D: (H * W) * N * Cout * Cin * kernel
         # 3D: (H * W  * D) * N * Cout * Cin * kernel
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py
index fbb6070f7e82..65b173bbf65d 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py
@@ -34,9 +34,9 @@ def update_compute_cost(self, strategy: ShardingStrategy):
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         # TODO: a constant coefficient need to be added.
 
         sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device()
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py
index 9df6d2fbfa12..b7db42f8f67e 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py
@@ -17,7 +17,7 @@ class NormalPoolStrategyGenerator(StrategyGenerator):
     """
     NormalPoolStrategyGenerator is a generic class to generate strategies for pool operation like MaxPoolxd.
     The reason we call this normal pool is AvgPoolxd and MaxPoolxd are taking the kernel size element from image,
-    and reduce them depening on the operation type.
+    and reduce them depending on the operation type.
     """
 
     def validate(self) -> bool:
@@ -35,9 +35,9 @@ def update_compute_cost(self, strategy: ShardingStrategy) -> TrainCycleItem:
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         # 1D: (Lout) * N * C * kernel
         # 2D: (H * W) * N * Cout * Cin * kernel
         # 3D: (H * W  * D) * N * Cout * Cin * kernel
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 11a7e62ff37c..a1080fda1541 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -366,8 +366,8 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim):
         # find non chunk inputs
         chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)
 
-        # reassgin reshape size, some size may have changed due to chunk
-        chunk_info = self._reassgin_reshape_size(chunk_info)
+        # reassign reshape size, some size may have changed due to chunk
+        chunk_info = self._reassign_reshape_size(chunk_info)
 
         return chunk_info
 
@@ -428,10 +428,10 @@ def _update_chunk_info(self, chunk_info: Dict, new_all_node_info: Dict, output:
         chunk_info["outputs_dim"].append(output_dim)
         return True
 
-    def _reassgin_reshape_size(self, chunk_info):
+    def _reassign_reshape_size(self, chunk_info):
         """
         Some shape args in reshape may have changed due to chunk
-        reassgin those changed shape
+        reassign those changed shape
         """
         chunk_region = chunk_info["region"]
         reshape_size = {}
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 8e6cd3e29bea..fbe0741b8827 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -397,7 +397,7 @@ def _assign_conv2d_indice(self, node: Node, node_idx: int) -> None:
         input_node = node.args[0]
         assert len(get_node_shape(input_node)) == 4
 
-        # assgin index
+        # assign index
         self._assign_indice_as_input(node, node_idx, input_node)
         self._del_dim(node_idx, 1)
         self._add_dim(node_idx, 1)
@@ -415,7 +415,7 @@ def _assign_interpolate_indice(self, node: Node, node_idx: int) -> None:
         assert node.kwargs['size'] is None
         assert len(get_node_shape(node)) == 4
 
-        # assgin index
+        # assign index
         self._assign_indice_as_input(node, node_idx)
         self._mark_computation(node, node_idx, [-1, -2])
 
diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index bb3124642ccf..adbf4803eefe 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -179,7 +179,7 @@ class GeminiPlugin(DPPluginBase):
             Users can provide this argument to speed up searching.
             If users do not know this argument before training, it is ok. We will use a default value 1024.
         min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
-            If the aggregate size of parameters is still samller than the minimum chunk size,
+            If the aggregate size of parameters is still smaller than the minimum chunk size,
             all parameters will be compacted into one small chunk.
         memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
         gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
diff --git a/colossalai/cluster/dist_coordinator.py b/colossalai/cluster/dist_coordinator.py
index 99dde810e112..3ee364ec3364 100644
--- a/colossalai/cluster/dist_coordinator.py
+++ b/colossalai/cluster/dist_coordinator.py
@@ -181,7 +181,7 @@ def on_master_only(self, process_group: ProcessGroup = None):
         """
         is_master = self.is_master(process_group)
 
-        # define an inner functiuon
+        # define an inner function
         def decorator(func):
 
             @functools.wraps(func)
diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py
index af2b10928c6f..f8b20de9bc37 100644
--- a/colossalai/device/alpha_beta_profiler.py
+++ b/colossalai/device/alpha_beta_profiler.py
@@ -381,7 +381,7 @@ def _extract_alpha_beta(pg, pg_handler):
         first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group)
         second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group)
         mesh_alpha = [first_latency, second_latency]
-        # The beta values have been enlarged by 1e10 times temporarilly because the computation cost
+        # The beta values have been enlarged by 1e10 times temporarily because the computation cost
         # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in future.
         mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth]
 
diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py
index 38175fe0941c..9fc301a26559 100644
--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@@ -152,9 +152,9 @@ def _get_data_slice(self, data, offset):
             raise TypeError(f"Expected data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}")
 
     def load_micro_batch(self):
-        mciro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
+        micro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
         self.microbatch_offset += self.microbatch_size
-        return self._move_to_device(mciro_batch_data)
+        return self._move_to_device(micro_batch_data)
 
     def pre_processing(self, engine):
         from colossalai.zero.legacy import ShardedModelV2
diff --git a/colossalai/engine/schedule/_pipeline_schedule_v2.py b/colossalai/engine/schedule/_pipeline_schedule_v2.py
index 28c58bd82b5c..89e45c7aacec 100644
--- a/colossalai/engine/schedule/_pipeline_schedule_v2.py
+++ b/colossalai/engine/schedule/_pipeline_schedule_v2.py
@@ -84,7 +84,7 @@ def forward_backward_step(self,
             'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
         self.load_batch(data_iter)
 
-        # num_warmup_microbatches is the step when not all the processers are working
+        # num_warmup_microbatches is the step when not all the processes are working
         num_warmup_microbatches = \
             (gpc.get_world_size(ParallelMode.PIPELINE)
              - gpc.get_local_rank(ParallelMode.PIPELINE) - 1)
diff --git a/colossalai/fx/codegen/activation_checkpoint_codegen.py b/colossalai/fx/codegen/activation_checkpoint_codegen.py
index 5a72cb9ca923..33b164800262 100644
--- a/colossalai/fx/codegen/activation_checkpoint_codegen.py
+++ b/colossalai/fx/codegen/activation_checkpoint_codegen.py
@@ -523,7 +523,7 @@ def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func,
     # append code text to body
     for idx, node in enumerate(node_list):
         # if this is the first node of the ckpt region
-        # append the ckpt function defition
+        # append the ckpt function definition
         if idx in start_idx:
             label = start_idx.index(idx)
             ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label])
diff --git a/colossalai/fx/passes/adding_split_node_pass.py b/colossalai/fx/passes/adding_split_node_pass.py
index 2c7b842b530c..245ba5d776da 100644
--- a/colossalai/fx/passes/adding_split_node_pass.py
+++ b/colossalai/fx/passes/adding_split_node_pass.py
@@ -206,7 +206,7 @@ def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):
 
 def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
     """
-    In avgnode_split_pass, simpliy split graph by node number.
+    In avgnode_split_pass, simply split graph by node number.
     """
     mod_graph = gm.graph
     avg_num_node = len(mod_graph.nodes) // pp_size
diff --git a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py
index f28d65e2668a..4571bd93a790 100644
--- a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py
+++ b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py
@@ -16,7 +16,7 @@ def apply(*args, **kwargs):
     return shape_consistency_manager.apply(*args, **kwargs)
 
 
-def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
+def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
     mod_graph = gm.graph
     nodes = tuple(mod_graph.nodes)
 
diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py
index 2b4a8749cfd7..ab203dfd7440 100644
--- a/colossalai/fx/passes/meta_info_prop.py
+++ b/colossalai/fx/passes/meta_info_prop.py
@@ -31,7 +31,7 @@ class TensorMetadata(NamedTuple):
     numel: int
     is_tensor: bool
     # TODO: we can add a list of sharding spec here, and record the sharding
-    # behaviour by appending sharding spec into list.
+    # behavior by appending sharding spec into list.
 
 
 def _extract_tensor_metadata(result: torch.Tensor) -> TensorMetadata:
diff --git a/colossalai/fx/passes/passes_for_gpt2_test.py b/colossalai/fx/passes/passes_for_gpt2_test.py
index abc1a089e9a9..efdd34a01fe0 100644
--- a/colossalai/fx/passes/passes_for_gpt2_test.py
+++ b/colossalai/fx/passes/passes_for_gpt2_test.py
@@ -230,7 +230,7 @@ def record_cross_partition_use(def_node: torch.fx.node.Node,
                     use_partition.partitions_dependent_on.setdefault(def_partition_name)
 
     node_process_list = list(m.graph.nodes)
-    # split nodes into parititons
+    # split nodes into partitions
     while node_process_list:
         node = node_process_list.pop(0)
         orig_nodes[node.name] = node
@@ -277,7 +277,7 @@ def record_cross_partition_use(def_node: torch.fx.node.Node,
     if len(sorted_partitions) != len(partitions):
         raise RuntimeError("cycle exists between partitions!")
 
-    # add placeholders to parititons
+    # add placeholders to partitions
     for partition_name in sorted_partitions:
         partition = partitions[partition_name]
         for input in partition.inputs:
diff --git a/colossalai/fx/passes/split_module.py b/colossalai/fx/passes/split_module.py
index 5ce5b969cbde..61ed037ab7a1 100644
--- a/colossalai/fx/passes/split_module.py
+++ b/colossalai/fx/passes/split_module.py
@@ -29,8 +29,8 @@ def __repr__(self) -> str:
             f" nodes: {self.node_names},\n" \
             f" inputs: {self.inputs},\n" \
             f" outputs: {self.outputs},\n" \
-            f" partitions depenent on: {self.partitions_dependent_on},\n" \
-            f" parition dependents: {self.partition_dependents}"
+            f" partitions dependent on: {self.partitions_dependent_on},\n" \
+            f" partition dependents: {self.partition_dependents}"
 
 
 # Creates subgraphs out of main graph