From 39f1da31dee9ccc7b962f0f0a7e2adafbd0948bb Mon Sep 17 00:00:00 2001 From: digger yu Date: Wed, 17 May 2023 13:56:54 +0800 Subject: [PATCH 1/5] fix typo colossalai/autochunk auto_parallel amp --- colossalai/amp/torch_amp/_grad_scaler.py | 2 +- .../auto_parallel/meta_profiler/meta_registry/linear.py | 2 +- colossalai/auto_parallel/passes/runtime_apply_pass.py | 2 +- .../auto_parallel/passes/runtime_preparation_pass.py | 4 ++-- colossalai/autochunk/trace_flow.py | 6 +++--- colossalai/autochunk/trace_indice.py | 8 ++++---- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/colossalai/amp/torch_amp/_grad_scaler.py b/colossalai/amp/torch_amp/_grad_scaler.py index 7b78998fb8c2..ed4b8e484436 100644 --- a/colossalai/amp/torch_amp/_grad_scaler.py +++ b/colossalai/amp/torch_amp/_grad_scaler.py @@ -240,7 +240,7 @@ def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): for grads in per_dtype_grads.values(): torch._amp_foreach_non_finite_check_and_unscale_(grads, per_device_found_inf.get(device), per_device_inv_scale.get(device)) - # For tensor parallel paramters it should be all-reduced over tensor parallel process group + # For tensor parallel parameters it should be all-reduced over tensor parallel process group if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1: vals = [val for val in per_device_found_inf._per_device_tensors.values()] coalesced = _flatten_dense_tensors(vals) diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py index 7697fc6c383d..94dd9143e0ae 100644 --- a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py +++ b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py @@ -325,7 +325,7 @@ def matmul_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L else: _is_batch_dims_same = False - # retireve dimensions + # retrieve dimensions input_dim_00 = input_tensors[0].shape[-2] input_dim_01 = input_tensors[0].shape[-1] input_dim_10 = input_tensors[1].shape[-2] diff --git a/colossalai/auto_parallel/passes/runtime_apply_pass.py b/colossalai/auto_parallel/passes/runtime_apply_pass.py index a473bb6e973d..2049a06187d2 100644 --- a/colossalai/auto_parallel/passes/runtime_apply_pass.py +++ b/colossalai/auto_parallel/passes/runtime_apply_pass.py @@ -219,7 +219,7 @@ def _comm_spec_apply(gm: torch.fx.GraphModule): return gm -def _act_annotataion_pass(gm: torch.fx.GraphModule): +def _act_annotation_pass(gm: torch.fx.GraphModule): """ This pass is used to add the act annotation to the new inserted nodes. 
""" diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py index 177f3765f5a0..9a2314826448 100644 --- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py +++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py @@ -54,7 +54,7 @@ def size_processing(size: Union[int, torch.Size], return size -def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], +def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], strategies_constructor: StrategiesConstructor): """ This method is used to stick the solution strategy to the nodes and add the information @@ -496,7 +496,7 @@ def runtime_preparation_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, strategies_constructor: StrategiesConstructor, overlap=False): - gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotatation_pass( + gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotation_pass( gm, solution, strategies_constructor) gm = size_value_converting_pass(gm, device_mesh) gm = node_args_converting_pass(gm, device_mesh) diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py index db25267e9b42..11a7e62ff37c 100644 --- a/colossalai/autochunk/trace_flow.py +++ b/colossalai/autochunk/trace_flow.py @@ -64,7 +64,7 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx): return False return True - def _assgin_single_node_flow( + def _assign_single_node_flow( self, arg_node: Node, start_idx: int, @@ -177,7 +177,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx): if get_node_shape(arg) is None: continue arg_list.append(arg) - flow_flag = self._assgin_single_node_flow( + flow_flag = self._assign_single_node_flow( arg, start_idx, end_idx, @@ -315,7 +315,7 @@ def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int, chunk_info["args"]["prepose_nodes"] = prepose_nodes def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx): - # we need to log input nodes to avoid deleteing them in the loop + # we need to log input nodes to avoid deleting them in the loop chunk_node_list = self.node_mgr.get_node_slice_by_idx(start_idx, end_idx + 1) # also need to get some prepose node's arg out of non_chunk_inputs for n in chunk_info["args"]["prepose_nodes"]: diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py index d56bf843f18d..8e6cd3e29bea 100644 --- a/colossalai/autochunk/trace_indice.py +++ b/colossalai/autochunk/trace_indice.py @@ -461,7 +461,7 @@ def _assign_elementwise_indice(self, node, idx): nodes_in.append(node_in) self._inherit_more_indice_from_node_with_exclude(node_in, node) - def _assgin_no_change_indice(self, node, idx): + def _assign_no_change_indice(self, node, idx): self._assign_indice_as_input(node, idx) for node_in in node.args: if type(node_in) == type(node): @@ -792,7 +792,7 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None: self._add_dim(node_idx, i) dim_from.reverse() - # inheirt indice from current node + # inherit indice from current node if len(dim_from) != 0 and len(dim_to) != 0: if dim_diff == 1: if origin_shape[dim_from[0]] == 1: @@ -852,7 +852,7 @@ def trace_indice(self) -> None: elif "split" == node_name: self._assign_split_indice(node, idx) elif any(i == node_name for i in ["to", "contiguous", "clone", "type", "float"]): - self._assgin_no_change_indice(node, idx) + 
self._assign_no_change_indice(node, idx) elif "new_ones" == node_name: self._assign_all_indice(node, idx) elif "flatten" == node_name: @@ -914,7 +914,7 @@ def trace_indice(self) -> None: elif "conv2d" == node_name: self._assign_conv2d_indice(node, idx) elif "identity" == node_name: - self._assgin_no_change_indice(node, idx) + self._assign_no_change_indice(node, idx) elif any(n == node_name for n in ["sigmoid", "dropout", "relu", "silu", "gelu"]): self._assign_elementwise_indice(node, idx) else: From dce5d3d469b34da1dbd26f949c1eecce3e99dfc2 Mon Sep 17 00:00:00 2001 From: digger yu Date: Fri, 19 May 2023 14:18:14 +0800 Subject: [PATCH 2/5] fix typo colossalai/auto_parallel nn utils etc. --- applications/Chat/coati/dataset/reward_dataset.py | 2 +- .../tensor_shard/node_handler/embedding_handler.py | 4 ++-- .../tensor_shard/node_handler/linear_handler.py | 6 +++--- .../tensor_shard/node_handler/matmul_handler.py | 10 +++++----- .../tensor_shard/node_handler/node_handler.py | 2 +- .../auto_parallel/tensor_shard/utils/factory.py | 2 +- .../auto_parallel/tensor_shard/utils/reshape.py | 12 ++++++------ colossalai/nn/optimizer/cpu_adam.py | 2 +- colossalai/nn/optimizer/hybrid_adam.py | 8 ++++---- .../nn/parallel/layers/cache_embedding/cache_mgr.py | 6 +++--- colossalai/utils/common.py | 2 +- colossalai/utils/tensor_detector/readme.md | 12 ++++++------ colossalai/utils/tensor_detector/tensor_detector.py | 8 ++++---- colossalai/zero/gemini/chunk/manager.py | 12 ++++++------ colossalai/zero/gemini/chunk/search_utils.py | 2 +- colossalai/zero/gemini/memory_tracer/memory_stats.py | 2 +- 16 files changed, 46 insertions(+), 46 deletions(-) diff --git a/applications/Chat/coati/dataset/reward_dataset.py b/applications/Chat/coati/dataset/reward_dataset.py index faa1c94d2728..5dacf7e81464 100644 --- a/applications/Chat/coati/dataset/reward_dataset.py +++ b/applications/Chat/coati/dataset/reward_dataset.py @@ -6,7 +6,7 @@ from .utils import is_rank_0 -# Dahaos/rm-static +# Dahoas/rm-static class RmStaticDataset(Dataset): """ Dataset for reward model diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py index e154105b672d..112ee194b4ec 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py @@ -155,7 +155,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li Convert the sharding spec from the logical shape to the physical shape. """ # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy, input_name=str( @@ -221,7 +221,7 @@ def post_process(self, strategy: ShardingStrategy): Convert the sharding spec from the logical shape to the physical shape. 
""" # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy, input_name=str( diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py index 59091dab519f..ea541e434009 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py @@ -23,7 +23,7 @@ def _update_sharding_spec_for_transposed_weight_for_linear(strategy: ShardingStr weight_name: str) -> ShardingStrategy: """ This function is a helper function used by both module node handler and function node handler. This function will - convert the sharding spec for the transposed weight to the correct partititon spec. + convert the sharding spec for the transposed weight to the correct partition spec. Args: strategy (ShardingStrategy): the strategy generated by the strategy generator. @@ -197,7 +197,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name='weight') # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at dim 0 to one of the first few dimensions of the input strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy, input_name=str(self.node.args[0]), @@ -267,7 +267,7 @@ def post_process(self, strategy: ShardingStrategy): strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name=str(self.node.args[1])) # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at dim 0 to one of the first few dimensions of the input strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy, input_name=str(self.node.args[0]), diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py index f3c9d0cbf826..bfebc3f59d0c 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py @@ -48,8 +48,8 @@ def get_matmul_type(input_dim: int, other_dim: int): Determine which type of matmul operation should be executed for the given tensor dimensions. 
Args: - input_dim (int): the number of dimensions for the input tenosr - other_dim (int): the number of dimensions for the other tenosr + input_dim (int): the number of dimensions for the input tensor + other_dim (int): the number of dimensions for the other tensor """ if input_dim == 1 and other_dim == 1: matmul_type = MatMulType.DOT @@ -268,13 +268,13 @@ def _update_sharding_spec(key, strategy, physical_batch_dim): dim_partition_dict = sharding_spec.dim_partition_dict entire_shape = sharding_spec.entire_shape - # upddate the dimension index for the matrix dimensions + # update the dimension index for the matrix dimensions if 2 in dim_partition_dict: dim_partition_dict[len(self.batch_dims_before_view) + 1] = dim_partition_dict.pop(2) if 1 in dim_partition_dict: dim_partition_dict[len(self.batch_dims_before_view)] = dim_partition_dict.pop(1) - # map the logical batch dim to phyiscal batch dim + # map the logical batch dim to physical batch dim if 0 in dim_partition_dict: batch_dim_shard = dim_partition_dict.pop(0) dim_partition_dict[physical_batch_dim] = batch_dim_shard @@ -414,7 +414,7 @@ def _get_logical_shape_for_dot(self): def _get_logical_shape_for_mm(self): """ - We need to handle the input tensor for a matrix-matrix multiplcation as the input + We need to handle the input tensor for a matrix-matrix multiplication as the input tensor can be a 1D or 2D tensor. If it is a 1D tensor, 1 will be prepended to its shape (e.g. [4] -> [1, 4]). """ diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py index d3d09a9dcf65..4262d76173e4 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py @@ -212,7 +212,7 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV return self.strategies_vector def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, List[ShardingStrategy]]: - # tranform the strategy generated + # transform the strategy generated # e.g. to process the sharding strategy for the transposed weights return strategy diff --git a/colossalai/auto_parallel/tensor_shard/utils/factory.py b/colossalai/auto_parallel/tensor_shard/utils/factory.py index 05331e560001..347c10aa102d 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/factory.py +++ b/colossalai/auto_parallel/tensor_shard/utils/factory.py @@ -30,7 +30,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic """ if isinstance(input_, Node): - assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data' + assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data' meta_tensor = input_._meta_data assert meta_tensor is not None, "The given node's _meta_data attribute is None" shape = meta_tensor.shape diff --git a/colossalai/auto_parallel/tensor_shard/utils/reshape.py b/colossalai/auto_parallel/tensor_shard/utils/reshape.py index a32a14bf7d57..d0ebbd7e8b1b 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/reshape.py +++ b/colossalai/auto_parallel/tensor_shard/utils/reshape.py @@ -6,12 +6,12 @@ class PreviousStatus(Enum): """ - This class shows the status of previous comparision. + This class shows the status of previous comparison. """ RESET = 0 - # ORIGIN means the dimension size of original tensor is larger in the previous comparision. 
+ # ORIGIN means the dimension size of original tensor is larger in the previous comparison. ORIGIN = 1 - # TGT means the dimension size of target tensor is larger in the previous comparision. + # TGT means the dimension size of target tensor is larger in the previous comparison. TGT = 2 @@ -91,7 +91,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D tgt_index += 1 if previous_label == PreviousStatus.TGT: - # if the target dimension size is larger in the previous comparision, which means + # if the target dimension size is larger in the previous comparison, which means # the origin dimension size has already accumulated larger than target dimension size, so # we need to offload the origin dims and tgt dims into the reshape_mapping_dict. reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims) @@ -111,7 +111,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D origin_index += 1 if previous_label == PreviousStatus.ORIGIN: - # if the origin element is larger in the previous comparision, which means + # if the origin element is larger in the previous comparison, which means # the target element has already accumulated larger than origin element, so # we need to offload the origin dims and tgt dims into the reshape_mapping_dict. reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims) @@ -139,7 +139,7 @@ def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]], Rule: For a sharded dimension of input tensor, if it is not the minimum element of the input tuple, the function will return false. - To illustrate this issue, there are two cases to analyse: + To illustrate this issue, there are two cases to analyze: 1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal operation without distributed tensor. 2. sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 54036973e1e3..bb561a106515 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -13,7 +13,7 @@ class CPUAdam(NVMeOptimizer): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of paramters. + Supports parameters updating on both GPU and CPU, depanding on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py index 1d0fb92de499..be6311c6c29f 100644 --- a/colossalai/nn/optimizer/hybrid_adam.py +++ b/colossalai/nn/optimizer/hybrid_adam.py @@ -13,19 +13,19 @@ class HybridAdam(NVMeOptimizer): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of paramters. + Supports parameters updating on both GPU and CPU, depanding on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. * Parameters on GPU and gradients on CPU is **not** allowed. - `HybriadAdam` requires CUDA extensions which can be built during installation or runtime. + `HybridAdam` requires CUDA extensions which can be built during installation or runtime. 
This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam. * For parameters updating on CPU, it uses CPUAdam. * For parameters updating on GPU, it uses FusedAdam. - * Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients. + * Hybrid precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients. :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``, or ``torch.optim.Adam`` with ``adamw_mode=False`` @@ -131,7 +131,7 @@ def step(self, closure=None, div_scale: float = -1): assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda" assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda" - # record the state by gruop and update at once + # record the state by group and update at once g_l.append(p.grad.data) p_l.append(p.data) m_l.append(state['exp_avg']) diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py index da043df368ae..a6159856dcce 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py +++ b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py @@ -20,8 +20,8 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None: return torch.cuda.current_stream().wait_stream(stream) # As mentioned in https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html, - # PyTorch uses the "caching allocator" for memroy allocation for tensors. When a tensor is - # freed, its memory is likely to be reused by newly constructed tenosrs. By default, + # PyTorch uses the "caching allocator" for memory allocation for tensors. When a tensor is + # freed, its memory is likely to be reused by newly constructed tensors. By default, # this allocator traces whether a tensor is still in use by only the CUDA stream where it # was created. When a tensor is used by additional CUDA streams, we need to call record_stream # to tell the allocator about all these streams. Otherwise, the allocator might free the @@ -294,7 +294,7 @@ def print_comm_stats(self): print( f"CPU->CUDA BWD {self._cpu_to_cuda_numel * self.elem_size_in_byte / 1e6 / elapsed} MB/s {self._cpu_to_cuda_numel / 1e6} M elem" ) - print(f'cpu_to_cuda_elpase {elapsed} sec') + print(f'cpu_to_cuda_elapse {elapsed} sec') for k, v in self._elapsed_dict.items(): print(f'{k}: {v}') diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 95b3b8014af1..8022e84dc24b 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -324,7 +324,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): norm_type = float(norm_type) # Parameters can be on CPU or CUDA - # If parameters are on CPU, disable CUDA kernerls + # If parameters are on CPU, disable CUDA kernels # Calculate norm. if norm_type == inf: diff --git a/colossalai/utils/tensor_detector/readme.md b/colossalai/utils/tensor_detector/readme.md index 840dc8f4eca6..d6852ea55b54 100644 --- a/colossalai/utils/tensor_detector/readme.md +++ b/colossalai/utils/tensor_detector/readme.md @@ -46,7 +46,7 @@ detector.detect() I have made some comments on the right of the output for your understanding. -Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memery Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly. 
+Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memory Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly. **The order of print is not equal to the order the tensor creates, but they are really close.** @@ -61,7 +61,7 @@ Note that the total `Mem` of all the tensors and parameters is not equal to `Tot + mlp.2.bias cuda:0 (32,) True torch.float32 128 B ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 27 -Totle GPU Memery Allocated on cuda:0 is 4.5 KB +Total GPU Memory Allocated on cuda:0 is 4.5 KB ------------------------------------------------------------------------------------------------------------ @@ -72,7 +72,7 @@ Totle GPU Memery Allocated on cuda:0 is 4.5 KB + Tensor cuda:0 (32,) True torch.float32 128 B # output ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 30 -Totle GPU Memery Allocated on cuda:0 is 5.5 KB +Total GPU Memory Allocated on cuda:0 is 5.5 KB ------------------------------------------------------------------------------------------------------------ @@ -82,7 +82,7 @@ Totle GPU Memery Allocated on cuda:0 is 5.5 KB + Tensor cuda:0 () True torch.float32 4 B # loss ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 32 -Totle GPU Memery Allocated on cuda:0 is 6.0 KB +Total GPU Memory Allocated on cuda:0 is 6.0 KB ------------------------------------------------------------------------------------------------------------ @@ -103,7 +103,7 @@ Totle GPU Memery Allocated on cuda:0 is 6.0 KB - Tensor cuda:0 (8,) True torch.float32 32 B # deleted activation ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 34 -Totle GPU Memery Allocated on cuda:0 is 10.0 KB +Total GPU Memory Allocated on cuda:0 is 10.0 KB ------------------------------------------------------------------------------------------------------------ @@ -117,7 +117,7 @@ Totle GPU Memery Allocated on cuda:0 is 10.0 KB + Tensor cuda:0 (32,) False torch.float32 128 B ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 36 -Totle GPU Memery Allocated on cuda:0 is 14.0 KB +Total GPU Memory Allocated on cuda:0 is 14.0 KB ------------------------------------------------------------------------------------------------------------ ``` diff --git a/colossalai/utils/tensor_detector/tensor_detector.py b/colossalai/utils/tensor_detector/tensor_detector.py index a8186f76834c..cfcd4e47b4cb 100644 --- a/colossalai/utils/tensor_detector/tensor_detector.py +++ b/colossalai/utils/tensor_detector/tensor_detector.py @@ -55,7 +55,7 @@ def get_tensor_mem(self, tensor): return self.mem_format(memory_size) def mem_format(self, real_memory_size): - # format the tensor memory into a reasonal magnitude + # format the tensor memory into a reasonable magnitude if real_memory_size >= 2**30: return str(real_memory_size / (2**30)) + ' GB' if real_memory_size >= 2**20: @@ -71,7 +71,7 @@ def collect_tensors_state(self): if (not self.include_cpu) and obj.device == torch.device('cpu'): continue 
self.detected.append(id(obj)) - # skip paramters we had added in __init__ when module is an instance of nn.Module for the first epoch + # skip parameters we had added in __init__ when module is an instance of nn.Module for the first epoch if id(obj) not in self.tensor_info: name = type(obj).__name__ @@ -84,7 +84,7 @@ def collect_tensors_state(self): name = par_name + ' (with grad)' else: # with no grad attached - # there will be no new paramters created during running + # there will be no new parameters created during running # so it must be in saved_tensor_info continue # we can also marked common tensors as tensor(with grad) @@ -155,7 +155,7 @@ def print_tensors_state(self): if device == torch.device('cpu'): continue gpu_mem_alloc = self.mem_format(torch.cuda.memory_allocated(device)) - self.info += f"Totle GPU Memery Allocated on {device} is {gpu_mem_alloc}\n" + self.info += f"Total GPU Memory Allocated on {device} is {gpu_mem_alloc}\n" self.info += LINE self.info += '\n\n' if self.show_info: diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py index d85df0b00476..77368d06d255 100644 --- a/colossalai/zero/gemini/chunk/manager.py +++ b/colossalai/zero/gemini/chunk/manager.py @@ -102,7 +102,7 @@ def access_chunk(self, chunk: Chunk) -> None: """ if chunk in self.accessed_chunks: return - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) if chunk.device_type == 'cpu': chunk.shard_move(get_current_device()) self.__add_accessed_chunk(chunk) @@ -114,7 +114,7 @@ def release_chunk(self, chunk: Chunk) -> None: if chunk not in self.accessed_chunks: return if chunk.can_release: - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) self.__sub_accessed_chunk(chunk) self.__add_memory_usage(chunk.memory_usage) @@ -123,7 +123,7 @@ def move_chunk(self, chunk: Chunk, device: torch.device, force_copy: bool = Fals """ if not chunk.can_move or chunk.device_type == device.type: return - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) chunk.shard_move(device, force_copy) self.__add_memory_usage(chunk.memory_usage) @@ -138,7 +138,7 @@ def reduce_chunk(self, chunk: Chunk) -> bool: """ if not chunk.can_reduce: return False - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) chunk.reduce() self.__sub_accessed_chunk(chunk) self.__add_memory_usage(chunk.memory_usage) @@ -228,11 +228,11 @@ def __get_chunk_group(self, group_name: str) -> Deque: return self.chunk_groups[group_name] def __close_one_chunk(self, chunk: Chunk): - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) chunk.close_chunk() self.__add_memory_usage(chunk.memory_usage) - def __sub_memroy_usage(self, usage: Dict[str, int]): + def __sub_memory_usage(self, usage: Dict[str, int]): for k, v in usage.items(): self.total_mem[k] -= v diff --git a/colossalai/zero/gemini/chunk/search_utils.py b/colossalai/zero/gemini/chunk/search_utils.py index da58e038c879..881ceb0b3b97 100644 --- a/colossalai/zero/gemini/chunk/search_utils.py +++ b/colossalai/zero/gemini/chunk/search_utils.py @@ -85,7 +85,7 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator, Classify the parameters by their dp degree Args: - param_order (OrderedParamGenerator): the order of param be visied + param_order (OrderedParamGenerator): the order of param be vised strict_ddp_flag (bool, optional): whether to enable the strict ddp mode. 
Defaults to False. Returns: diff --git a/colossalai/zero/gemini/memory_tracer/memory_stats.py b/colossalai/zero/gemini/memory_tracer/memory_stats.py index 9a45034ee27e..41d7e5754e96 100644 --- a/colossalai/zero/gemini/memory_tracer/memory_stats.py +++ b/colossalai/zero/gemini/memory_tracer/memory_stats.py @@ -59,7 +59,7 @@ def increase_preop_step(self, param_list: List[torch.nn.Parameter]): time step. Args: - param_list (List[torch.nn.Parameter]): a list of torch paramters. + param_list (List[torch.nn.Parameter]): a list of torch parameters. """ for p in param_list: if p not in self._param_step_dict: From 86ad586f425dc994a358d426c62482fbfe2be19b Mon Sep 17 00:00:00 2001 From: digger yu Date: Tue, 23 May 2023 15:56:31 +0800 Subject: [PATCH 3/5] fix typo colossalai/auto_parallel autochunk fx/passes etc. --- .github/workflows/README.md | 4 ++-- colossalai/auto_parallel/passes/meta_info_prop.py | 2 +- .../node_handler/strategy/batch_norm_generator.py | 2 +- .../node_handler/strategy/conv_strategy_generator.py | 4 ++-- .../node_handler/strategy/layer_norm_generator.py | 4 ++-- .../node_handler/strategy/normal_pooling_generator.py | 6 +++--- colossalai/autochunk/trace_flow.py | 8 ++++---- colossalai/autochunk/trace_indice.py | 4 ++-- colossalai/booster/plugin/gemini_plugin.py | 2 +- colossalai/cluster/dist_coordinator.py | 2 +- colossalai/device/alpha_beta_profiler.py | 2 +- colossalai/engine/schedule/_pipeline_schedule.py | 4 ++-- colossalai/engine/schedule/_pipeline_schedule_v2.py | 2 +- colossalai/fx/codegen/activation_checkpoint_codegen.py | 2 +- colossalai/fx/passes/adding_split_node_pass.py | 2 +- .../passes/experimental/adding_shape_consistency_pass.py | 2 +- colossalai/fx/passes/meta_info_prop.py | 2 +- colossalai/fx/passes/passes_for_gpt2_test.py | 4 ++-- colossalai/fx/passes/split_module.py | 4 ++-- 19 files changed, 31 insertions(+), 31 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8fc14e0d531a..f40f4cc86d1b 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -14,7 +14,7 @@ - [Compatibility Test on Dispatch](#compatibility-test-on-dispatch) - [Release](#release) - [User Friendliness](#user-friendliness) - - [Commmunity](#commmunity) + - [Community](#community) - [Configuration](#configuration) - [Progress Log](#progress-log) @@ -97,7 +97,7 @@ This workflow is triggered by manually dispatching the workflow. It has the foll | `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. | | `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. 
| -### Commmunity +### Community | Workflow Name | File name | Description | | -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- | diff --git a/colossalai/auto_parallel/passes/meta_info_prop.py b/colossalai/auto_parallel/passes/meta_info_prop.py index bc0960483980..0673b767de7b 100644 --- a/colossalai/auto_parallel/passes/meta_info_prop.py +++ b/colossalai/auto_parallel/passes/meta_info_prop.py @@ -148,7 +148,7 @@ def node_handler(self, node: Node) -> None: graph_info.fwd_tmp = buffer_tensors graph_info.fwd_out = output_tensors - # fetch other memory informations + # fetch other memory information memory_cost = meta_info.memory_cost graph_info.fwd_mem_tmp = memory_cost.fwd.temp graph_info.fwd_mem_out = memory_cost.fwd.activation diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py index 79b69acb25b3..416dc9c29cad 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py @@ -44,7 +44,7 @@ def update_compute_cost(self, strategy: ShardingStrategy): ''' Compute the computation cost per device with this specific strategy. - Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' # TODO: a constant coefficient need to be added. # 1D: (L) * N * Cin diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py index c2154b3104d3..e605a68a326b 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py @@ -38,9 +38,9 @@ def update_compute_cost(self, strategy: ShardingStrategy): ''' Compute the computation cost per device with this specific strategy. - Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' - # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size. # 1D: (L) * N * Cout * Cin * kernel # 2D: (H * W) * N * Cout * Cin * kernel # 3D: (H * W * D) * N * Cout * Cin * kernel diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py index fbb6070f7e82..65b173bbf65d 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py @@ -34,9 +34,9 @@ def update_compute_cost(self, strategy: ShardingStrategy): ''' Compute the computation cost per device with this specific strategy. - Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' - # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size. 
+ # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size. # TODO: a constant coefficient need to be added. sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device() diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py index 9df6d2fbfa12..b7db42f8f67e 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py @@ -17,7 +17,7 @@ class NormalPoolStrategyGenerator(StrategyGenerator): """ NormalPoolStrategyGenerator is a generic class to generate strategies for pool operation like MaxPoolxd. The reason we call this normal pool is AvgPoolxd and MaxPoolxd are taking the kernel size element from image, - and reduce them depening on the operation type. + and reduce them depending on the operation type. """ def validate(self) -> bool: @@ -35,9 +35,9 @@ def update_compute_cost(self, strategy: ShardingStrategy) -> TrainCycleItem: ''' Compute the computation cost per device with this specific strategy. - Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size. ''' - # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size. + # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size. # 1D: (Lout) * N * C * kernel # 2D: (H * W) * N * Cout * Cin * kernel # 3D: (H * W * D) * N * Cout * Cin * kernel diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py index 11a7e62ff37c..a1080fda1541 100644 --- a/colossalai/autochunk/trace_flow.py +++ b/colossalai/autochunk/trace_flow.py @@ -366,8 +366,8 @@ def flow_search(self, start_idx, start_dim, end_idx, end_dim): # find non chunk inputs chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx) - # reassgin reshape size, some size may have changed due to chunk - chunk_info = self._reassgin_reshape_size(chunk_info) + # reassign reshape size, some size may have changed due to chunk + chunk_info = self._reassign_reshape_size(chunk_info) return chunk_info @@ -428,10 +428,10 @@ def _update_chunk_info(self, chunk_info: Dict, new_all_node_info: Dict, output: chunk_info["outputs_dim"].append(output_dim) return True - def _reassgin_reshape_size(self, chunk_info): + def _reassign_reshape_size(self, chunk_info): """ Some shape args in reshape may have changed due to chunk - reassgin those changed shape + reassign those changed shape """ chunk_region = chunk_info["region"] reshape_size = {} diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py index 8e6cd3e29bea..fbe0741b8827 100644 --- a/colossalai/autochunk/trace_indice.py +++ b/colossalai/autochunk/trace_indice.py @@ -397,7 +397,7 @@ def _assign_conv2d_indice(self, node: Node, node_idx: int) -> None: input_node = node.args[0] assert len(get_node_shape(input_node)) == 4 - # assgin index + # assign index self._assign_indice_as_input(node, node_idx, input_node) self._del_dim(node_idx, 1) self._add_dim(node_idx, 1) @@ -415,7 +415,7 @@ def _assign_interpolate_indice(self, node: Node, node_idx: int) -> None: assert node.kwargs['size'] is None assert len(get_node_shape(node)) == 4 - # assgin index + # assign index 
self._assign_indice_as_input(node, node_idx) self._mark_computation(node, node_idx, [-1, -2]) diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index bb3124642ccf..adbf4803eefe 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -179,7 +179,7 @@ class GeminiPlugin(DPPluginBase): Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte. - If the aggregate size of parameters is still samller than the minimum chunk size, + If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk. memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer. gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward) diff --git a/colossalai/cluster/dist_coordinator.py b/colossalai/cluster/dist_coordinator.py index 99dde810e112..3ee364ec3364 100644 --- a/colossalai/cluster/dist_coordinator.py +++ b/colossalai/cluster/dist_coordinator.py @@ -181,7 +181,7 @@ def on_master_only(self, process_group: ProcessGroup = None): """ is_master = self.is_master(process_group) - # define an inner functiuon + # define an inner function def decorator(func): @functools.wraps(func) diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py index af2b10928c6f..f8b20de9bc37 100644 --- a/colossalai/device/alpha_beta_profiler.py +++ b/colossalai/device/alpha_beta_profiler.py @@ -381,7 +381,7 @@ def _extract_alpha_beta(pg, pg_handler): first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group) second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group) mesh_alpha = [first_latency, second_latency] - # The beta values have been enlarged by 1e10 times temporarilly because the computation cost + # The beta values have been enlarged by 1e10 times temporarily because the computation cost # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in future. 
mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth] diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py index 38175fe0941c..9fc301a26559 100644 --- a/colossalai/engine/schedule/_pipeline_schedule.py +++ b/colossalai/engine/schedule/_pipeline_schedule.py @@ -152,9 +152,9 @@ def _get_data_slice(self, data, offset): raise TypeError(f"Expected data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}") def load_micro_batch(self): - mciro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset) + micro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset) self.microbatch_offset += self.microbatch_size - return self._move_to_device(mciro_batch_data) + return self._move_to_device(micro_batch_data) def pre_processing(self, engine): from colossalai.zero.legacy import ShardedModelV2 diff --git a/colossalai/engine/schedule/_pipeline_schedule_v2.py b/colossalai/engine/schedule/_pipeline_schedule_v2.py index 28c58bd82b5c..89e45c7aacec 100644 --- a/colossalai/engine/schedule/_pipeline_schedule_v2.py +++ b/colossalai/engine/schedule/_pipeline_schedule_v2.py @@ -84,7 +84,7 @@ def forward_backward_step(self, 'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.' self.load_batch(data_iter) - # num_warmup_microbatches is the step when not all the processers are working + # num_warmup_microbatches is the step when not all the processes are working num_warmup_microbatches = \ (gpc.get_world_size(ParallelMode.PIPELINE) - gpc.get_local_rank(ParallelMode.PIPELINE) - 1) diff --git a/colossalai/fx/codegen/activation_checkpoint_codegen.py b/colossalai/fx/codegen/activation_checkpoint_codegen.py index 5a72cb9ca923..33b164800262 100644 --- a/colossalai/fx/codegen/activation_checkpoint_codegen.py +++ b/colossalai/fx/codegen/activation_checkpoint_codegen.py @@ -523,7 +523,7 @@ def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func, # append code text to body for idx, node in enumerate(node_list): # if this is the first node of the ckpt region - # append the ckpt function defition + # append the ckpt function definition if idx in start_idx: label = start_idx.index(idx) ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label]) diff --git a/colossalai/fx/passes/adding_split_node_pass.py b/colossalai/fx/passes/adding_split_node_pass.py index 2c7b842b530c..245ba5d776da 100644 --- a/colossalai/fx/passes/adding_split_node_pass.py +++ b/colossalai/fx/passes/adding_split_node_pass.py @@ -206,7 +206,7 @@ def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int): def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int): """ - In avgnode_split_pass, simpliy split graph by node number. + In avgnode_split_pass, simply split graph by node number. 
""" mod_graph = gm.graph avg_num_node = len(mod_graph.nodes) // pp_size diff --git a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py index f28d65e2668a..4571bd93a790 100644 --- a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py +++ b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py @@ -16,7 +16,7 @@ def apply(*args, **kwargs): return shape_consistency_manager.apply(*args, **kwargs) -def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh): +def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh): mod_graph = gm.graph nodes = tuple(mod_graph.nodes) diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py index 2b4a8749cfd7..ab203dfd7440 100644 --- a/colossalai/fx/passes/meta_info_prop.py +++ b/colossalai/fx/passes/meta_info_prop.py @@ -31,7 +31,7 @@ class TensorMetadata(NamedTuple): numel: int is_tensor: bool # TODO: we can add a list of sharding spec here, and record the sharding - # behaviour by appending sharding spec into list. + # behavior by appending sharding spec into list. def _extract_tensor_metadata(result: torch.Tensor) -> TensorMetadata: diff --git a/colossalai/fx/passes/passes_for_gpt2_test.py b/colossalai/fx/passes/passes_for_gpt2_test.py index abc1a089e9a9..efdd34a01fe0 100644 --- a/colossalai/fx/passes/passes_for_gpt2_test.py +++ b/colossalai/fx/passes/passes_for_gpt2_test.py @@ -230,7 +230,7 @@ def record_cross_partition_use(def_node: torch.fx.node.Node, use_partition.partitions_dependent_on.setdefault(def_partition_name) node_process_list = list(m.graph.nodes) - # split nodes into parititons + # split nodes into partitions while node_process_list: node = node_process_list.pop(0) orig_nodes[node.name] = node @@ -277,7 +277,7 @@ def record_cross_partition_use(def_node: torch.fx.node.Node, if len(sorted_partitions) != len(partitions): raise RuntimeError("cycle exists between partitions!") - # add placeholders to parititons + # add placeholders to partitions for partition_name in sorted_partitions: partition = partitions[partition_name] for input in partition.inputs: diff --git a/colossalai/fx/passes/split_module.py b/colossalai/fx/passes/split_module.py index 5ce5b969cbde..61ed037ab7a1 100644 --- a/colossalai/fx/passes/split_module.py +++ b/colossalai/fx/passes/split_module.py @@ -29,8 +29,8 @@ def __repr__(self) -> str: f" nodes: {self.node_names},\n" \ f" inputs: {self.inputs},\n" \ f" outputs: {self.outputs},\n" \ - f" partitions depenent on: {self.partitions_dependent_on},\n" \ - f" parition dependents: {self.partition_dependents}" + f" partitions dependent on: {self.partitions_dependent_on},\n" \ + f" partition dependents: {self.partition_dependents}" # Creates subgraphs out of main graph From 9636e44bbd49663424c02d661ec82dc46b4ab48a Mon Sep 17 00:00:00 2001 From: digger yu Date: Wed, 24 May 2023 09:53:21 +0800 Subject: [PATCH 4/5] fix typo docs/ --- docs/README.md | 2 +- docs/REFERENCE.md | 2 +- docs/source/en/advanced_tutorials/add_your_parallel.md | 2 +- .../integrate_mixture_of_experts_into_your_model.md | 2 +- docs/source/en/advanced_tutorials/opt_service.md | 2 +- .../parallelize_your_training_like_Megatron.md | 2 +- .../train_vit_using_pipeline_parallelism.md | 2 +- .../train_vit_with_hybrid_parallelism.md | 10 +++++----- docs/source/en/basics/booster_api.md | 4 ++-- docs/source/en/basics/colotensor_concept.md | 2 +- 
docs/source/en/features/3D_tensor_parallel.md | 2 +- .../en/features/gradient_clipping_with_booster.md | 2 +- docs/source/en/features/nvme_offload.md | 2 +- docs/source/en/features/pipeline_parallel.md | 2 +- docs/source/en/features/zero_with_chunk.md | 2 +- .../zh-Hans/advanced_tutorials/add_your_parallel.md | 2 +- .../integrate_mixture_of_experts_into_your_model.md | 2 +- docs/source/zh-Hans/advanced_tutorials/meet_gemini.md | 4 ++-- docs/source/zh-Hans/advanced_tutorials/opt_service.md | 2 +- .../train_vit_with_hybrid_parallelism.md | 4 ++-- docs/source/zh-Hans/basics/colotensor_concept.md | 2 +- .../zh-Hans/features/mixed_precision_training.md | 2 +- docs/source/zh-Hans/features/nvme_offload.md | 2 +- 23 files changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/README.md b/docs/README.md index f520608d552c..f0cb50ffe217 100644 --- a/docs/README.md +++ b/docs/README.md @@ -98,7 +98,7 @@ Lastly, if you want to skip some code, you just need to add the following annota ``` -If you have any dependency required, please add it to `requriements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for Conda. +If you have any dependency required, please add it to `requirements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for Conda. ### 💉 Auto Documentation diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 2681198191cb..0984b2dc3f28 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -1,6 +1,6 @@ # References -The Colossal-AI project aims to provide a wide array of parallelism techniques for the machine learning community in the big-model era. This project is inspired by quite a few reserach works, some are conducted by some of our developers and the others are research projects open-sourced by other organizations. We would like to credit these amazing projects below in the IEEE citation format. +The Colossal-AI project aims to provide a wide array of parallelism techniques for the machine learning community in the big-model era. This project is inspired by quite a few research works, some are conducted by some of our developers and the others are research projects open-sourced by other organizations. We would like to credit these amazing projects below in the IEEE citation format. ## By Our Team diff --git a/docs/source/en/advanced_tutorials/add_your_parallel.md b/docs/source/en/advanced_tutorials/add_your_parallel.md index be7284a7ab64..1caf58c8734e 100644 --- a/docs/source/en/advanced_tutorials/add_your_parallel.md +++ b/docs/source/en/advanced_tutorials/add_your_parallel.md @@ -56,7 +56,7 @@ follow the steps below to create a new distributed initialization. world_size: int, config: Config, data_parallel_size: int, - pipeline_parlalel_size: int, + pipeline_parallel_size: int, tensor_parallel_size: int, arg1, arg2): diff --git a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index e01caf76d2b3..d5edd135c079 100644 --- a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -121,7 +121,7 @@ Inside the initialization of Experts, the local expert number of each GPU will b ## Train Your Model -Do not to forget to use `colossalai.initialize` function in `colosalai` to add gradient handler for the engine. +Do not to forget to use `colossalai.initialize` function in `colossalai` to add gradient handler for the engine. 
 We handle the back-propagation of MoE models for you. In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients.
 You can find more information about the handler `MoeGradientHandler` in colossal directory.
diff --git a/docs/source/en/advanced_tutorials/opt_service.md b/docs/source/en/advanced_tutorials/opt_service.md
index a43ec7fdd1fe..eccfa12f9389 100644
--- a/docs/source/en/advanced_tutorials/opt_service.md
+++ b/docs/source/en/advanced_tutorials/opt_service.md
@@ -53,7 +53,7 @@ export CHECKPOINT_DIR="your_opt_checkpoint_path"
 # the ${CONFIG_DIR} must contain a server.sh file as the entry of service
 export CONFIG_DIR="config_file_path"
 
-docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:lastest
+docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:latest
 ```
 
 Then open `https://[IP-ADDRESS]:8020/docs#` in your browser to try out!
diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
index e7698e5e9d1b..1a7ab9a65674 100644
--- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -69,7 +69,7 @@ After the forward operation of the embedding module, each word in all sequences
 The embedding module
 
-Each transformer layer contains two blocks. The self-attention operation is called in the first block and a two-layer percepton is located in the second block.
+Each transformer layer contains two blocks. The self-attention operation is called in the first block and a two-layer perceptron is located in the second block.
diff --git a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md index b26599740c5f..6adfe4f113da 100644 --- a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md @@ -195,7 +195,7 @@ def build_cifar(batch_size): ## Training ViT using pipeline -You can set the size of pipeline parallel and number of microbatches in config. `NUM_CHUNKS` is useful when using interleved-pipeline (for more details see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) ). The original batch will be split into `num_microbatches`, and each stage will load a micro batch each time. Then we will generate an approriate schedule for you to execute the pipeline training. If you don't need the output and label of model, you can set `return_output_label` to `False` when calling `trainer.fit()` which can further reduce GPU memory usage. +You can set the size of pipeline parallel and number of microbatches in config. `NUM_CHUNKS` is useful when using interleaved-pipeline (for more details see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) ). The original batch will be split into `num_microbatches`, and each stage will load a micro batch each time. Then we will generate an appropriate schedule for you to execute the pipeline training. If you don't need the output and label of model, you can set `return_output_label` to `False` when calling `trainer.fit()` which can further reduce GPU memory usage. You should `export DATA=/path/to/cifar`. diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index b2438a1cf562..a2deaeb88893 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -16,14 +16,14 @@ In this example for ViT model, Colossal-AI provides three different parallelism We will show you how to train ViT on CIFAR-10 dataset with these parallelism techniques. To run this example, you will need 2-4 GPUs. -## Tabel of Contents +## Table of Contents 1. Colossal-AI installation 2. Steps to train ViT with data parallelism 3. Steps to train ViT with pipeline parallelism 4. Steps to train ViT with tensor parallelism or hybrid parallelism ## Colossal-AI Installation -You can install Colossal-AI pacakage and its dependencies with PyPI. +You can install Colossal-AI package and its dependencies with PyPI. ```bash pip install colossalai ``` @@ -31,7 +31,7 @@ pip install colossalai ## Data Parallelism -Data parallism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: +Data parallelism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: 1. Define a configuration file 2. Change a few lines of code in train script @@ -94,7 +94,7 @@ from torchvision import transforms from torchvision.datasets import CIFAR10 ``` -#### Lauch Colossal-AI +#### Launch Colossal-AI In train script, you need to initialize the distributed environment for Colossal-AI after your config file is prepared. We call this process `launch`. 
In Colossal-AI, we provided several launch methods to initialize the distributed backend. In most cases, you can use `colossalai.launch` and `colossalai.get_default_parser` to pass the parameters via command line. Besides, Colossal-AI can utilize the existing launch tool provided by PyTorch as many users are familiar with by using `colossalai.launch_from_torch`. For more details, you can view the related [documents](https://www.colossalai.org/docs/basics/launch_colossalai). @@ -613,7 +613,7 @@ NUM_MICRO_BATCHES = parallel['pipeline'] TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) ``` -Ohter configs: +Other configs: ```python # hyper parameters # BATCH_SIZE is as per GPU diff --git a/docs/source/en/basics/booster_api.md b/docs/source/en/basics/booster_api.md index cafcb6d432c3..a446ff31be83 100644 --- a/docs/source/en/basics/booster_api.md +++ b/docs/source/en/basics/booster_api.md @@ -14,9 +14,9 @@ In our new design, `colossalai.booster` replaces the role of `colossalai.initial ### Plugin Plugin is an important component that manages parallel configuration (eg: The gemini plugin encapsulates the gemini acceleration solution). Currently supported plugins are as follows: -***GeminiPlugin:*** This plugin wrapps the Gemini acceleration solution, that ZeRO with chunk-based memory management. +***GeminiPlugin:*** This plugin wraps the Gemini acceleration solution, that is, ZeRO with chunk-based memory management. -***TorchDDPPlugin:*** This plugin wrapps the DDP acceleration solution, it implements data parallelism at the module level which can run across multiple machines. +***TorchDDPPlugin:*** This plugin wraps the DDP acceleration solution; it implements data parallelism at the module level and can run across multiple machines. ***LowLevelZeroPlugin:*** This plugin wraps the 1/2 stage of Zero Redundancy Optimizer. Stage 1 : Shards optimizer states across data parallel workers/GPUs. Stage 2 : Shards optimizer states + gradients across data parallel workers/GPUs. diff --git a/docs/source/en/basics/colotensor_concept.md b/docs/source/en/basics/colotensor_concept.md index 050f2ef9f092..abe470fe0794 100644 --- a/docs/source/en/basics/colotensor_concept.md +++ b/docs/source/en/basics/colotensor_concept.md @@ -52,7 +52,7 @@ An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/c ## Example -Let's see an example. A ColoTensor is initialized and sharded on 8 GPUs using tp_degree=4, dp_dgree=2. And then the tensor is sharded along the last dim among the TP process groups. Finally, we reshard it along the first dim (0 dim) among the TP process groups. We encourage users to run the code and observe the shape of each tensor. +Let's see an example. A ColoTensor is initialized and sharded on 8 GPUs using tp_degree=4, dp_degree=2. And then the tensor is sharded along the last dim among the TP process groups. Finally, we reshard it along the first dim (0 dim) among the TP process groups. We encourage users to run the code and observe the shape of each tensor. ```python diff --git a/docs/source/en/features/3D_tensor_parallel.md b/docs/source/en/features/3D_tensor_parallel.md index b9e98eac9350..0e28f08b23c9 100644 --- a/docs/source/en/features/3D_tensor_parallel.md +++ b/docs/source/en/features/3D_tensor_parallel.md @@ -67,7 +67,7 @@ Given $P=q \times q \times q$ processors, we present the theoretical computation ## Usage -To enable 3D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallism setting as below.
+To enable 3D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below. ```python CONFIG = dict(parallel=dict( data=1, diff --git a/docs/source/en/features/gradient_clipping_with_booster.md b/docs/source/en/features/gradient_clipping_with_booster.md index 8686eb06ff54..341a608a5c7b 100644 --- a/docs/source/en/features/gradient_clipping_with_booster.md +++ b/docs/source/en/features/gradient_clipping_with_booster.md @@ -75,7 +75,7 @@ Build your model, optimizer, loss function, lr scheduler and dataloaders. Note t NUM_EPOCHS = 200 BATCH_SIZE = 128 GRADIENT_CLIPPING = 0.1 -# build resnetå +# build resnet model = resnet34(num_classes=10) # build dataloaders train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md index 4374da3c9c45..d940fd5eca14 100644 --- a/docs/source/en/features/nvme_offload.md +++ b/docs/source/en/features/nvme_offload.md @@ -53,7 +53,7 @@ It's compatible with all parallel methods in ColossalAI. > ⚠ It only offloads optimizer states on CPU. This means it only affects CPU training or Zero/Gemini with offloading. -## Exampls +## Examples Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`. diff --git a/docs/source/en/features/pipeline_parallel.md b/docs/source/en/features/pipeline_parallel.md index ac49863b3c71..30654b0b0195 100644 --- a/docs/source/en/features/pipeline_parallel.md +++ b/docs/source/en/features/pipeline_parallel.md @@ -156,4 +156,4 @@ trainer.fit(train_dataloader=train_dataloader, display_progress=True) ``` -We use `2` pipeline stages and the batch will be splitted into `4` micro batches. +We use `2` pipeline stages and the batch will be split into `4` micro batches. diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index a105831a5409..8448c52acf06 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -72,7 +72,7 @@ chunk_manager = init_chunk_manager(model=module, gemini_manager = GeminiManager(placement_policy, chunk_manager) ``` -`hidden_dim` is the hidden dimension of DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. `min_chunk_size_mb` is the the minimum chunk size in MegaByte. If the aggregate size of parameters is still samller than the minimum chunk size, all parameters will be compacted into one small chunk. +`hidden_dim` is the hidden dimension of the DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is OK. We will use a default value 1024. `min_chunk_size_mb` is the minimum chunk size in MegaByte. If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk. Initialization of the optimizer.
```python diff --git a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md index 4825a6fa1d6c..059eb014affd 100644 --- a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md +++ b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md @@ -48,7 +48,7 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管 world_size: int, config: Config, data_parallel_size: int, - pipeline_parlalel_size: int, + pipeline_parallel_size: int, tensor_parallel_size: int, arg1, arg2): diff --git a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index 456878caa147..276fcc2619e0 100644 --- a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -122,7 +122,7 @@ Inside the initialization of Experts, the local expert number of each GPU will b ## Train Your Model -Do not to forget to use `colossalai.initialize` function in `colosalai` to add gradient handler for the engine. +Do not forget to use the `colossalai.initialize` function in `colossalai` to add a gradient handler for the engine. We handle the back-propagation of MoE models for you. In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients. You can find more information about the handler `MoeGradientHandler` in colossal directory. diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index 2bf0a9c98c3f..a52bc6ac7b7f 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -48,7 +48,7 @@ zero = dict(
-ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefuleTensorMgr(STM)。 +ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefulTensorMgr(STM)。 我们利用了深度学习网络训练过程的迭代特性。我们将迭代分为warmup和non-warmup两个阶段,开始时的一个或若干迭代步属于预热阶段,其余的迭代步属于正式阶段。在warmup阶段我们为MSC收集信息,而在non-warmup阶段STM入去MSC收集的信息来移动tensor,以达到最小化CPU-GPU数据移动volume的目的。 @@ -75,7 +75,7 @@ STM管理所有model data tensor的信息。在模型的构造过程中,Coloss 我们在算子的开始和结束计算时,触发内存采样操作,我们称这个时间点为**采样时刻(sampling moment)**,两个采样时刻之间的时间我们称为**period**。计算过程是一个黑盒,由于可能分配临时buffer,内存使用情况很复杂。但是,我们可以较准确的获取period的系统最大内存使用。非模型数据的使用可以通过两个统计时刻之间系统最大内存使用-模型内存使用获得。 -我们如何设计采样时刻呢。我们选择preOp的model data layout adjust之前。如下图所示。我们采样获得上一个period的system memory used,和下一个period的model data memoy used。并行策略会给MSC的工作造成障碍。如图所示,比如对于ZeRO或者Tensor Parallel,由于Op计算前需要gather模型数据,会带来额外的内存需求。因此,我们要求在模型数据变化前进行采样系统内存,这样在一个period内,MSC会把preOp的模型变化内存捕捉。比如在period 2-3内,我们考虑的tensor gather和shard带来的内存变化。 +我们如何设计采样时刻呢。我们选择preOp的model data layout adjust之前。如下图所示。我们采样获得上一个period的system memory used,和下一个period的model data memory used。并行策略会给MSC的工作造成障碍。如图所示,比如对于ZeRO或者Tensor Parallel,由于Op计算前需要gather模型数据,会带来额外的内存需求。因此,我们要求在模型数据变化前进行采样系统内存,这样在一个period内,MSC会把preOp的模型变化内存捕捉。比如在period 2-3内,我们考虑的tensor gather和shard带来的内存变化。 尽管可以将采样时刻放在其他位置,比如排除gather buffer的变动新信息,但是会给造成麻烦。不同并行方式Op的实现有差异,比如对于Linear Op,Tensor Parallel中gather buffer的分配在Op中。而对于ZeRO,gather buffer的分配是在PreOp中。将放在PreOp开始时采样有利于将两种情况统一。 diff --git a/docs/source/zh-Hans/advanced_tutorials/opt_service.md b/docs/source/zh-Hans/advanced_tutorials/opt_service.md index a213584fd41d..1f8324a53ecb 100644 --- a/docs/source/zh-Hans/advanced_tutorials/opt_service.md +++ b/docs/source/zh-Hans/advanced_tutorials/opt_service.md @@ -52,7 +52,7 @@ export CHECKPOINT_DIR="your_opt_checkpoint_path" # the ${CONFIG_DIR} must contain a server.sh file as the entry of service export CONFIG_DIR="config_file_path" -docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:lastest +docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:latest ``` 接下来,您就可以在您的浏览器中打开 `https://[IP-ADDRESS]:8020/docs#` 进行测试。 diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 6dc5eccf4421..e2f2c90a3791 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -477,7 +477,7 @@ def build_cifar(batch_size): return train_dataloader, test_dataloader -# craete dataloaders +# create dataloaders train_dataloader , test_dataloader = build_cifar() # create loss function criterion = CrossEntropyLoss(label_smoothing=0.1) @@ -492,7 +492,7 @@ lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, #### 启动 Colossal-AI 引擎 ```python -# intiailize +# initialize engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, diff --git a/docs/source/zh-Hans/basics/colotensor_concept.md b/docs/source/zh-Hans/basics/colotensor_concept.md index b725e48a7cb1..ab2413e990f7 100644 --- a/docs/source/zh-Hans/basics/colotensor_concept.md +++ b/docs/source/zh-Hans/basics/colotensor_concept.md @@ -53,7 +53,7 @@ ColoTensor 
包含额外的属性[ColoTensorSpec](https://colossalai.readthedocs. ## Example -让我们看一个例子。 使用 tp_degree=4, dp_dgree=2 在 8 个 GPU 上初始化并Shard一个ColoTensor。 然后tensor被沿着 TP 进程组中的最后一个维度进行分片。 最后,我们沿着 TP 进程组中的第一个维度(dim 0)对其进行重新Shard。 我们鼓励用户运行代码并观察每个张量的形状。 +让我们看一个例子。 使用 tp_degree=4, dp_degree=2 在 8 个 GPU 上初始化并Shard一个ColoTensor。 然后tensor被沿着 TP 进程组中的最后一个维度进行分片。 最后,我们沿着 TP 进程组中的第一个维度(dim 0)对其进行重新Shard。 我们鼓励用户运行代码并观察每个张量的形状。 ```python diff --git a/docs/source/zh-Hans/features/mixed_precision_training.md b/docs/source/zh-Hans/features/mixed_precision_training.md index c4df6271b3bb..4628b09cd910 100644 --- a/docs/source/zh-Hans/features/mixed_precision_training.md +++ b/docs/source/zh-Hans/features/mixed_precision_training.md @@ -203,7 +203,7 @@ Naive AMP 的默认参数: - initial_scale(int): gradient scaler 的初始值 - growth_factor(int): loss scale 的增长率 - backoff_factor(float): loss scale 的下降率 -- hysterisis(int): 动态 loss scaling 的延迟偏移 +- hysteresis(int): 动态 loss scaling 的延迟偏移 - max_scale(int): loss scale 的最大允许值 - verbose(bool): 如果被设为`True`,将打印调试信息 diff --git a/docs/source/zh-Hans/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md index fd75ed1f5b3e..db5f10184a99 100644 --- a/docs/source/zh-Hans/features/nvme_offload.md +++ b/docs/source/zh-Hans/features/nvme_offload.md @@ -53,7 +53,7 @@ optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, n > ⚠ 它只会卸载在 CPU 上的优化器状态。这意味着它只会影响 CPU 训练或者使用卸载的 Zero/Gemini。 -## Exampls +## Examples Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`. 首先让我们从两个简单的例子开始 -- 用不同的方法训练 GPT。这些例子依赖`transformers`。 From 767ee4ede1ac57451d64b5ae13c3cee9f265596e Mon Sep 17 00:00:00 2001 From: digger yu Date: Wed, 24 May 2023 14:17:41 +0800 Subject: [PATCH 5/5] change placememt_policy to placement_policy in docs/ and examples/ --- .../parallelize_your_training_like_Megatron.md | 4 ++-- docs/source/en/features/zero_with_chunk.md | 8 ++++---- .../parallelize_your_training_like_Megatron.md | 4 ++-- docs/source/zh-Hans/features/zero_with_chunk.md | 8 ++++---- examples/images/dreambooth/train_dreambooth_colossalai.py | 4 ++-- .../images/dreambooth/train_dreambooth_colossalai_lora.py | 4 ++-- examples/language/palm/train.py | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md index 1a7ab9a65674..22d52fb3cd1a 100644 --- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -175,11 +175,11 @@ In this way, users can train their models as usual. In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overhead and improve efficiency.For the details of this part, please refer to [ZeRO](../features/zero_with_chunk.md). 
You can combine these two parts to understand our entire training process: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) return model diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index 8448c52acf06..d7a99f2fbbfd 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -185,23 +185,23 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): Define a model which uses Gemini + ZeRO DDP: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): cai_version = colossalai.__version__ if version.parse(cai_version) > version.parse("0.1.10"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): from colossalai.gemini import ChunkManager, GeminiManager chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) + gemini_manager = GeminiManager(placement_policy, chunk_manager) chunk_manager = ChunkManager(chunk_size, pg, enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) + init_device=GeminiManager.get_default_device(placement_policy)) model = ZeroDDP(model, gemini_manager) else: raise NotImplemented(f"CAI version {cai_version} is not supported") diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md index f3c6247c38e4..c4131e593437 100644 --- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -159,11 +159,11 @@ for mn, module in model.named_modules(): 在我们最新示例中还定义了一个Gemini + ZeRO DDP 的模型从而减小开销,提升效率。这一部分的详细内容可以参考[ZeRO](../features/zero_with_chunk.md),你可以将这两部分内容结合起来看从而理解我们整个训练流程: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) return model diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 72403bf610a4..ba57ba4e8e61 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -185,23 +185,23 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): 定义一个使用 Gemini + ZeRO DDP 的模型: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, 
placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): cai_version = colossalai.__version__ if version.parse(cai_version) > version.parse("0.1.10"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): from colossalai.gemini import ChunkManager, GeminiManager chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) + gemini_manager = GeminiManager(placement_policy, chunk_manager) chunk_manager = ChunkManager(chunk_size, pg, enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) + init_device=GeminiManager.get_default_device(placement_policy)) model = ZeroDDP(model, gemini_manager) else: raise NotImplemented(f"CAI version {cai_version} is not supported") diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py index e6159e1058b9..d07febea0a84 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -340,12 +340,12 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=64) return model diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py index 1b2fc778d5ed..6715b473a567 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py @@ -342,12 +342,12 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=64) return model diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py index 7923e4fc855d..b16da1c7744a 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -102,23 +102,23 @@ def get_model_size(model: nn.Module): # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): cai_version = colossalai.__version__ if version.parse(cai_version) > version.parse("0.1.10"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) elif 
version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): from colossalai.gemini import ChunkManager, GeminiManager chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) + gemini_manager = GeminiManager(placement_policy, chunk_manager) chunk_manager = ChunkManager(chunk_size, pg, enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) + init_device=GeminiManager.get_default_device(placement_policy)) model = ZeroDDP(model, gemini_manager) else: raise NotImplemented(f"CAI version {cai_version} is not supported")
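After the rename, every call site passes the keyword as `placement_policy`. A hedged usage sketch, assuming the `gemini_zero_dpp` helper from the hunks above and a distributed environment already initialized via `colossalai.launch`:

```python
# Usage sketch only: `model` is assumed to be a plain torch.nn.Module and `pg`
# the ProcessGroup used elsewhere in these examples; both must already exist.
# "auto" (the default shown above) lets Gemini move tensors between CPU and GPU
# dynamically; "cpu" and "cuda" are other commonly used policy values.
model = gemini_zero_dpp(model, pg, placement_policy="auto")
```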