From 39f1da31dee9ccc7b962f0f0a7e2adafbd0948bb Mon Sep 17 00:00:00 2001 From: digger yu Date: Wed, 17 May 2023 13:56:54 +0800 Subject: [PATCH 1/2] fix typo colossalai/autochunk auto_parallel amp --- colossalai/amp/torch_amp/_grad_scaler.py | 2 +- .../auto_parallel/meta_profiler/meta_registry/linear.py | 2 +- colossalai/auto_parallel/passes/runtime_apply_pass.py | 2 +- .../auto_parallel/passes/runtime_preparation_pass.py | 4 ++-- colossalai/autochunk/trace_flow.py | 6 +++--- colossalai/autochunk/trace_indice.py | 8 ++++---- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/colossalai/amp/torch_amp/_grad_scaler.py b/colossalai/amp/torch_amp/_grad_scaler.py index 7b78998fb8c2..ed4b8e484436 100644 --- a/colossalai/amp/torch_amp/_grad_scaler.py +++ b/colossalai/amp/torch_amp/_grad_scaler.py @@ -240,7 +240,7 @@ def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): for grads in per_dtype_grads.values(): torch._amp_foreach_non_finite_check_and_unscale_(grads, per_device_found_inf.get(device), per_device_inv_scale.get(device)) - # For tensor parallel paramters it should be all-reduced over tensor parallel process group + # For tensor parallel parameters it should be all-reduced over tensor parallel process group if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1: vals = [val for val in per_device_found_inf._per_device_tensors.values()] coalesced = _flatten_dense_tensors(vals) diff --git a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py index 7697fc6c383d..94dd9143e0ae 100644 --- a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py +++ b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py @@ -325,7 +325,7 @@ def matmul_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L else: _is_batch_dims_same = False - # retireve dimensions + # retrieve dimensions input_dim_00 = 
input_tensors[0].shape[-2] input_dim_01 = input_tensors[0].shape[-1] input_dim_10 = input_tensors[1].shape[-2] diff --git a/colossalai/auto_parallel/passes/runtime_apply_pass.py b/colossalai/auto_parallel/passes/runtime_apply_pass.py index a473bb6e973d..2049a06187d2 100644 --- a/colossalai/auto_parallel/passes/runtime_apply_pass.py +++ b/colossalai/auto_parallel/passes/runtime_apply_pass.py @@ -219,7 +219,7 @@ def _comm_spec_apply(gm: torch.fx.GraphModule): return gm -def _act_annotataion_pass(gm: torch.fx.GraphModule): +def _act_annotation_pass(gm: torch.fx.GraphModule): """ This pass is used to add the act annotation to the new inserted nodes. """ diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py index 177f3765f5a0..9a2314826448 100644 --- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py +++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py @@ -54,7 +54,7 @@ def size_processing(size: Union[int, torch.Size], return size -def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], +def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], strategies_constructor: StrategiesConstructor): """ This method is used to stick the solution strategy to the nodes and add the information @@ -496,7 +496,7 @@ def runtime_preparation_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh, strategies_constructor: StrategiesConstructor, overlap=False): - gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotatation_pass( + gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotation_pass( gm, solution, strategies_constructor) gm = size_value_converting_pass(gm, device_mesh) gm = node_args_converting_pass(gm, device_mesh) diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py index db25267e9b42..11a7e62ff37c 100644 --- 
a/colossalai/autochunk/trace_flow.py +++ b/colossalai/autochunk/trace_flow.py @@ -64,7 +64,7 @@ def check_index_compute(self, start_idx, end_dim, end_node, end_idx): return False return True - def _assgin_single_node_flow( + def _assign_single_node_flow( self, arg_node: Node, start_idx: int, @@ -177,7 +177,7 @@ def _get_all_node_info(self, end_dim, start_idx, end_idx): if get_node_shape(arg) is None: continue arg_list.append(arg) - flow_flag = self._assgin_single_node_flow( + flow_flag = self._assign_single_node_flow( arg, start_idx, end_idx, @@ -315,7 +315,7 @@ def _get_prepose_nodes(self, all_node_info: Dict, start_idx: int, end_idx: int, chunk_info["args"]["prepose_nodes"] = prepose_nodes def _get_non_chunk_inputs(self, chunk_info, start_idx, end_idx): - # we need to log input nodes to avoid deleteing them in the loop + # we need to log input nodes to avoid deleting them in the loop chunk_node_list = self.node_mgr.get_node_slice_by_idx(start_idx, end_idx + 1) # also need to get some prepose node's arg out of non_chunk_inputs for n in chunk_info["args"]["prepose_nodes"]: diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py index d56bf843f18d..8e6cd3e29bea 100644 --- a/colossalai/autochunk/trace_indice.py +++ b/colossalai/autochunk/trace_indice.py @@ -461,7 +461,7 @@ def _assign_elementwise_indice(self, node, idx): nodes_in.append(node_in) self._inherit_more_indice_from_node_with_exclude(node_in, node) - def _assgin_no_change_indice(self, node, idx): + def _assign_no_change_indice(self, node, idx): self._assign_indice_as_input(node, idx) for node_in in node.args: if type(node_in) == type(node): @@ -792,7 +792,7 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None: self._add_dim(node_idx, i) dim_from.reverse() - # inheirt indice from current node + # inherit indice from current node if len(dim_from) != 0 and len(dim_to) != 0: if dim_diff == 1: if origin_shape[dim_from[0]] == 1: @@ -852,7 +852,7 @@ def 
trace_indice(self) -> None: elif "split" == node_name: self._assign_split_indice(node, idx) elif any(i == node_name for i in ["to", "contiguous", "clone", "type", "float"]): - self._assgin_no_change_indice(node, idx) + self._assign_no_change_indice(node, idx) elif "new_ones" == node_name: self._assign_all_indice(node, idx) elif "flatten" == node_name: @@ -914,7 +914,7 @@ def trace_indice(self) -> None: elif "conv2d" == node_name: self._assign_conv2d_indice(node, idx) elif "identity" == node_name: - self._assgin_no_change_indice(node, idx) + self._assign_no_change_indice(node, idx) elif any(n == node_name for n in ["sigmoid", "dropout", "relu", "silu", "gelu"]): self._assign_elementwise_indice(node, idx) else: From dce5d3d469b34da1dbd26f949c1eecce3e99dfc2 Mon Sep 17 00:00:00 2001 From: digger yu Date: Fri, 19 May 2023 14:18:14 +0800 Subject: [PATCH 2/2] fix typo colossalai/auto_parallel nn utils etc. --- applications/Chat/coati/dataset/reward_dataset.py | 2 +- .../tensor_shard/node_handler/embedding_handler.py | 4 ++-- .../tensor_shard/node_handler/linear_handler.py | 6 +++--- .../tensor_shard/node_handler/matmul_handler.py | 10 +++++----- .../tensor_shard/node_handler/node_handler.py | 2 +- .../auto_parallel/tensor_shard/utils/factory.py | 2 +- .../auto_parallel/tensor_shard/utils/reshape.py | 12 ++++++------ colossalai/nn/optimizer/cpu_adam.py | 2 +- colossalai/nn/optimizer/hybrid_adam.py | 8 ++++---- .../nn/parallel/layers/cache_embedding/cache_mgr.py | 6 +++--- colossalai/utils/common.py | 2 +- colossalai/utils/tensor_detector/readme.md | 12 ++++++------ colossalai/utils/tensor_detector/tensor_detector.py | 8 ++++---- colossalai/zero/gemini/chunk/manager.py | 12 ++++++------ colossalai/zero/gemini/chunk/search_utils.py | 2 +- colossalai/zero/gemini/memory_tracer/memory_stats.py | 2 +- 16 files changed, 46 insertions(+), 46 deletions(-) diff --git a/applications/Chat/coati/dataset/reward_dataset.py b/applications/Chat/coati/dataset/reward_dataset.py index 
faa1c94d2728..5dacf7e81464 100644 --- a/applications/Chat/coati/dataset/reward_dataset.py +++ b/applications/Chat/coati/dataset/reward_dataset.py @@ -6,7 +6,7 @@ from .utils import is_rank_0 -# Dahaos/rm-static +# Dahoas/rm-static class RmStaticDataset(Dataset): """ Dataset for reward model diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py index e154105b672d..112ee194b4ec 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/embedding_handler.py @@ -155,7 +155,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li Convert the sharding spec from the logical shape to the physical shape. """ # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy, input_name=str( @@ -221,7 +221,7 @@ def post_process(self, strategy: ShardingStrategy): Convert the sharding spec from the logical shape to the physical shape. 
""" # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy, input_name=str( diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py index 59091dab519f..ea541e434009 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/linear_handler.py @@ -23,7 +23,7 @@ def _update_sharding_spec_for_transposed_weight_for_linear(strategy: ShardingStr weight_name: str) -> ShardingStrategy: """ This function is a helper function used by both module node handler and function node handler. This function will - convert the sharding spec for the transposed weight to the correct partititon spec. + convert the sharding spec for the transposed weight to the correct partition spec. Args: strategy (ShardingStrategy): the strategy generated by the strategy generator. 
@@ -197,7 +197,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name='weight') # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at dim 0 to one of the first few dimensions of the input strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy, input_name=str(self.node.args[0]), @@ -267,7 +267,7 @@ def post_process(self, strategy: ShardingStrategy): strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name=str(self.node.args[1])) # create multiple sharding strategies for the inputs - # as input can be multi-dimensinal and the partition dim is only 2D, + # as input can be multi-dimensional and the partition dim is only 2D, # we need to map the partition at dim 0 to one of the first few dimensions of the input strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy, input_name=str(self.node.args[0]), diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py index f3c9d0cbf826..bfebc3f59d0c 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py @@ -48,8 +48,8 @@ def get_matmul_type(input_dim: int, other_dim: int): Determine which type of matmul operation should be executed for the given tensor dimensions. 
Args: - input_dim (int): the number of dimensions for the input tenosr - other_dim (int): the number of dimensions for the other tenosr + input_dim (int): the number of dimensions for the input tensor + other_dim (int): the number of dimensions for the other tensor """ if input_dim == 1 and other_dim == 1: matmul_type = MatMulType.DOT @@ -268,13 +268,13 @@ def _update_sharding_spec(key, strategy, physical_batch_dim): dim_partition_dict = sharding_spec.dim_partition_dict entire_shape = sharding_spec.entire_shape - # upddate the dimension index for the matrix dimensions + # update the dimension index for the matrix dimensions if 2 in dim_partition_dict: dim_partition_dict[len(self.batch_dims_before_view) + 1] = dim_partition_dict.pop(2) if 1 in dim_partition_dict: dim_partition_dict[len(self.batch_dims_before_view)] = dim_partition_dict.pop(1) - # map the logical batch dim to phyiscal batch dim + # map the logical batch dim to physical batch dim if 0 in dim_partition_dict: batch_dim_shard = dim_partition_dict.pop(0) dim_partition_dict[physical_batch_dim] = batch_dim_shard @@ -414,7 +414,7 @@ def _get_logical_shape_for_dot(self): def _get_logical_shape_for_mm(self): """ - We need to handle the input tensor for a matrix-matrix multiplcation as the input + We need to handle the input tensor for a matrix-matrix multiplication as the input tensor can be a 1D or 2D tensor. If it is a 1D tensor, 1 will be prepended to its shape (e.g. [4] -> [1, 4]). 
""" diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py index d3d09a9dcf65..4262d76173e4 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py @@ -212,7 +212,7 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV return self.strategies_vector def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, List[ShardingStrategy]]: - # tranform the strategy generated + # transform the strategy generated # e.g. to process the sharding strategy for the transposed weights return strategy diff --git a/colossalai/auto_parallel/tensor_shard/utils/factory.py b/colossalai/auto_parallel/tensor_shard/utils/factory.py index 05331e560001..347c10aa102d 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/factory.py +++ b/colossalai/auto_parallel/tensor_shard/utils/factory.py @@ -30,7 +30,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic """ if isinstance(input_, Node): - assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data' + assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data' meta_tensor = input_._meta_data assert meta_tensor is not None, "The given node's _meta_data attribute is None" shape = meta_tensor.shape diff --git a/colossalai/auto_parallel/tensor_shard/utils/reshape.py b/colossalai/auto_parallel/tensor_shard/utils/reshape.py index a32a14bf7d57..d0ebbd7e8b1b 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/reshape.py +++ b/colossalai/auto_parallel/tensor_shard/utils/reshape.py @@ -6,12 +6,12 @@ class PreviousStatus(Enum): """ - This class shows the status of previous comparision. + This class shows the status of previous comparison. 
""" RESET = 0 - # ORIGIN means the dimension size of original tensor is larger in the previous comparision. + # ORIGIN means the dimension size of original tensor is larger in the previous comparison. ORIGIN = 1 - # TGT means the dimension size of target tensor is larger in the previous comparision. + # TGT means the dimension size of target tensor is larger in the previous comparison. TGT = 2 @@ -91,7 +91,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D tgt_index += 1 if previous_label == PreviousStatus.TGT: - # if the target dimension size is larger in the previous comparision, which means + # if the target dimension size is larger in the previous comparison, which means # the origin dimension size has already accumulated larger than target dimension size, so # we need to offload the origin dims and tgt dims into the reshape_mapping_dict. reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims) @@ -111,7 +111,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D origin_index += 1 if previous_label == PreviousStatus.ORIGIN: - # if the origin element is larger in the previous comparision, which means + # if the origin element is larger in the previous comparison, which means # the target element has already accumulated larger than origin element, so # we need to offload the origin dims and tgt dims into the reshape_mapping_dict. reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims) @@ -139,7 +139,7 @@ def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]], Rule: For a sharded dimension of input tensor, if it is not the minimum element of the input tuple, the function will return false. - To illustrate this issue, there are two cases to analyse: + To illustrate this issue, there are two cases to analyze: 1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal operation without distributed tensor. 2. 
sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 54036973e1e3..bb561a106515 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -13,7 +13,7 @@ class CPUAdam(NVMeOptimizer): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of paramters. + Supports parameters updating on both GPU and CPU, depending on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py index 1d0fb92de499..be6311c6c29f 100644 --- a/colossalai/nn/optimizer/hybrid_adam.py +++ b/colossalai/nn/optimizer/hybrid_adam.py @@ -13,19 +13,19 @@ class HybridAdam(NVMeOptimizer): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of paramters. + Supports parameters updating on both GPU and CPU, depending on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. * Parameters on GPU and gradients on CPU is **not** allowed. - `HybriadAdam` requires CUDA extensions which can be built during installation or runtime. + `HybridAdam` requires CUDA extensions which can be built during installation or runtime. This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam. * For parameters updating on CPU, it uses CPUAdam. * For parameters updating on GPU, it uses FusedAdam. - * Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.
+ * Hybrid precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients. :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``, or ``torch.optim.Adam`` with ``adamw_mode=False`` @@ -131,7 +131,7 @@ def step(self, closure=None, div_scale: float = -1): assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda" assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda" - # record the state by gruop and update at once + # record the state by group and update at once g_l.append(p.grad.data) p_l.append(p.data) m_l.append(state['exp_avg']) diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py index da043df368ae..a6159856dcce 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py +++ b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py @@ -20,8 +20,8 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None: return torch.cuda.current_stream().wait_stream(stream) # As mentioned in https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html, - # PyTorch uses the "caching allocator" for memroy allocation for tensors. When a tensor is - # freed, its memory is likely to be reused by newly constructed tenosrs. By default, + # PyTorch uses the "caching allocator" for memory allocation for tensors. When a tensor is + # freed, its memory is likely to be reused by newly constructed tensors. By default, # this allocator traces whether a tensor is still in use by only the CUDA stream where it # was created. When a tensor is used by additional CUDA streams, we need to call record_stream # to tell the allocator about all these streams. 
Otherwise, the allocator might free the @@ -294,7 +294,7 @@ def print_comm_stats(self): print( f"CPU->CUDA BWD {self._cpu_to_cuda_numel * self.elem_size_in_byte / 1e6 / elapsed} MB/s {self._cpu_to_cuda_numel / 1e6} M elem" ) - print(f'cpu_to_cuda_elpase {elapsed} sec') + print(f'cpu_to_cuda_elapse {elapsed} sec') for k, v in self._elapsed_dict.items(): print(f'{k}: {v}') diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 95b3b8014af1..8022e84dc24b 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -324,7 +324,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): norm_type = float(norm_type) # Parameters can be on CPU or CUDA - # If parameters are on CPU, disable CUDA kernerls + # If parameters are on CPU, disable CUDA kernels # Calculate norm. if norm_type == inf: diff --git a/colossalai/utils/tensor_detector/readme.md b/colossalai/utils/tensor_detector/readme.md index 840dc8f4eca6..d6852ea55b54 100644 --- a/colossalai/utils/tensor_detector/readme.md +++ b/colossalai/utils/tensor_detector/readme.md @@ -46,7 +46,7 @@ detector.detect() I have made some comments on the right of the output for your understanding. -Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memery Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly. +Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memory Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly. 
**The order of print is not equal to the order the tensor creates, but they are really close.** @@ -61,7 +61,7 @@ Note that the total `Mem` of all the tensors and parameters is not equal to `Tot + mlp.2.bias cuda:0 (32,) True torch.float32 128 B ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 27 -Totle GPU Memery Allocated on cuda:0 is 4.5 KB +Total GPU Memory Allocated on cuda:0 is 4.5 KB ------------------------------------------------------------------------------------------------------------ @@ -72,7 +72,7 @@ Totle GPU Memery Allocated on cuda:0 is 4.5 KB + Tensor cuda:0 (32,) True torch.float32 128 B # output ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 30 -Totle GPU Memery Allocated on cuda:0 is 5.5 KB +Total GPU Memory Allocated on cuda:0 is 5.5 KB ------------------------------------------------------------------------------------------------------------ @@ -82,7 +82,7 @@ Totle GPU Memery Allocated on cuda:0 is 5.5 KB + Tensor cuda:0 () True torch.float32 4 B # loss ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 32 -Totle GPU Memery Allocated on cuda:0 is 6.0 KB +Total GPU Memory Allocated on cuda:0 is 6.0 KB ------------------------------------------------------------------------------------------------------------ @@ -103,7 +103,7 @@ Totle GPU Memery Allocated on cuda:0 is 6.0 KB - Tensor cuda:0 (8,) True torch.float32 32 B # deleted activation ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 34 -Totle GPU Memery Allocated on cuda:0 is 10.0 KB +Total GPU Memory Allocated on cuda:0 is 10.0 KB 
------------------------------------------------------------------------------------------------------------ @@ -117,7 +117,7 @@ Totle GPU Memery Allocated on cuda:0 is 10.0 KB + Tensor cuda:0 (32,) False torch.float32 128 B ------------------------------------------------------------------------------------------------------------ Detect Location: "test_tensor_detector.py" line 36 -Totle GPU Memery Allocated on cuda:0 is 14.0 KB +Total GPU Memory Allocated on cuda:0 is 14.0 KB ------------------------------------------------------------------------------------------------------------ ``` diff --git a/colossalai/utils/tensor_detector/tensor_detector.py b/colossalai/utils/tensor_detector/tensor_detector.py index a8186f76834c..cfcd4e47b4cb 100644 --- a/colossalai/utils/tensor_detector/tensor_detector.py +++ b/colossalai/utils/tensor_detector/tensor_detector.py @@ -55,7 +55,7 @@ def get_tensor_mem(self, tensor): return self.mem_format(memory_size) def mem_format(self, real_memory_size): - # format the tensor memory into a reasonal magnitude + # format the tensor memory into a reasonable magnitude if real_memory_size >= 2**30: return str(real_memory_size / (2**30)) + ' GB' if real_memory_size >= 2**20: @@ -71,7 +71,7 @@ def collect_tensors_state(self): if (not self.include_cpu) and obj.device == torch.device('cpu'): continue self.detected.append(id(obj)) - # skip paramters we had added in __init__ when module is an instance of nn.Module for the first epoch + # skip parameters we had added in __init__ when module is an instance of nn.Module for the first epoch if id(obj) not in self.tensor_info: name = type(obj).__name__ @@ -84,7 +84,7 @@ def collect_tensors_state(self): name = par_name + ' (with grad)' else: # with no grad attached - # there will be no new paramters created during running + # there will be no new parameters created during running # so it must be in saved_tensor_info continue # we can also marked common tensors as tensor(with grad) @@ -155,7 +155,7 @@ 
def print_tensors_state(self): if device == torch.device('cpu'): continue gpu_mem_alloc = self.mem_format(torch.cuda.memory_allocated(device)) - self.info += f"Totle GPU Memery Allocated on {device} is {gpu_mem_alloc}\n" + self.info += f"Total GPU Memory Allocated on {device} is {gpu_mem_alloc}\n" self.info += LINE self.info += '\n\n' if self.show_info: diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py index d85df0b00476..77368d06d255 100644 --- a/colossalai/zero/gemini/chunk/manager.py +++ b/colossalai/zero/gemini/chunk/manager.py @@ -102,7 +102,7 @@ def access_chunk(self, chunk: Chunk) -> None: """ if chunk in self.accessed_chunks: return - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) if chunk.device_type == 'cpu': chunk.shard_move(get_current_device()) self.__add_accessed_chunk(chunk) @@ -114,7 +114,7 @@ def release_chunk(self, chunk: Chunk) -> None: if chunk not in self.accessed_chunks: return if chunk.can_release: - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) self.__sub_accessed_chunk(chunk) self.__add_memory_usage(chunk.memory_usage) @@ -123,7 +123,7 @@ def move_chunk(self, chunk: Chunk, device: torch.device, force_copy: bool = Fals """ if not chunk.can_move or chunk.device_type == device.type: return - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) chunk.shard_move(device, force_copy) self.__add_memory_usage(chunk.memory_usage) @@ -138,7 +138,7 @@ def reduce_chunk(self, chunk: Chunk) -> bool: """ if not chunk.can_reduce: return False - self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) chunk.reduce() self.__sub_accessed_chunk(chunk) self.__add_memory_usage(chunk.memory_usage) @@ -228,11 +228,11 @@ def __get_chunk_group(self, group_name: str) -> Deque: return self.chunk_groups[group_name] def __close_one_chunk(self, chunk: Chunk): - 
self.__sub_memroy_usage(chunk.memory_usage) + self.__sub_memory_usage(chunk.memory_usage) chunk.close_chunk() self.__add_memory_usage(chunk.memory_usage) - def __sub_memroy_usage(self, usage: Dict[str, int]): + def __sub_memory_usage(self, usage: Dict[str, int]): for k, v in usage.items(): self.total_mem[k] -= v diff --git a/colossalai/zero/gemini/chunk/search_utils.py b/colossalai/zero/gemini/chunk/search_utils.py index da58e038c879..881ceb0b3b97 100644 --- a/colossalai/zero/gemini/chunk/search_utils.py +++ b/colossalai/zero/gemini/chunk/search_utils.py @@ -85,7 +85,7 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator, Classify the parameters by their dp degree Args: - param_order (OrderedParamGenerator): the order of param be visied + param_order (OrderedParamGenerator): the order of param be visited strict_ddp_flag (bool, optional): whether to enable the strict ddp mode. Defaults to False. Returns: diff --git a/colossalai/zero/gemini/memory_tracer/memory_stats.py b/colossalai/zero/gemini/memory_tracer/memory_stats.py index 9a45034ee27e..41d7e5754e96 100644 --- a/colossalai/zero/gemini/memory_tracer/memory_stats.py +++ b/colossalai/zero/gemini/memory_tracer/memory_stats.py @@ -59,7 +59,7 @@ def increase_preop_step(self, param_list: List[torch.nn.Parameter]): time step. Args: - param_list (List[torch.nn.Parameter]): a list of torch paramters. + param_list (List[torch.nn.Parameter]): a list of torch parameters. """ for p in param_list: if p not in self._param_step_dict: