From f72e7879d8bde3596e27e7defc8f208b4c83986c Mon Sep 17 00:00:00 2001 From: digger-yu Date: Fri, 21 Apr 2023 09:11:53 +0800 Subject: [PATCH 01/17] Fixed several spelling errors under colossalai --- README.md | 2 +- colossalai/_analyzer/fx/codegen.py | 2 +- colossalai/auto_parallel/offload/region.py | 2 +- .../auto_parallel/offload/training_simulator.py | 2 +- .../auto_parallel/passes/runtime_preparation_pass.py | 6 +++--- colossalai/autochunk/autochunk_codegen.py | 2 +- colossalai/autochunk/estimate_memory.py | 2 +- colossalai/autochunk/reorder_graph.py | 2 +- colossalai/autochunk/search_chunk.py | 6 +++--- colossalai/autochunk/trace_flow.py | 2 +- colossalai/autochunk/trace_indice.py | 12 ++++++------ colossalai/booster/booster.py | 2 +- colossalai/checkpoint_io/checkpoint_io_base.py | 8 ++++---- colossalai/cli/check/check_installation.py | 4 ++-- colossalai/communication/p2p.py | 10 +++++----- colossalai/communication/p2p_v2.py | 2 +- colossalai/context/moe_context.py | 2 +- colossalai/context/parallel_context.py | 4 ++-- colossalai/context/random/seed_manager.py | 8 ++++---- .../fx/codegen/activation_checkpoint_codegen.py | 2 +- colossalai/fx/passes/split_module.py | 4 ++-- 21 files changed, 43 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 79f733122cb3..e19e716a28e2 100644 --- a/README.md +++ b/README.md @@ -399,7 +399,7 @@ You may contact us or participate in the following ways: Thanks so much to all of our amazing contributors! 
- + diff --git a/colossalai/_analyzer/fx/codegen.py b/colossalai/_analyzer/fx/codegen.py index b768e59004b1..41d74f2e3719 100644 --- a/colossalai/_analyzer/fx/codegen.py +++ b/colossalai/_analyzer/fx/codegen.py @@ -138,7 +138,7 @@ def emit_ckpt_func(body, delete_unused_value_func, ckpt_level=0, in_ckpt=False): - """Emit ckpt fuction in nested way + """Emit ckpt function in nested way Args: body: forward code - in recursive calls, this part will be checkpoint diff --git a/colossalai/auto_parallel/offload/region.py b/colossalai/auto_parallel/offload/region.py index 9a2f558c3145..819ffbd96eb1 100644 --- a/colossalai/auto_parallel/offload/region.py +++ b/colossalai/auto_parallel/offload/region.py @@ -111,7 +111,7 @@ def copy_grad_to_region_slice(self, param: torch.nn.Parameter, data_slice: torch Copy data slice to the memory space indexed by the input tensor in the region. Args: - param (torch.nn.Parameter): the param used to retrive meta information + param (torch.nn.Parameter): the param used to retrieve meta information data_slice (torch.Tensor): the tensor to be copied to the region """ diff --git a/colossalai/auto_parallel/offload/training_simulator.py b/colossalai/auto_parallel/offload/training_simulator.py index f277c183a912..de58023ec2d6 100644 --- a/colossalai/auto_parallel/offload/training_simulator.py +++ b/colossalai/auto_parallel/offload/training_simulator.py @@ -22,7 +22,7 @@ class TrainingSimulator(ABC): Args: region_list (List[Region]): represents the linearized DNN computing graph. - comp_power (float): the NVIDIA GPU FP16 compuing power. + comp_power (float): the NVIDIA GPU FP16 computing power. link_to_bw (Dict[str, Dict[float, float]]): communication links and the corresponding bandwidth. 
""" diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py index e1d0c627274e..fccc59cd7579 100644 --- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py +++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py @@ -149,7 +149,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh def _extract_target_dim(node): ''' - A helper function to etract the target dimension from size node. + A helper function to extract the target dimension from size node. There are two usages of torch.Tensor.size: 1. tensor.size() 2. tensor.size(dim) @@ -427,9 +427,9 @@ def _shard_param(param, target_sharding_spec): if target_sharding_spec.dim_partition_dict != {}: origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {}) setattr(param, 'sharding_spec', origin_sharding_spec) - # TODO: build a ColoParamter class to manager the distributed parameters + # TODO: build a ColoParameter class to manage the distributed parameters # we could use .data here, because all the operations just happen before the real training - # loop, so we don't need to track these operations in the autograd graph. + # loop, so we don't need to track these operations in the autograd graph. 
param = torch.nn.Parameter( shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec, target_sharding_spec).detach().clone()) diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index 2cbc6c9221aa..d0a467254d72 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -287,7 +287,7 @@ def emit_code_with_chunk(body: List[str], body = _replace_new_tensor_like_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body) # new tensor body = _replace_new_tensor_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body) - # reassgin reshape size + # reassign reshape size body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"]) body[-1] = " " + body[-1] delete_unused_value_func(node, body, chunk_inputs_names) diff --git a/colossalai/autochunk/estimate_memory.py b/colossalai/autochunk/estimate_memory.py index 08a55f9aa04a..77bc2ef17bc3 100644 --- a/colossalai/autochunk/estimate_memory.py +++ b/colossalai/autochunk/estimate_memory.py @@ -153,7 +153,7 @@ def estimate_chunk_inference_mem(self, node_list: List, chunk_infos: Dict = None Returns: act_memory_peak_log (List): peak memory of every node - act_memory_after_node_log (List): memory after excuting every node + act_memory_after_node_log (List): memory after executing every node active_node_list_log (List): active nodes of every node. active nodes refer to nodes generated but not deleted. 
""" diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py index 3b00d47fb955..dd5fc7499aa2 100644 --- a/colossalai/autochunk/reorder_graph.py +++ b/colossalai/autochunk/reorder_graph.py @@ -4,7 +4,7 @@ class ReorderGraph(object): """ - Reorder node list and indice trace list + Reorder node list and indices trace list """ def __init__(self, trace_indice: TraceIndice, node_mgr: NodeMgr) -> None: diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py index 326445ee9f12..3b572924f8e2 100644 --- a/colossalai/autochunk/search_chunk.py +++ b/colossalai/autochunk/search_chunk.py @@ -16,7 +16,7 @@ class SearchChunk(object): This is the core class for AutoChunk. It defines the framework of the strategy of AutoChunk. - Chunks will be selected one by one utill search stops. + Chunks will be selected one by one until search stops. The chunk search is as follows: 1. find the peak memory node @@ -73,7 +73,7 @@ def _init_trace(self) -> None: def _find_peak_region(self, mem_peak: List) -> int: """ - find peak node, along with its neighbour nodes exceeds max mem + find peak node, along with its neighbor nodes exceeds max mem """ max_value = max(mem_peak) max_idx = mem_peak.index(max_value) @@ -118,7 +118,7 @@ def _search_max_chunk_region(self, active_node: List, peak_region: int, chunk_re chunk_region_start (int) chunk_region_end (int) """ - # check if peak node already in chunkinfo + # check if peak node already in chunk info if chunk_regions is not None: for i in chunk_regions: if i["region"][0] < peak_region[0] <= i["region"][1] or \ diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py index 16815215f52b..db25267e9b42 100644 --- a/colossalai/autochunk/trace_flow.py +++ b/colossalai/autochunk/trace_flow.py @@ -479,7 +479,7 @@ def check_region_start_end(self, start_node: Node, start_dim: int, start_idx: in # check index source align if not self.check_index_source(start_dim, 
start_node, start_idx, end_dim, end_node): return False - # check index copmute + # check index compute if not self.check_index_compute(start_idx, end_dim, end_node, end_idx): return False return True diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py index 307f4de326d7..c7fce4c8bee1 100644 --- a/colossalai/autochunk/trace_indice.py +++ b/colossalai/autochunk/trace_indice.py @@ -8,7 +8,7 @@ class TraceIndice(object): """ - Trace all indice infomation for every node. + Trace all indice information for every node. Indice is a logical concept. Equal dims can been treated as one indice. eg. dim(x1) = [a, b, c] @@ -153,7 +153,7 @@ def _inherit_all_indice(self, node_from: Node, node_to: Node) -> None: def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None: """ - inheirt indice from node without init + inherit indice from node without init """ if exclude == None: exclude = [] @@ -301,7 +301,7 @@ def _assign_permute_indice(self, node: Node, node_idx: int) -> None: def _assign_linear_indice(self, node: Node, node_idx: int) -> None: """ Assign indice for linear op. - 1. copy trace from input node and change last indice accroding to weight + 1. copy trace from input node and change last indice according to weight 2. mark equal for input node last indice, weight first dim and bias dim. 3. inherit input's computation, mark computation for last dim. @@ -360,7 +360,7 @@ def _assign_baddbmm_indice(self, node: Node, node_idx: int) -> None: def _assign_matmul_indice(self, node: Node, node_idx: int) -> None: """ Assign indice for matmul op. - 1. copy trace from matmul_left and change last indice accroding to matmul_right. (assert they have same length) + 1. copy trace from matmul_left and change last indice according to matmul_right. (assert they have same length) 2. mark equal for input matmul_left -1 indice and matmul_right -2 dim. 3. 
inherit matmul_left and matmul_right computation, mark computation for last dim. @@ -720,11 +720,11 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None: Assign indice for view and reshape op. 1. get origin shape and target shape by meta info. 2. compute the real value of -1 in target shape. - 3. determine changed dim, and assgin indice for generated dim. + 3. determine changed dim, and assign indice for generated dim. 4. log changed dim and generated dim for restore 5. inherit computation. 6. look into view list to see whether the view is associated with other, - if so assgin equal dim according to previous view. + if so assign equal dim according to previous view. Args: node (node) diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py index 1ad9f7f20ec1..c14e602deaf5 100644 --- a/colossalai/booster/booster.py +++ b/colossalai/booster/booster.py @@ -20,7 +20,7 @@ class Booster: """ Booster is a high-level API for training neural networks. It provides a unified interface for - training with different precisio, accelerator, and plugin. + training with different precision, accelerator, and plugin. Examples: >>> colossalai.launch(...) diff --git a/colossalai/checkpoint_io/checkpoint_io_base.py b/colossalai/checkpoint_io/checkpoint_io_base.py index 3f8b0b0a6b47..cb853559c48c 100644 --- a/colossalai/checkpoint_io/checkpoint_io_base.py +++ b/colossalai/checkpoint_io/checkpoint_io_base.py @@ -71,7 +71,7 @@ def load_model(self, Args: model (nn.Module): model to be loaded. - checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the + checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the mainstream model zoos such as Hugging Face and TIMM. The checkpoint path can be: 1. a file path, e.g. 'model.pt' 2. a path to a json file which defines the index to the sharded checkpoint @@ -127,7 +127,7 @@ def save_model(self, 1. a file path, e.g. 
'model.pt' 2. a directory path to save the sharded checkpoint, e.g. './checkpoints/' when shard = True. shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into - multiple files. The model shards will be specificed by a `model.index.json` file. When shard = True, please ensure + multiple files. The model shards will be specified by a `model.index.json` file. When shard = True, please ensure that the checkpoint path is a directory path instead of a file path. gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True. variant (str): If specified, weights are saved in the format pytorch_model..bin. Default: None. @@ -149,7 +149,7 @@ def load_optimizer(self, optimizer: Optimizer, checkpoint: str): Args: optimizer (Optimizer): optimizer to be loaded. - checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the + checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the """ index_file_exists, index_file_path = has_index_file(checkpoint) @@ -180,7 +180,7 @@ def save_optimizer(self, 2. a path to a json file which defines the index to the sharded checkpoint for the optimizer 3. a path to a folder containing a unique .index.json file for sharded checkpoint shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into - multiple files. The optimizer shards will be specificed by a `optimizer.index.json` file. + multiple files. The optimizer shards will be specified by a `optimizer.index.json` file. gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True. prefix (str): prefix for the optimizer checkpoint when shard = True. Default: None. size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard is set to True. 
diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py index 44d7840700ef..3c4081d8957c 100644 --- a/colossalai/cli/check/check_installation.py +++ b/colossalai/cli/check/check_installation.py @@ -76,7 +76,7 @@ def check_installation(): click.echo("") click.echo(f"Note:") click.echo( - f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment varialbe CUDA_EXT=1 is set" + f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set" ) click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime") @@ -88,7 +88,7 @@ def check_installation(): click.echo(f"Note:") click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment") click.echo( - f" - PyTorch version mistach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation" + f" - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation" ) click.echo( f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch" diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py index 6dd4d0d6608d..782f3461990c 100644 --- a/colossalai/communication/p2p.py +++ b/colossalai/communication/p2p.py @@ -91,7 +91,7 @@ def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = Non dtype: torch.dtype = None, scatter_gather_tensors: bool = False) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]: """ - Adapted from megatron.p2p_communication. + Adapted from megatron.p2p_communication. Communicate tensors between stages. 
Used as helper method in other communication methods that are used in pipeline schedule. Takes the following arguments: @@ -103,10 +103,10 @@ def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = Non previous rank. recv_next (bool): boolean for whether tensor should be received from next rank. - recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defualts to None. - recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defualts to None. - prev_rank (int): the rank of the previous pipeline stage, defualts to None, - next_rank (int): the rank of the next pipeline stage, defualts to None, + recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defaults to None. + recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defaults to None. + prev_rank (int): the rank of the previous pipeline stage, defaults to None, + next_rank (int): the rank of the next pipeline stage, defaults to None, dtype (torch.dtype): data type of intermediate buffers, defaults to None scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False diff --git a/colossalai/communication/p2p_v2.py b/colossalai/communication/p2p_v2.py index 4223f78d58cd..0dacd8c3c9b5 100644 --- a/colossalai/communication/p2p_v2.py +++ b/colossalai/communication/p2p_v2.py @@ -230,7 +230,7 @@ def recv_backward(next_rank: int = None) -> Any: next_rank (int, optional): The rank of the source of the tensor. Returns: - Any: The input gradient tensor or gradident tensor list. + Any: The input gradient tensor or gradient tensor list. 
""" if gpc.is_pipeline_last_stage(): output_tensor_grad = None diff --git a/colossalai/context/moe_context.py b/colossalai/context/moe_context.py index 1d7a883b1552..b41f4072a405 100644 --- a/colossalai/context/moe_context.py +++ b/colossalai/context/moe_context.py @@ -64,7 +64,7 @@ def setup(self, seed: int, use_kernel_optim: bool = True): from colossalai.core import global_context as gpc self.max_ep_size = gpc.config.get('max_ep_size', self.world_size) assert self.world_size % self.max_ep_size == 0, \ - "Maximum epxert parallel size must be a factor of the number of GPUs" + "Maximum expert parallel size must be a factor of the number of GPUs" self.min_dp_size = self.world_size // self.max_ep_size # Enabling kernel optimization may raise error in some cases diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py index 0cd533fdef1a..003f0cdd91b6 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/context/parallel_context.py @@ -44,7 +44,7 @@ def __init__(self): # load config from file self._config = None - # default 3D parallel args, will be overwritten during process group intialization + # default 3D parallel args, will be overwritten during process group initialization self.world_size = 1 self.data_parallel_size = 1 self.pipeline_parallel_size = 1 @@ -264,7 +264,7 @@ def _add_world_size(self, parallel_mode: ParallelMode, world_size: int): """Adds world size for `parallel_mode`. 
Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode correponding to the process group + parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode corresponding to the process group world_size (int): The world size to be added Raises: diff --git a/colossalai/context/random/seed_manager.py b/colossalai/context/random/seed_manager.py index 3c84aaafc179..956f9001200d 100644 --- a/colossalai/context/random/seed_manager.py +++ b/colossalai/context/random/seed_manager.py @@ -59,23 +59,23 @@ def set_mode(self, parallel_mode: ParallelMode): self._current_mode = parallel_mode torch.cuda.set_rng_state(self._seed_states[parallel_mode]) - def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False): + def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrite: bool = False): """Adds a seed to the seed manager for `parallel_mode`. Args: parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. seed (int): The seed to be added. - overwrtie (bool, optional): Whether allows to overwrite the seed that has been set already + overwrite (bool, optional): Whether allows to overwrite the seed that has been set already Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added. 
""" assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided' - if overwrtie is False: + if overwrite is False: assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added' elif parallel_mode in self._seed_states: - print(f"Warnning: {parallel_mode} seed has been overwritten.", flush=True) + print(f"Warning: {parallel_mode} seed has been overwritten.", flush=True) current_state = torch.cuda.get_rng_state() torch.cuda.manual_seed(seed) diff --git a/colossalai/fx/codegen/activation_checkpoint_codegen.py b/colossalai/fx/codegen/activation_checkpoint_codegen.py index 492ebf918a9c..5a72cb9ca923 100644 --- a/colossalai/fx/codegen/activation_checkpoint_codegen.py +++ b/colossalai/fx/codegen/activation_checkpoint_codegen.py @@ -305,7 +305,7 @@ def emit_ckpt_func(body, delete_unused_value_func, level=0, in_ckpt=False): - """Emit ckpt fuction in nested way + """Emit ckpt function in nested way Args: body: forward code, in recursive calls, this part will be checkpoint functions code diff --git a/colossalai/fx/passes/split_module.py b/colossalai/fx/passes/split_module.py index 9bc4bf1f5c42..5ce5b969cbde 100644 --- a/colossalai/fx/passes/split_module.py +++ b/colossalai/fx/passes/split_module.py @@ -155,7 +155,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node use_partition = partitions[use_partition_name] use_partition.outputs.setdefault(def_node.name) - # split nodes into parititons + # split nodes into partitions for node in m.graph.nodes: orig_nodes[node.name] = node @@ -198,7 +198,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node if len(sorted_partitions) != len(partitions): raise RuntimeError("cycle exists between partitions!") - # add placeholders to parititons + # add placeholders to partitions for partition_name in sorted_partitions: partition = partitions[partition_name] for input in partition.inputs: From 
777b3a1359c1573711a62b3c36078a19a8bf6e53 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Mon, 24 Apr 2023 16:48:18 +0800 Subject: [PATCH 02/17] Fix the spelling error in colossalai and docs directory --- colossalai/initialize.py | 2 +- .../kernel/cuda_native/multihead_attention.py | 2 +- colossalai/nn/_ops/embedding_bag.py | 2 +- colossalai/nn/layer/moe/experts.py | 12 +++++----- colossalai/nn/layer/moe/layers.py | 4 ++-- colossalai/nn/layer/moe/routers.py | 4 ++-- colossalai/nn/layer/moe/utils.py | 4 ++-- colossalai/nn/layer/parallel_1d/layers.py | 10 ++++----- colossalai/tensor/colo_tensor.py | 6 ++--- colossalai/tensor/comm_spec.py | 2 +- colossalai/tensor/compute_spec.py | 2 +- colossalai/tensor/d_tensor/layout.py | 2 +- colossalai/tensor/d_tensor/sharding_spec.py | 4 ++-- colossalai/tensor/dist_spec_mgr.py | 2 +- colossalai/tensor/distspec.py | 2 +- colossalai/tensor/shape_consistency.py | 4 ++-- colossalai/tensor/sharding_spec.py | 2 +- colossalai/tensor/utils.py | 6 ++--- colossalai/testing/utils.py | 14 ++++++------ .../utils/checkpoint/module_checkpoint.py | 2 +- colossalai/utils/checkpoint/utils.py | 2 +- colossalai/utils/moe.py | 2 +- colossalai/zero/gemini/colo_init_context.py | 4 ++-- colossalai/zero/gemini/gemini_ddp.py | 6 ++--- .../gemini/ophooks/_shard_grad_ophook.py | 2 +- .../gemini/ophooks/_shard_param_ophook.py | 2 +- .../zero/legacy/gemini/stateful_tensor_mgr.py | 2 +- .../zero/legacy/init_ctx/init_context.py | 2 +- .../bucket_tensor_shard_strategy.py | 2 +- .../legacy/sharded_model/sharded_model_v2.py | 8 +++---- .../legacy/sharded_optim/sharded_optim_v2.py | 12 +++++----- colossalai/zero/wrapper.py | 4 ++-- .../en/Colossal-Auto/get_started/run_demo.md | 2 +- .../en/advanced_tutorials/meet_gemini.md | 2 +- .../en/advanced_tutorials/opt_service.md | 2 +- .../train_vit_with_hybrid_parallelism.md | 22 +++++++++---------- docs/source/en/basics/colotensor_concept.md | 2 +- docs/source/en/basics/engine_trainer.md | 2 +- 
.../source/en/concepts/colossalai_overview.md | 6 ++--- docs/source/en/features/1D_tensor_parallel.md | 4 ++-- docs/source/en/features/2D_tensor_parallel.md | 4 ++-- .../en/features/2p5D_tensor_parallel.md | 4 ++-- docs/source/en/features/3D_tensor_parallel.md | 2 +- .../en/features/gradient_accumulation.md | 2 +- .../en/features/mixed_precision_training.md | 8 +++---- docs/source/en/features/nvme_offload.md | 6 ++--- docs/source/en/features/zero_with_chunk.md | 4 ++-- 47 files changed, 104 insertions(+), 104 deletions(-) diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 5d3f3e5530cb..bae4cb3bdf63 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -195,7 +195,7 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = 'nccl', seed: int = 1024, verbose: bool = True): - """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size + """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size from the environment variables set by PyTorch Args: diff --git a/colossalai/kernel/cuda_native/multihead_attention.py b/colossalai/kernel/cuda_native/multihead_attention.py index 7df53731edc5..3b6470cdcbb9 100644 --- a/colossalai/kernel/cuda_native/multihead_attention.py +++ b/colossalai/kernel/cuda_native/multihead_attention.py @@ -111,7 +111,7 @@ class MultiHeadAttention(nn.Module): Arguments: hidden_size: Total dimension of hidden_size. nhead: Number of parallel attention heads. 
- batch_size: Batch Size for one foward + batch_size: Batch Size for one forward max_seq_len: Max length of input sequence dropout: Dropout probability norm_first: perform LayerNorms before attention diff --git a/colossalai/nn/_ops/embedding_bag.py b/colossalai/nn/_ops/embedding_bag.py index 0e8aa8fecb01..0026f579b6dc 100644 --- a/colossalai/nn/_ops/embedding_bag.py +++ b/colossalai/nn/_ops/embedding_bag.py @@ -88,7 +88,7 @@ def colo_embedding_bag(input_tensor: GeneralTensor, assert isinstance(weight, ColoTensor) input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group()) - # Handle differen parallel actions. + # Handle different parallel actions. if not weight.has_compute_spec(): # No Model Parallel Applied assert weight.is_replicate(), 'Invalid weight spec for native embedding op' diff --git a/colossalai/nn/layer/moe/experts.py b/colossalai/nn/layer/moe/experts.py index 2e5d9e6e79a9..56b11f4d9e08 100644 --- a/colossalai/nn/layer/moe/experts.py +++ b/colossalai/nn/layer/moe/experts.py @@ -13,7 +13,7 @@ class MoeExperts(nn.Module): - """Basic class for experts in MoE. It stores what kind of communication expersts use + """Basic class for experts in MoE. It stores what kind of communication experts use to exchange tokens, how many experts in a single GPU and parallel information such as expert parallel size, data parallel size and their distributed communication groups. """ @@ -24,7 +24,7 @@ def __init__(self, comm_name: str, num_experts: int): "This kind of communication has not been implemented yet.\n Please use Experts build function." 
self.comm_name = comm_name self.num_total_experts = num_experts - # Get the configuration of experts' deployment and parallel information from moe contex + # Get the configuration of experts' deployment and parallel information from moe context self.num_local_experts, self.dist_info = MOE_CONTEXT.get_info(num_experts) @@ -32,7 +32,7 @@ def __init__(self, comm_name: str, num_experts: int): class Experts(MoeExperts): """A wrapper class to create experts. It will create E experts across the moe model parallel group, where E is the number of experts. Every expert - is a instence of the class, 'expert' in initialization parameters. + is an instance of the class, 'expert' in initialization parameters. Args: expert_cls (:class:`torch.nn.Module`): The class of all experts @@ -146,15 +146,15 @@ def forward(self, inputs): # inputs [g, el, c, h] class TPExperts(MoeExperts): """Use tensor parallelism to split each expert evenly, which can deploy experts in - case that the number of experts can't be divied by maximum expert parallel size or - maximum expert parallel size can't be divied by the number of experts. + case that the number of experts can't be divided by maximum expert parallel size or + maximum expert parallel size can't be divided by the number of experts. 
""" def __init__(self, num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0): super().__init__("all_gather", MOE_CONTEXT.max_ep_size) assert d_ff % MOE_CONTEXT.max_ep_size == 0, \ - "d_ff should be divied by maximum expert parallel size" + "d_ff should be divided by maximum expert parallel size" p_ff = d_ff // MOE_CONTEXT.max_ep_size diff --git a/colossalai/nn/layer/moe/layers.py b/colossalai/nn/layer/moe/layers.py index b90d1f0bfcc6..03f55d91f3a8 100644 --- a/colossalai/nn/layer/moe/layers.py +++ b/colossalai/nn/layer/moe/layers.py @@ -25,7 +25,7 @@ class MoeLayer(nn.Module): """A MoE layer, that puts its input tensor to its gate and uses the output logits to router all tokens, is mainly used to exchange all tokens for every expert across - the moe tensor group by all to all comunication. Then it will get the output of all + the moe tensor group by all to all communication. Then it will get the output of all experts and exchange the output. At last returns the output of the moe system. Args: @@ -122,7 +122,7 @@ class MoeModule(nn.Module): drop_tks (bool, optional): Whether drops tokens in evaluation use_residual (bool, optional): Makes this MoE layer a Residual MoE. More information can be found in `Microsoft paper`_. 
- residual_instance (nn.Module, optional): The instance of residual module in Resiual MoE + residual_instance (nn.Module, optional): The instance of residual module in Residual MoE expert_instance (MoeExperts, optional): The instance of experts module in MoeLayer expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given expert_args (optional): The args of expert when no instance is given diff --git a/colossalai/nn/layer/moe/routers.py b/colossalai/nn/layer/moe/routers.py index c522c655a511..f1cd0f3574c7 100644 --- a/colossalai/nn/layer/moe/routers.py +++ b/colossalai/nn/layer/moe/routers.py @@ -60,7 +60,7 @@ def pop_routing_loss(self) -> torch.Tensor: class Top1Router(MoeRouter): """Top1 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] - for routing usage. More deailted function can be found in the paper about Switch Transformer + for routing usage. More detailed function can be found in the paper about Switch Transformer of Google. Args: capacity_factor_train (float, optional): Capacity factor in routing of training. @@ -143,7 +143,7 @@ def forward(self, inputs: torch.Tensor, use_kernel: bool = False, ep_group: Opti class Top2Router(MoeRouter): """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] - for routing usage. More deailted function can be found in the paper about ViT-MoE. + for routing usage. More detailed function can be found in the paper about ViT-MoE. Args: capacity_factor_train (float, optional): Capacity factor in routing of training. capacity_factor_eval (float, optional): Capacity factor in routing of evaluation. diff --git a/colossalai/nn/layer/moe/utils.py b/colossalai/nn/layer/moe/utils.py index 9362347414e0..4ca8bd703386 100644 --- a/colossalai/nn/layer/moe/utils.py +++ b/colossalai/nn/layer/moe/utils.py @@ -12,7 +12,7 @@ def half(self, memory_format=None): class NormalNoiseGenerator: - """Generates a random noisy mask for logtis tensor. 
+ """Generates a random noisy mask for logits tensor. All noise is generated from a normal distribution :math:`(0, 1 / E^2)`, where `E = the number of experts`. @@ -32,7 +32,7 @@ def __call__(self, inputs: torch.Tensor): class UniformNoiseGenerator: - """Generates a random noisy mask for logtis tensor. + """Generates a random noisy mask for logits tensor. copied from mesh tensorflow: Multiply values by a random number between :math:`1-epsilon` and :math:`1+epsilon`. Makes models more resilient to rounding errors introduced by bfloat16. diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index e96abd87ed10..406173a18c60 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -439,7 +439,7 @@ class Linear1D_Col(ParallelLayer): to all GPUs, otherwise, every GPU will have its output which is :math:`Y_i = XA_i`, defaults to False skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer, - which is preserved for kernel fusion, defaults to Fals + which is preserved for kernel fusion, defaults to False weight_initializer (:class:`typing.Callable`, optional): The initializer of weight, defaults to kaiming uniform initializer. bias_initializer (:class:`typing.Callable`, optional): @@ -578,7 +578,7 @@ class Linear1D_Row(ParallelLayer): dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False. skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer, - which is preserved for kernel fusion, defaults to Fals + which is preserved for kernel fusion, defaults to False weight_initializer (:class:`typing.Callable`, optional): The initializer of weight, defaults to kaiming uniform initializer. 
bias_initializer (:class:`typing.Callable`, optional): @@ -994,11 +994,11 @@ class PatchEmbedding1D(ColossalaiModule): :type dtype: torch.dtype, optional :param flatten: whether to flatten output tensor, defaults to True :type flatten: bool, optional - :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer + :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer :type weight_initializer: typing.Callable, optional - :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer + :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer :type bias_initializer: typing.Callable, optional - :param position_embed_initializer: The intializer of position embedding, defaults to zero + :param position_embed_initializer: The initializer of position embedding, defaults to zero :type position_embed_initializer: typing.Callable, optional """ diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py index 40eefc3ec5d1..4d762076461d 100644 --- a/colossalai/tensor/colo_tensor.py +++ b/colossalai/tensor/colo_tensor.py @@ -184,7 +184,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): # we have to capture the `backward` function # and make sure that it does not in `torch._C.DisableTorchFunction()` context if func is torch.Tensor.backward: - assert len(args) == 1 # only has 1 paramter + assert len(args) == 1 # only has 1 parameter backward_tensor = torch.Tensor(args[0]) tensor_kwargs = {k: torch.Tensor(v) if torch.is_tensor(v) else v for k, v in kwargs.items()} return backward_tensor.backward(**tensor_kwargs) @@ -228,7 +228,7 @@ def redistribute(self, dist_spec: _DistSpec, pg: Optional[ProcessGroup] = None) 2. If the pg is not not None and not equal to the current process group. First, convert the tensor as replicated among the TP process group. Second, reset the process group to the new pg. 
- Third, conver the tensor (new replicated both among the tp process group) to the new dist_spec. + Third, convert the tensor (new replicated both among the tp process group) to the new dist_spec. Args: dist_spec (_DistSpec): the new dist spec. @@ -297,7 +297,7 @@ def size_local(self, *args) -> torch.Size: def size_global(self, *args) -> torch.Size: """size_global - override the torch buildin size() + override the torch built-in size() the shape passed in must be in a replicate placement. Returns: diff --git a/colossalai/tensor/comm_spec.py b/colossalai/tensor/comm_spec.py index 0d8de1062d42..af38d2a502c2 100644 --- a/colossalai/tensor/comm_spec.py +++ b/colossalai/tensor/comm_spec.py @@ -391,7 +391,7 @@ class CommSpec: to determine the buffer shape, and logical_process_axis Argument: - comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec. + comm_pattern(CollectiveCommPattern): describe the communication method used in this spec. sharding_spec(ShardingSpec): This is sharding spec of the tensor which will join the communication action. gather_dim(int, Optional): The gather_dim of the tensor will be gathered. shard_dim(int, Optional): The shard_dim of the tensor will be sharded. diff --git a/colossalai/tensor/compute_spec.py b/colossalai/tensor/compute_spec.py index 73328285ee93..12f8f36bc613 100644 --- a/colossalai/tensor/compute_spec.py +++ b/colossalai/tensor/compute_spec.py @@ -10,7 +10,7 @@ class ComputePattern(Enum): class ComputeSpec(object): """ComputeSpec - The Specification for compuattion pattern + The Specification for computation pattern Args: compute_pattern (ComputePattern): an Enum instance for compute pattern. diff --git a/colossalai/tensor/d_tensor/layout.py b/colossalai/tensor/d_tensor/layout.py index 72a2694a1eaf..ee7ef74a99ae 100644 --- a/colossalai/tensor/d_tensor/layout.py +++ b/colossalai/tensor/d_tensor/layout.py @@ -14,7 +14,7 @@ class Layout: """Layout of a tensor. 
Attributes: - device_mesh: the device mesh to store the tensor distributedly. + device_mesh: the device mesh to store the tensor distributed. device_type: the type of the device mesh, e.g. 'cpu' or 'cuda'. sharding_spec: the sharding specification to describe how the tensor is sharded. entire_shape: the entire shape of the global tensor. diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py index 7591f760cb30..2ea0c4db89fd 100644 --- a/colossalai/tensor/d_tensor/sharding_spec.py +++ b/colossalai/tensor/d_tensor/sharding_spec.py @@ -14,7 +14,7 @@ class DimSpec: ''' - Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of + Sharding spec for single dimension of the sharded tensor describe the sharding dimension of logical device mesh and give a method to compute the difference between them. This class is used internally in ShardingSpec. @@ -143,7 +143,7 @@ class ShardingSpec: Argument: dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded, - and the value of the key decribe which logical axis will be sharded in that dimension. + and the value of the key describe which logical axis will be sharded in that dimension. sharding_sequence(List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1]. ''' diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/tensor/dist_spec_mgr.py index d5c0ce28e9fb..8657989235db 100644 --- a/colossalai/tensor/dist_spec_mgr.py +++ b/colossalai/tensor/dist_spec_mgr.py @@ -61,7 +61,7 @@ def _shard_as(tensor: torch.Tensor, old_dist_spec: _DistSpec, dist_spec: _DistSp Args: tensor (torch.Tensor): a global (replicated) tensor before shard dist_spec (_DistSpec): the distributed spec. to be sharded as. 
- pg (ProcessGrouo): the process group of the corresponding colotensor + pg (ProcessGroup): the process group of the corresponding colotensor Returns: torch.Tensor: a torch tensor after sharded. """ diff --git a/colossalai/tensor/distspec.py b/colossalai/tensor/distspec.py index 8dd0d8791537..3a09f1426e31 100644 --- a/colossalai/tensor/distspec.py +++ b/colossalai/tensor/distspec.py @@ -15,7 +15,7 @@ class _DistSpec: A class indicates Distributed Specification. The DistSpec is only works for the tensor parallel process groups. Because the dist spec of data parallel process group can be automatically deduced. - This is an internal data structrue. + This is an internal data structure. The API for users should be `ShardSpec` and `ReplicaSpec`. Args: diff --git a/colossalai/tensor/shape_consistency.py b/colossalai/tensor/shape_consistency.py index 2831b10a3c57..0a840006f086 100644 --- a/colossalai/tensor/shape_consistency.py +++ b/colossalai/tensor/shape_consistency.py @@ -73,7 +73,7 @@ def get_all_all_gather_spec(self, source_spec: ShardingSpec, orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]: ''' Get all valid sharding specs from source_spec with single all-gather operation, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. For the all-gather operation, we just care about the S dimension. Argument: @@ -145,7 +145,7 @@ def get_all_all_to_all_spec(self, source_spec: ShardingSpec, orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]: ''' Get all valid sharding specs from source_spec with single all-to-all operation, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. For the all-to-all operation, we just care about the pairs containing S dimension. 
Argument: diff --git a/colossalai/tensor/sharding_spec.py b/colossalai/tensor/sharding_spec.py index cdd0338850cf..bed320130ccd 100644 --- a/colossalai/tensor/sharding_spec.py +++ b/colossalai/tensor/sharding_spec.py @@ -18,7 +18,7 @@ class _DimSpec: ''' - Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of + Sharding spec for single dimension of the sharded tensor describe the sharding dimension of logical device mesh and give a method to compute the difference between them. This class is used internally in ShardingSpec. diff --git a/colossalai/tensor/utils.py b/colossalai/tensor/utils.py index 0c2ead630d59..6e30f97fef03 100644 --- a/colossalai/tensor/utils.py +++ b/colossalai/tensor/utils.py @@ -18,7 +18,7 @@ def all_gather_simulator(target_pair): Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. + and the second element describes which logical axis will be sharded in that dimension. ''' _, shard_list = target_pair new_shard_list = shard_list[:-1] @@ -36,7 +36,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair): Therefore, if the behind shard_list is not None, we just extend it to the front shard_list. Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. + and the second element describes which logical axis will be sharded in that dimension. e.g.: all-to-all(S0, S1) -> [S01, R] all-to-all(S0, R) -> [R, S0] @@ -46,7 +46,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair): Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. 
+ and the second element describes which logical axis will be sharded in that dimension. ''' _, f_shard_list = f_target_pair _, b_shard_list = b_target_pair diff --git a/colossalai/testing/utils.py b/colossalai/testing/utils.py index eac83e6d7bd5..6583eeb12bf4 100644 --- a/colossalai/testing/utils.py +++ b/colossalai/testing/utils.py @@ -17,10 +17,10 @@ def parameterize(argument: str, values: List[Any]) -> Callable: we want to avoid the number of distributed network initialization, we need to have this extra decorator on the function launched by torch.multiprocessing. - If a function is wrapped with this wrapper, non-paramterized arguments must be keyword arguments, - positioanl arguments are not allowed. + If a function is wrapped with this wrapper, non-parametrized arguments must be keyword arguments, + positional arguments are not allowed. - Usgae:: + Usage:: # Example 1: @parameterize('person', ['xavier', 'davis']) @@ -33,7 +33,7 @@ def say_something(person, msg): # > xavier: hello # > davis: hello - # Exampel 2: + # Example 2: @parameterize('person', ['xavier', 'davis']) @parameterize('msg', ['hello', 'bye', 'stop']) def say_something(person, msg): @@ -110,7 +110,7 @@ def test_method(): If the pattern is not None and matches the exception message, the exception will be detected for rerun max_try (int, Optional): Maximum reruns for this function. The default value is 5. - If max_try is None, it will rerun foreven if exception keeps occurings + If max_try is None, it will rerun forever if exception keeps occurring """ def _match_lines(lines, pattern): @@ -144,7 +144,7 @@ def _run_until_success(*args, **kwargs): # Override signature # otherwise pytest.mark.parameterize will raise the following error: - # function does not use argumetn xxx + # function does not use argument xxx sig = signature(func) _run_until_success.__signature__ = sig @@ -231,7 +231,7 @@ def spawn(func, nprocs=1, **kwargs): This function is used to spawn processes for testing. 
Usage: - # must contians arguments rank, world_size, port + # must contains arguments rank, world_size, port def do_something(rank, world_size, port): ... diff --git a/colossalai/utils/checkpoint/module_checkpoint.py b/colossalai/utils/checkpoint/module_checkpoint.py index a109b3702577..d390da864cd3 100644 --- a/colossalai/utils/checkpoint/module_checkpoint.py +++ b/colossalai/utils/checkpoint/module_checkpoint.py @@ -89,7 +89,7 @@ def load_checkpoint(path: str, torch_load_kwargs: (dict, optional): The kwargs of torch.load inside the function load_state_dict_kwargs (dict, optional): The kwargs of load_state_dict inside the function """ - # initialize the default paramters + # initialize the default parameters if not torch_load_kwargs: torch_load_kwargs = dict() if not load_state_dict_kwargs: diff --git a/colossalai/utils/checkpoint/utils.py b/colossalai/utils/checkpoint/utils.py index 5652600ffd9b..682cd0903d5b 100644 --- a/colossalai/utils/checkpoint/utils.py +++ b/colossalai/utils/checkpoint/utils.py @@ -34,7 +34,7 @@ def gather_tensor(colo_tensor: ColoTensor) -> None: dist.barrier() if dist.get_rank() == 0: - setattr(colo_tensor, 'save_ready', True) # set saving signitrue + setattr(colo_tensor, 'save_ready', True) # set saving signature def scatter_tensor(colo_tensor: ColoTensor, dist_spec: _DistSpec) -> None: diff --git a/colossalai/utils/moe.py b/colossalai/utils/moe.py index 90783e5d9b8e..86d04c11958b 100644 --- a/colossalai/utils/moe.py +++ b/colossalai/utils/moe.py @@ -38,7 +38,7 @@ def sync_moe_model_param(model: nn.Module): param_dict = get_moe_epsize_param_dict(model) - # synchrosize the parameters whose dp_group is the whole world + # synchronize the parameters whose dp_group is the whole world if 1 in param_dict: src_rank = gpc.get_ranks_in_group(ParallelMode.DATA)[0] for param in param_dict[1]: diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index 5937ee9eff9a..75f8576ca477 100644 --- 
a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -74,7 +74,7 @@ def __init__(self, """ Args: device (torch.device): the device where parameters initialized are resident. Defaults to torch.device('cpu'). - dtype (torch.dtype): the dtype of parameters initialized. Defults to torch.float. + dtype (torch.dtype): the dtype of parameters initialized. Defaults to torch.float. default_pg (ProcessGroup): the default process group for all initialized parameters. default_dist_spec: the default distributed specifications. """ @@ -164,7 +164,7 @@ def post_process_colo_init_ctx(model: torch.nn.Module, model (torch.nn.module): the model device (torch.device, optional): device type of the model params. Defaults to torch.device('cpu'). dtype (torch.dtype, optional): dtype of the model params. Defaults to torch.float. - default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Inidicates a DP-only process group. + default_pg (Optional[ProcessGroup], optional): default process group. Defaults to None. Indicates a DP-only process group. default_dist_spec (Any, optional): default dist spec of params. Defaults to None. Raises: diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index e151f1aefb2d..a2cc8c1f2de8 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -42,7 +42,7 @@ class ZeroDDP(ColoDDP): Args: module (torch.nn.Module): Module to apply ZeRO-DP. - gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous momery space. + gemini_manager (GeminiManager): Manages the chunk manager and heterogeneous memory space. For more details, see the API reference of ``GeminiManager``. pin_memory (bool): Chunks on CPU Memory use pin-memory. force_outputs_fp32 (bool): If set to True, outputs will be fp32. Otherwise, outputs will be fp16. 
@@ -684,7 +684,7 @@ def __init__(self, memstats: Optional[MemStats] = None, verbose: bool = False) -> None: """ - A torch.Module warpper using ZeRO-DP and Genimi. + A torch.Module wrapper using ZeRO-DP and Gemini. ZeRO is for parallel. Gemini is for memory management. WARNING: The class will modify the module inline! @@ -706,7 +706,7 @@ def __init__(self, Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte. - If the aggregate size of parameters is still samller than the minimum chunk size, + If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk. memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer. """ diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py b/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py index 5115ff74da16..8f8fec64924e 100644 --- a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py +++ b/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py @@ -8,7 +8,7 @@ @OPHOOKS.register_module class ShardGradMemTracerHook(BaseOpHook): """ - A hook to process sharded param before and afther FWD and BWD operator executing. + A hook to process sharded param before and after FWD and BWD operator executing. """ def __init__(self): diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py b/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py index 80736d14085e..a2a62fb9788a 100644 --- a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py +++ b/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py @@ -8,7 +8,7 @@ @OPHOOKS.register_module class ShardParamHook(BaseOpHook): """ - A hook to process sharded param before and afther FWD and BWD operator executing. 
+ A hook to process sharded param before and after FWD and BWD operator executing. """ def __init__(self): diff --git a/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py b/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py index 3b37444b0fe0..4f9ea7c6d520 100644 --- a/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py +++ b/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py @@ -53,7 +53,7 @@ def finish_iter(self): self._evict_time = 0 def adjust_layout(self) -> None: - """ Adjust the layout of statefuil tensor according to the information provided + """ Adjust the layout of stateful tensor according to the information provided by mem_stats_collector, which should belongs to a Sharded Model. """ # find stateful tensor in state COMPUTE diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/zero/legacy/init_ctx/init_context.py index f8be0ca4f3fc..a921ca0aa83a 100644 --- a/colossalai/zero/legacy/init_ctx/init_context.py +++ b/colossalai/zero/legacy/init_ctx/init_context.py @@ -97,7 +97,7 @@ def calc_fanin_fanout(tensor: torch.Tensor): """We use this function to substitute fan-in and fan-out calculation in torch.nn.init. This can help us get correct fan-in and fan-out for sharded tensor. 
""" - assert isinstance(tensor, nn.Parameter), "Sharded tensor initilization is only allowed for paramters" + assert isinstance(tensor, nn.Parameter), "Sharded tensor initialization is only allowed for parameters" # get correct shape of input tensor if not hasattr(tensor, 'colo_attr') or not tensor.colo_attr.param_is_sharded: diff --git a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py b/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py index 11297bf6d62c..d663104831ce 100644 --- a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py +++ b/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py @@ -14,7 +14,7 @@ class BucketTensorShardStrategy(TensorShardStrategy): """Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together, which will fully utilize network bandwidth. It is especially useful when sub-module contains bias, - since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small). + since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small). """ def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None): diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py index edd2cc8e68fe..b3a83b741825 100644 --- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py +++ b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py @@ -192,7 +192,7 @@ def cpu_offload(self): def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None: """ - dummy memory tracer collected infomation to a file. + dummy memory tracer collected information to a file. 
try: # forward: model(inputs) # backward: optimizer.backward() @@ -201,7 +201,7 @@ def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> N exit(0) """ if self._use_memory_tracer: - self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0]) + self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0]) if gpc.get_global_rank() == 0: with open(filename, 'w+') as f: f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n') @@ -293,7 +293,7 @@ def _post_backward_operations(self) -> None: if not p.requires_grad: continue # Leave the gradient accumulation state (_require_backward_grad_sync) as-is if not synchronizing this pass. - # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient allreducing between process group. + # NOTE() (no-sync)/sync pass: (not conduct)/conduct gradient all reducing between process group. # If _require_backward_grad_sync is True, # p.grad remains the accumulated unsharded gradient from prior no-sync passes. # We also allows to interleave no-sync pass with sync passes, if desired. 
@@ -385,7 +385,7 @@ def _save_grad(self, param: Parameter, grad: torch.Tensor): param.colo_attr.grad_payload_reset(grad.data) # release the memory of param # we set a false None for parameter's payload - # so we can get paramter's device and dtype later in optimizer + # so we can get parameter's device and dtype later in optimizer param.colo_attr.data_payload_reset(torch.empty(0, device=grad.device, dtype=grad.dtype)) if param.colo_attr.is_replicated: diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py index 7ce1c056f583..be60209af434 100644 --- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py +++ b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py @@ -67,8 +67,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer): growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000. hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2. max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32. - dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None. - mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None. + dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None. + mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None. .. 
_PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management: https://arxiv.org/abs/2108.05818 @@ -274,7 +274,7 @@ def _register_master_weight(self): assert hasattr(p, 'colo_attr'), 'The parameter must be wrapped with ShardedParam' shard_flag = not p.colo_attr.sharded_data_tensor.is_sharded and p.colo_attr.is_replicated if shard_flag: - # we always shard replicated paramters + # we always shard replicated parameters self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group) self.master_params[p] = StatefulTensor(cast_tensor_to_fp32(p.colo_attr.data_payload.to(self.device))) if shard_flag: @@ -312,7 +312,7 @@ def _prepare_grads(self): # If reuse_fp16_shard, grad fp16 which wasn't be offloaded may be evicted to CPU if not p.colo_attr.offload_grad: colo_model_data_tensor_move_inline(p.colo_attr.saved_grad, torch.cuda.current_device()) - # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful infomation + # FIXME(ver217): p.data here is an empty tensor on CUDA and has no useful information # If we change p.grad directly # it may raise error because of different shape/dtype/device of p.data and p.grad # We just set p.data = p.colo_attr.saved_grad.payload here @@ -333,7 +333,7 @@ def _point_param_fp16_to_master_param(self): def _copy_master_model_to_model_fp16(self): # Copy master param data (fp32) to payload of colo_attr (fp16) - # TODO() improve efficiency by gathering tensors into a chunk and transfering + # TODO() improve efficiency by gathering tensors into a chunk and transferring # a chunk. 
for group in self.optim.param_groups: for p in group['params']: @@ -350,7 +350,7 @@ def _copy_master_param_to_param_fp16(self, p): p.data = self.master_params[p].payload - # we need to allocate new memory for keep_not_shard paramters + # we need to allocate new memory for keep_not_shard parameters # in order to use copy, otherwise, the sizes of tensor is not compatible if p.colo_attr.data_payload.numel() != p.data.numel(): p.colo_attr.data_payload_reset( diff --git a/colossalai/zero/wrapper.py b/colossalai/zero/wrapper.py index 6cdb8fc59ba5..3e48f49fa305 100644 --- a/colossalai/zero/wrapper.py +++ b/colossalai/zero/wrapper.py @@ -26,7 +26,7 @@ def zero_model_wrapper(model: nn.Module, zero_stage (int, optional): The stage of ZeRO DDP. You can find more information in ZeRO's paper. https://arxiv.org/abs/1910.02054 gemini_config (dict, optional): The configuration dictionary of `GeminiDDP`. `GeminiDDP` is enabled - when the stage is set to 3. You can set the arguemnts of `GeminiDDP` in the gemini_config. + when the stage is set to 3. You can set the arguments of `GeminiDDP` in the gemini_config. Here is an example where we set the device of the model, the placement policy of Gemini, and the size of hidden dimension to help Gemini find out a unified chunk size. @@ -78,7 +78,7 @@ def zero_optim_wrapper(model: nn.Module, max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm. norm_type (float, optional): norm_type used for `clip_grad_norm`. - optim_config (dict, optinoal): The configuration used for the ZeRO optimizer. + optim_config (dict, optional): The configuration used for the ZeRO optimizer. 
Example: >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True) diff --git a/docs/source/en/Colossal-Auto/get_started/run_demo.md b/docs/source/en/Colossal-Auto/get_started/run_demo.md index 6f7a82966f20..34872e399c81 100644 --- a/docs/source/en/Colossal-Auto/get_started/run_demo.md +++ b/docs/source/en/Colossal-Auto/get_started/run_demo.md @@ -4,7 +4,7 @@ Colossal-Auto simplifies the process of deploying large-scale machine learning m ### 1. Basic usage -Colossal-Auto can be used to find a hybrid SPMD parallel strategy includes data, tensor(i.e., 1D, 2D, sequencial) for each operation. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/experiments/auto_parallel). +Colossal-Auto can be used to find a hybrid SPMD parallel strategy includes data, tensor(i.e., 1D, 2D, sequential) for each operation. You can follow the [GPT example](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/experiments/auto_parallel). Detailed instructions can be found in its `README.md`. ### 2. Integration with activation checkpoint diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 4889b30a6cf8..8afb6705b6ae 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -44,7 +44,7 @@ In some solutions, the [Zero-offload](https://arxiv.org/abs/2101.06840) adopted -Colossal-AI designed Gemini, just like two-stars, which manages the memory space of CPU and GPU efficiently. It can make the tensor dynamically distributed in the storage space of CPU-GPU during training, so that the model training can break through the memory wall of GPU. The memory manager consists of two parts: **MemStatsCollector (MSC)** and **StatefuleTensorMgr (STM)**. +Colossal-AI designed Gemini, just like two-stars, which manages the memory space of CPU and GPU efficiently. 
It can make the tensor dynamically distributed in the storage space of CPU-GPU during training, so that the model training can break through the memory wall of GPU. The memory manager consists of two parts: **MemStatsCollector (MSC)** and **StatefulTensorMgr (STM)**. We take advantage of the iterative characteristics of the deep learning network training process. We divide iterations into two stages: warmup and non-warmup. One or several iterative steps at the beginning belong to the warmup stage, and the other iterative steps belong to the non-warmup stage. In the warmup stage, we collect information for the MSC, while in the non-warmup stage, STM gets the information collected by the MSC to move the tensor, so as to minimize the CPU-GPU data movement volume. diff --git a/docs/source/en/advanced_tutorials/opt_service.md b/docs/source/en/advanced_tutorials/opt_service.md index b317de91bbdd..a43ec7fdd1fe 100644 --- a/docs/source/en/advanced_tutorials/opt_service.md +++ b/docs/source/en/advanced_tutorials/opt_service.md @@ -20,7 +20,7 @@ To launch the distributed inference service quickly, you can download the OPT-12 2. Prepare a prebuilt service image -Pull a docker image from dockerhub installed with Colossal-AI inference. +Pull a docker image from docker hub installed with Colossal-AI inference. ```bash docker pull hpcaitech/energon-ai:latest diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 1f3086559939..b2438a1cf562 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -12,7 +12,7 @@ Author: Yuxuan Lou ## Introduction -In this example for ViT model, Colossal-AI provides three different parallelism techniques which acclerate model training: data parallelism, pipeline parallelism and tensor parallelism. 
+In this example for ViT model, Colossal-AI provides three different parallelism techniques which accelerate model training: data parallelism, pipeline parallelism and tensor parallelism. We will show you how to train ViT on CIFAR-10 dataset with these parallelism techniques. To run this example, you will need 2-4 GPUs. @@ -31,7 +31,7 @@ pip install colossalai ## Data Parallelism -Data parallism is one basic way to accelerate model training process. You can apply data parallism to training by only two steps: +Data parallism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: 1. Define a configuration file 2. Change a few lines of code in train script @@ -108,7 +108,7 @@ disable_existing_loggers() logger = get_dist_logger() ``` -After initialization, you can acess the variables in the config file by using `colossalai.core.global_context`. +After initialization, you can access the variables in the config file by using `colossalai.core.global_context`. ```python #access parameters @@ -162,7 +162,7 @@ optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1) # build loss criterion = torch.nn.CrossEntropyLoss() -# lr_scheduelr +# lr_scheduler lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS) ``` @@ -230,10 +230,10 @@ torchrun --standalone --nproc_per_node train_dp.py --config ./config ## Pipeline Parallelism -Aside from data parallelism, Colossal-AI also support pipleline parallelism. In specific, Colossal-AI uses 1F1B pipeline introduced by NVIDIA. For more details, you can view the related [documents](https://www.colossalai.org/tutorials/features/pipeline_parallel). +Aside from data parallelism, Colossal-AI also support pipeline parallelism. In specific, Colossal-AI uses 1F1B pipeline introduced by NVIDIA. For more details, you can view the related [documents](https://www.colossalai.org/tutorials/features/pipeline_parallel). 
### Define your configuration file(`hybrid_parallel/configs/vit_pipeline.py`) -To apply pipleline parallel on the data parallel basis, you only need to add a **parallel dict** +To apply pipeline parallel on the data parallel basis, you only need to add a **parallel dict** ```python from colossalai.amp import AMP_TYPE @@ -250,7 +250,7 @@ clip_grad_norm = 1.0 Other configs: ```python -# hyperparameters +# hyper parameters # BATCH_SIZE is as per GPU # global batch size = BATCH_SIZE x data parallel size BATCH_SIZE = 256 @@ -276,7 +276,7 @@ Colossal-AI provides two methods to build a pipeline model from the existing mod - `colossalai.builder.build_pipeline_model_from_cfg` - `colossalai.builder.build_pipeline_model` -Besides, you can also build a pipeline model from scrath with Colossal-AI. +Besides, you can also build a pipeline model from scratch with Colossal-AI. ```python import math from typing import Callable @@ -521,7 +521,7 @@ def build_cifar(batch_size): return train_dataloader, test_dataloader -# craete dataloaders +# create dataloaders train_dataloader , test_dataloader = build_cifar() # create loss function @@ -539,7 +539,7 @@ lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, #### Start Colossal-AI engine ```python -# intiailize +# initialize engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, @@ -615,7 +615,7 @@ TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) Ohter configs: ```python -# hyperparameters +# hyper parameters # BATCH_SIZE is as per GPU # global batch size = BATCH_SIZE x data parallel size BATCH_SIZE = 256 diff --git a/docs/source/en/basics/colotensor_concept.md b/docs/source/en/basics/colotensor_concept.md index 1b855c03b919..909c5e4d3c6f 100644 --- a/docs/source/en/basics/colotensor_concept.md +++ b/docs/source/en/basics/colotensor_concept.md @@ -42,7 +42,7 @@ Therefore, when using Distributed Spec, we only need to describe the way that 
th ## Compute Spec -An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.compute_spec.html#colossalai.tensor.compute_spec.ComputeSpec) describes how a Coloensor be used in DNN training. Currently, we will set the correct Compute Pattern for the ColoTensor as the parameters of the module. The specific application scenarios will be shown in the next document. +An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/colossalai/colossalai.tensor.compute_spec.html#colossalai.tensor.compute_spec.ComputeSpec) describes how a Colotensor be used in DNN training. Currently, we will set the correct Compute Pattern for the ColoTensor as the parameters of the module. The specific application scenarios will be shown in the next document. ## ColoParameter diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md index 39792f622aa9..bbe32ed5a3b5 100644 --- a/docs/source/en/basics/engine_trainer.md +++ b/docs/source/en/basics/engine_trainer.md @@ -172,7 +172,7 @@ In this config file, we specify that we want to use batch size 128 per GPU and r #### Step 2. Initialize Distributed Environment We need to initialize the distributed training environment. This has been introduced in the tutorial on how to -[launch Colossal-AI](./launch_colossalai.md). For this demostration, we use `launch_from_torch` and PyTorch launch utility. +[launch Colossal-AI](./launch_colossalai.md). For this demonstration, we use `launch_from_torch` and PyTorch launch utility. 
```python import colossalai diff --git a/docs/source/en/concepts/colossalai_overview.md b/docs/source/en/concepts/colossalai_overview.md index d75d20196b08..38b682d49e62 100644 --- a/docs/source/en/concepts/colossalai_overview.md +++ b/docs/source/en/concepts/colossalai_overview.md @@ -6,18 +6,18 @@ Author: Shenggui Li, Siqi Mai With the development of deep learning model size, it is important to shift to a new training paradigm. The traditional training method with no parallelism and optimization became a thing of the past and new training methods are the key to make training large-scale models efficient and cost-effective. -Colossal-AI is designed to be a unfied system to provide an integrated set of training skills and utilities to the user. You can find the common training utilities such as mixed precision training and gradient accumulation. Besides, we provide an array of parallelism including data, tensor and pipeline parallelism. We optimize tensor parallelism with different multi-dimensional distributed matrix-matrix multiplication algorithm. We also provided different pipeline parallelism methods to allow the user to scale their model across nodes efficiently. More advanced features such as offloading can be found in this tutorial documentation in detail as well. +Colossal-AI is designed to be a unified system to provide an integrated set of training skills and utilities to the user. You can find the common training utilities such as mixed precision training and gradient accumulation. Besides, we provide an array of parallelism including data, tensor and pipeline parallelism. We optimize tensor parallelism with different multi-dimensional distributed matrix-matrix multiplication algorithm. We also provided different pipeline parallelism methods to allow the user to scale their model across nodes efficiently. More advanced features such as offloading can be found in this tutorial documentation in detail as well. 
## General Usage -We aim to make Colossal-AI easy to use and non-instrusive to user code. There is a simple general workflow if you want to use Colossal-AI. +We aim to make Colossal-AI easy to use and non-intrusive to user code. There is a simple general workflow if you want to use Colossal-AI.
Workflow
-1. Prepare a configiguration file where specifies the features you want to use and your parameters. +1. Prepare a configuration file where specifies the features you want to use and your parameters. 2. Initialize distributed backend with `colossalai.launch` 3. Inject the training features into your training components (e.g. model, optimizer) with `colossalai.initialize`. 4. Run training and testing diff --git a/docs/source/en/features/1D_tensor_parallel.md b/docs/source/en/features/1D_tensor_parallel.md index 695a8f31f8c5..0c24e00178b2 100644 --- a/docs/source/en/features/1D_tensor_parallel.md +++ b/docs/source/en/features/1D_tensor_parallel.md @@ -42,7 +42,7 @@ Given $P$ processors, we present the theoretical computation and memory cost, as ## Usage -To enable 1D tensor parallelism for our model, e.g. on 2 GPUs, we need to configure the parallism setting as below. +To enable 1D tensor parallelism for our model, e.g. on 2 GPUs, we need to configure the parallelism setting as below. ```python CONFIG = dict(parallel=dict( data=1, @@ -52,7 +52,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 1D parallelism to all the layers from `colossalai.nn`. -Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. ```python import colossalai import colossalai.nn as col_nn diff --git a/docs/source/en/features/2D_tensor_parallel.md b/docs/source/en/features/2D_tensor_parallel.md index 582614c2f2f4..eda3d6426ec9 100644 --- a/docs/source/en/features/2D_tensor_parallel.md +++ b/docs/source/en/features/2D_tensor_parallel.md @@ -60,7 +60,7 @@ Given $P=q\times q$ processors, we present the theoretical computation and memor ## Usage -To enable 2D tensor parallelism for our model, e.g. on 4 GPUs, we need to configure the parallism setting as below. +To enable 2D tensor parallelism for our model, e.g. 
on 4 GPUs, we need to configure the parallelism setting as below. ```python CONFIG = dict(parallel=dict( data=1, @@ -70,7 +70,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 2D parallelism to all the layers from `colossalai.nn`. -Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. ```python import colossalai import colossalai.nn as col_nn diff --git a/docs/source/en/features/2p5D_tensor_parallel.md b/docs/source/en/features/2p5D_tensor_parallel.md index 34a261ea0aa0..438fe6829450 100644 --- a/docs/source/en/features/2p5D_tensor_parallel.md +++ b/docs/source/en/features/2p5D_tensor_parallel.md @@ -57,7 +57,7 @@ Given $P=q \times q \times d$ processors, we present the theoretical computation ## Usage -To enable 2.5D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallism setting as below. +To enable 2.5D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below . ```python CONFIG = dict(parallel=dict( data=1, @@ -68,7 +68,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 2.5D parallelism to all the layers from `colossalai.nn`. -Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. ```python import colossalai import colossalai.nn as col_nn diff --git a/docs/source/en/features/3D_tensor_parallel.md b/docs/source/en/features/3D_tensor_parallel.md index 1207376335ce..c1d8909de72b 100644 --- a/docs/source/en/features/3D_tensor_parallel.md +++ b/docs/source/en/features/3D_tensor_parallel.md @@ -77,7 +77,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 3D parallelism to all the layers from `colossalai.nn`. 
-Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. ```python import colossalai import colossalai.nn as col_nn diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md index d8781ee691bc..ecc209fbac8d 100644 --- a/docs/source/en/features/gradient_accumulation.md +++ b/docs/source/en/features/gradient_accumulation.md @@ -28,7 +28,7 @@ gradient_accumulation = ## Hands-on Practice We provide a [runnable example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/features/gradient_accumulation) -to demonstrate gradient accumulation. In this example, we set the gradinet accumulation size to be 4. You can run the script using this command: +to demonstrate gradient accumulation. In this example, we set the gradient accumulation size to be 4. You can run the script using this command: ```shell python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 run_resnet_cifar10_with_engine.py diff --git a/docs/source/en/features/mixed_precision_training.md b/docs/source/en/features/mixed_precision_training.md index 71cb6971d346..11aa5235301a 100644 --- a/docs/source/en/features/mixed_precision_training.md +++ b/docs/source/en/features/mixed_precision_training.md @@ -101,7 +101,7 @@ you can use `colossalai.amp.convert_to_amp`. 
```python from colossalai.amp import AMP_TYPE -# exmaple of using torch amp +# example of using torch amp model, optimizer, criterion = colossalai.amp.convert_to_amp(model, optimizer, criterion, @@ -220,7 +220,7 @@ The default parameters of Naive AMP: - initial_scale(int): initial scale of gradient scaler - growth_factor(int): the growth rate of loss scale - backoff_factor(float): the decrease rate of loss scale -- hysterisis(int): delay shift in dynamic loss scaling +- hysteresis(int): delay shift in dynamic loss scaling - max_scale(int): maximum loss scale allowed - verbose(bool): if set to `True`, will print debug info @@ -292,7 +292,7 @@ colossalai.launch_from_torch(config=args.config) ### Step 4. Create training components Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is -obtained from the environment varialbe `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` +obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. ```python @@ -326,7 +326,7 @@ to a path on your machine. Data will be automatically downloaded to the root pat # build loss criterion = torch.nn.CrossEntropyLoss() - # lr_scheduelr + # lr_scheduler lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS) ``` diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md index 38d2c4af904c..4374da3c9c45 100644 --- a/docs/source/en/features/nvme_offload.md +++ b/docs/source/en/features/nvme_offload.md @@ -57,7 +57,7 @@ It's compatible with all parallel methods in ColossalAI. Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`. 
-We should install denpendencies first: +We should install dependencies first: ```shell pip install psutil transformers @@ -99,7 +99,7 @@ class GPTLMLoss(nn.Module): shift_labels.view(-1)) ``` -And we define some utility functions, which generates random data, computes the number of paramters of a model and get memory usage of current process: +And we define some utility functions, which generates random data, computes the number of parameters of a model and get memory usage of current process: ```python def get_data(batch_size: int, seq_len: int, @@ -251,7 +251,7 @@ Time: 3.691 s Mem usage: 5298.344 MB ``` -NVME offload saves about 294 MB memory. Note that enabling `pin_memory` of Gemini can accelerate training but increase memory usage. So this result also meets our expectation. If we disable `pin_memory`, we can aslo observe a memory usage drop about 900 MB. +NVME offload saves about 294 MB memory. Note that enabling `pin_memory` of Gemini can accelerate training but increase memory usage. So this result also meets our expectation. If we disable `pin_memory`, we can also observe a memory usage drop about 900 MB. ## API Reference diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index 6b0a9585af85..a105831a5409 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -32,11 +32,11 @@ and the first and second momentum estimates) are partitioned across the processe 3. **Shard Parameter**: The 16-bit model parameters are partitioned across the processes of a data parallel group. -4. **[Gemini](../advanced_tutorials/meet_gemini.md)**: Dynamic heterogeneous memory space manager for paramters, gradients and optimizer states. +4. **[Gemini](../advanced_tutorials/meet_gemini.md)**: Dynamic heterogeneous memory space manager for parameters, gradients and optimizer states. 
Besides, this article will introduce the Zero Redundancy Optimizer with chunk-based memory management. -When using ZeRO, we distributed the model by sharding the parameters. The advantage of this method is that the memory of each node is load balanced. But this approach has two significiant disadvantages. First, during communication, a temporary memory buffer needs to be allocated and released afterwards, leading to the memory fragmentation problem. Secondly, using tensor as the granularity for communication will cause the network bandwidth underutilized. Generally, the longer the transmitted message length, the higher the bandwidth utilization. +When using ZeRO, we distributed the model by sharding the parameters. The advantage of this method is that the memory of each node is load balanced. But this approach has two significant disadvantages. First, during communication, a temporary memory buffer needs to be allocated and released afterwards, leading to the memory fragmentation problem. Secondly, using tensor as the granularity for communication will cause the network bandwidth underutilized. Generally, the longer the transmitted message length, the higher the bandwidth utilization. Using the Chunk mechanism introduced in ColossalAI v0.1.8, we can improve the efficiency of ZeRO. We store a continuous set of parameters in initialization order into a Chunk (a chunk is a continuous memory space), and each Chunk has the same size. Organizing memory in chunks can lead to efficient use of network bandwidth between PCI-e and GPU-GPU, reduce the number of communications, and avoid potential memory fragmentation. 
From e42e51ca3510486c90f505262a1ba9ab84aebd4a Mon Sep 17 00:00:00 2001 From: digger-yu Date: Mon, 24 Apr 2023 19:25:36 +0800 Subject: [PATCH 03/17] Cautious Changed the spelling error under the example folder --- .../en/features/2p5D_tensor_parallel.md | 2 +- examples/images/diffusion/ldm/data/teyvat.py | 2 +- examples/images/diffusion/main.py | 24 +++++++++---------- examples/images/dreambooth/README.md | 2 +- examples/language/gpt/README.md | 2 +- .../gpt/experiments/auto_offload/README.md | 2 +- .../gpt/experiments/auto_parallel/README.md | 4 ++-- .../experiments/pipeline_parallel/README.md | 2 +- examples/language/opt/train_gemini_opt.py | 4 ++-- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/source/en/features/2p5D_tensor_parallel.md b/docs/source/en/features/2p5D_tensor_parallel.md index 438fe6829450..c84c6f4d92f5 100644 --- a/docs/source/en/features/2p5D_tensor_parallel.md +++ b/docs/source/en/features/2p5D_tensor_parallel.md @@ -57,7 +57,7 @@ Given $P=q \times q \times d$ processors, we present the theoretical computation ## Usage -To enable 2.5D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below . +To enable 2.5D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below. 
```python CONFIG = dict(parallel=dict( data=1, diff --git a/examples/images/diffusion/ldm/data/teyvat.py b/examples/images/diffusion/ldm/data/teyvat.py index 61dc29d56e7c..eb5d3ea469d4 100644 --- a/examples/images/diffusion/ldm/data/teyvat.py +++ b/examples/images/diffusion/ldm/data/teyvat.py @@ -13,7 +13,7 @@ def make_multi_folder_data(paths, caption_files=None, **kwargs): """Make a concat dataset from multiple folders - Don't suport captions yet + Don't support captions yet If paths is a list, that's ok, if it's a Dict interpret it as: k=folder v=n_times to repeat that """ diff --git a/examples/images/diffusion/main.py b/examples/images/diffusion/main.py index e31d75e0874d..713029fc677d 100644 --- a/examples/images/diffusion/main.py +++ b/examples/images/diffusion/main.py @@ -40,7 +40,7 @@ class DataLoaderX(DataLoader): # A custom data loader class that inherits from DataLoader def __iter__(self): # Overriding the __iter__ method of DataLoader to return a BackgroundGenerator - #This is to enable data laoding in the background to improve training performance + #This is to enable data loading in the background to improve training performance return BackgroundGenerator(super().__iter__()) @@ -60,7 +60,7 @@ def str2bool(v): # Create an ArgumentParser object with specifies kwargs parser = argparse.ArgumentParser(**parser_kwargs) - # Add vairous command line arguments with their default balues and descriptions + # Add various command line arguments with their default values and descriptions parser.add_argument( "-n", "--name", @@ -162,7 +162,7 @@ def str2bool(v): # A function that returns the non-default arguments between two objects def nondefault_trainer_args(opt): - # create an argument parsser + # create an argument parser parser = argparse.ArgumentParser() # add pytorch lightning trainer default arguments parser = Trainer.add_argparse_args(parser) @@ -203,7 +203,7 @@ def worker_init_fn(_): else: return np.random.seed(np.random.get_state()[1][0] + worker_id) 
-#Provide functionality for creating data loadedrs based on provided dataset configurations +#Provide functionality for creating data loaders based on provided dataset configurations class DataModuleFromConfig(pl.LightningDataModule): def __init__(self, @@ -255,7 +255,7 @@ def setup(self, stage=None): def _train_dataloader(self): #Check if the train dataset is iterable is_iterable_dataset = isinstance(self.datasets['train'], Txt2ImgIterableBaseDataset) - #Set the worker initialization function of the dataset isiterable or use_worker_init_fn is True + #Set the worker initialization function of the dataset is iterable or use_worker_init_fn is True if is_iterable_dataset or self.use_worker_init_fn: init_fn = worker_init_fn else: @@ -310,7 +310,7 @@ def _predict_dataloader(self, shuffle=False): class SetupCallback(Callback): - # I nitialize the callback with the necessary parameters + # Initialize the callback with the necessary parameters def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config): super().__init__() @@ -371,7 +371,7 @@ def on_fit_start(self, trainer, pl_module): # trainer.save_checkpoint(ckpt_path) -# PyTorch Lightning callback for ogging images during training and validation of a deep learning model +# PyTorch Lightning callback for logging images during training and validation of a deep learning model class ImageLogger(Callback): def __init__(self, @@ -379,10 +379,10 @@ def __init__(self, max_images, # Maximum number of images to log clamp=True, # Whether to clamp pixel values to [-1,1] increase_log_steps=True, # Whether to increase frequency of log steps exponentially - rescale=True, # Whetehr to rescale pixel values to [0,1] + rescale=True, # Whether to rescale pixel values to [0,1] disabled=False, # Whether to disable logging - log_on_batch_idx=False, # Whether to log on baych index instead of global step - log_first_step=False, # Whetehr to log on the first step + log_on_batch_idx=False, # Whether to log on batch index 
instead of global step + log_first_step=False, # Whether to log on the first step log_images_kwargs=None): # Additional keyword arguments to pass to log_images method super().__init__() self.rescale = rescale @@ -593,7 +593,7 @@ def on_train_epoch_end(self, trainer, pl_module): parser = Trainer.add_argparse_args(parser) opt, unknown = parser.parse_known_args() - # Veirfy the arguments are both specified + # Verify the arguments are both specified if opt.name and opt.resume: raise ValueError("-n/--name and -r/--resume cannot be specified both." "If you want to resume training in a new log folder, " @@ -646,7 +646,7 @@ def on_train_epoch_end(self, trainer, pl_module): # Sets the seed for the random number generator to ensure reproducibility seed_everything(opt.seed) - # Intinalize and save configuratioon using teh OmegaConf library. + # Initialize and save configuration using teh OmegaConf library. try: # init and save configs configs = [OmegaConf.load(cfg) for cfg in opt.base] diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md index b067a437c764..7c117d841e24 100644 --- a/examples/images/dreambooth/README.md +++ b/examples/images/dreambooth/README.md @@ -61,7 +61,7 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \ - `INSTANCE_DIR` refers to personalized path to instance images, you might need to insert information here. - `OUTPUT_DIR` refers to local path to save the trained model, you might need to find a path with enough space. - `resolution` refers to the corresponding resolution number of your target model. Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. -- `placement` refers to the training strategy supported by Colossal AI, defult = 'cuda', which refers to loading all the parameters into cuda memory. 
On the other hand, 'cpu' refers to 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI. +- `placement` refers to the training strategy supported by Colossal AI, default = 'cuda', which refers to loading all the parameters into cuda memory. On the other hand, 'cpu' refers to 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI. ### Training with prior-preservation loss diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md index 10d6c2ddd5d7..47d24a4d69cb 100644 --- a/examples/language/gpt/README.md +++ b/examples/language/gpt/README.md @@ -40,7 +40,7 @@ We provide two stable solutions. One utilizes the Gemini to implement hybrid parallel strategies of Gemini, DDP/ZeRO, and Tensor Parallelism for a huggingface GPT model. The other one use [Titans](https://github.com/hpcaitech/Titans), a distributed executed model zoo maintained by ColossalAI,to implement the hybrid parallel strategies of TP + ZeRO + PP. -We recommend using Gemini to qucikly run your model in a distributed manner. +We recommend using Gemini to quickly run your model in a distributed manner. It doesn't require significant changes to the model structures, therefore you can apply it on a new model easily. And use Titans as an advanced weapon to pursue a more extreme performance. Titans has included the some typical models, such as Vit and GPT. diff --git a/examples/language/gpt/experiments/auto_offload/README.md b/examples/language/gpt/experiments/auto_offload/README.md index a0d252119056..535aa76541cc 100644 --- a/examples/language/gpt/experiments/auto_offload/README.md +++ b/examples/language/gpt/experiments/auto_offload/README.md @@ -27,7 +27,7 @@ pip install transformers ## Dataset -For simplicity, the input data is randonly generated here. +For simplicity, the input data is randomly generated here. 
## Training diff --git a/examples/language/gpt/experiments/auto_parallel/README.md b/examples/language/gpt/experiments/auto_parallel/README.md index 404c8391109e..fb1b47802ed3 100644 --- a/examples/language/gpt/experiments/auto_parallel/README.md +++ b/examples/language/gpt/experiments/auto_parallel/README.md @@ -34,11 +34,11 @@ conda install -c conda-forge coin-or-cbc ## Dataset -For simplicity, the input data is randonly generated here. +For simplicity, the input data is randomly generated here. ## Training ```bash -#Run the auto parallel resnet example with 4 GPUs with a dummy dataset. +#Run the auto parallel resent example with 4 GPUs with a dummy dataset. colossalai run --nproc_per_node 4 auto_parallel_with_gpt.py ``` diff --git a/examples/language/gpt/experiments/pipeline_parallel/README.md b/examples/language/gpt/experiments/pipeline_parallel/README.md index 702e3c8d6540..5af994a00665 100644 --- a/examples/language/gpt/experiments/pipeline_parallel/README.md +++ b/examples/language/gpt/experiments/pipeline_parallel/README.md @@ -27,7 +27,7 @@ pip install transformers ## Dataset -For simplicity, the input data is randonly generated here. +For simplicity, the input data is randomly generated here. 
## Training diff --git a/examples/language/opt/train_gemini_opt.py b/examples/language/opt/train_gemini_opt.py index 4874f831c2ec..3614b689de26 100755 --- a/examples/language/opt/train_gemini_opt.py +++ b/examples/language/opt/train_gemini_opt.py @@ -163,7 +163,7 @@ def main(): else: init_dev = get_current_device() - # shard init prameters + # shard init parameters if args.shardinit: logger.info("Sharding initialization !", ranks=[0]) else: @@ -192,7 +192,7 @@ def main(): config=config, local_files_only=False) - # enable graident checkpointing + # enable gradient checkpointing model.gradient_checkpointing_enable() numel = sum([p.numel() for p in model.parameters()]) From fc6909131f15323bc3ef69a0ad11961ad91a8496 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 15:09:55 +0800 Subject: [PATCH 04/17] Update runtime_preparation_pass.py revert autograft to autograd --- colossalai/auto_parallel/passes/runtime_preparation_pass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py index fccc59cd7579..08af846b221d 100644 --- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py +++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py @@ -429,7 +429,7 @@ def _shard_param(param, target_sharding_spec): setattr(param, 'sharding_spec', origin_sharding_spec) # TODO: build a ColoParameter class to manager the distributed parameters # we could use .data here, because all the operations just happen before the real training - # loop, so we don't need to track these operations in the autograft graph. + # loop, so we don't need to track these operations in the autograd graph. 
param = torch.nn.Parameter( shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec, target_sharding_spec).detach().clone()) From 944b25f3130fb43cf8aaef2fa680b0178cf94304 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:21:24 +0800 Subject: [PATCH 05/17] Update search_chunk.py utile to until --- colossalai/autochunk/search_chunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/autochunk/search_chunk.py b/colossalai/autochunk/search_chunk.py index 3b572924f8e2..59645c80e808 100644 --- a/colossalai/autochunk/search_chunk.py +++ b/colossalai/autochunk/search_chunk.py @@ -16,7 +16,7 @@ class SearchChunk(object): This is the core class for AutoChunk. It defines the framework of the strategy of AutoChunk. - Chunks will be selected one by one utile search stops. + Chunks will be selected one by one until search stops. The chunk search is as follows: 1. find the peak memory node From 331745923bfd363a746609b40b0f3dbe3dbf3c76 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:23:15 +0800 Subject: [PATCH 06/17] Update check_installation.py change misteach to mismatch in line 91 --- colossalai/cli/check/check_installation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py index 3c4081d8957c..cb3dbbc09301 100644 --- a/colossalai/cli/check/check_installation.py +++ b/colossalai/cli/check/check_installation.py @@ -88,7 +88,7 @@ def check_installation(): click.echo(f"Note:") click.echo(f"1. 
The table above checks the version compatibility of the libraries/tools in the current environment") click.echo( - f" - PyTorch version misteach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation" + f" - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation" ) click.echo( f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch" From 2293fd945116260b64cd45890e917f47c6b02a57 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:26:17 +0800 Subject: [PATCH 07/17] Update 1D_tensor_parallel.md revert to perceptron --- docs/source/en/features/1D_tensor_parallel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/features/1D_tensor_parallel.md b/docs/source/en/features/1D_tensor_parallel.md index 0c24e00178b2..7577e50400e9 100644 --- a/docs/source/en/features/1D_tensor_parallel.md +++ b/docs/source/en/features/1D_tensor_parallel.md @@ -52,7 +52,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 1D parallelism to all the layers from `colossalai.nn`. -Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. 
```python import colossalai import colossalai.nn as col_nn From 56a23877a8e053b1ea3d8796d63813b70eddbd3c Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:29:05 +0800 Subject: [PATCH 08/17] Update 2D_tensor_parallel.md revert to perceptron in line 73 --- docs/source/en/features/2D_tensor_parallel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/features/2D_tensor_parallel.md b/docs/source/en/features/2D_tensor_parallel.md index eda3d6426ec9..7b6c10766099 100644 --- a/docs/source/en/features/2D_tensor_parallel.md +++ b/docs/source/en/features/2D_tensor_parallel.md @@ -70,7 +70,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 2D parallelism to all the layers from `colossalai.nn`. -Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. ```python import colossalai import colossalai.nn as col_nn From 0daa155d7db129f2d37c47c62fa09a6b48fa497b Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:30:01 +0800 Subject: [PATCH 09/17] Update 2p5D_tensor_parallel.md revert to perceptron in line 71 --- docs/source/en/features/2p5D_tensor_parallel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/features/2p5D_tensor_parallel.md b/docs/source/en/features/2p5D_tensor_parallel.md index c84c6f4d92f5..6076562e6dca 100644 --- a/docs/source/en/features/2p5D_tensor_parallel.md +++ b/docs/source/en/features/2p5D_tensor_parallel.md @@ -68,7 +68,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 2.5D parallelism to all the layers from `colossalai.nn`. -Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. 
```python import colossalai import colossalai.nn as col_nn From ebb2a5ef25eab6a2256715c03c5fafe79e123916 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:31:10 +0800 Subject: [PATCH 10/17] Update 3D_tensor_parallel.md revert to perceptron in line 80 --- docs/source/en/features/3D_tensor_parallel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/features/3D_tensor_parallel.md b/docs/source/en/features/3D_tensor_parallel.md index c1d8909de72b..1207376335ce 100644 --- a/docs/source/en/features/3D_tensor_parallel.md +++ b/docs/source/en/features/3D_tensor_parallel.md @@ -77,7 +77,7 @@ CONFIG = dict(parallel=dict( ``` Then Colossal-AI will automatically apply 3D parallelism to all the layers from `colossalai.nn`. -Let's define a model that consists of a two-layer multi-layer perception (MLP) as below. +Let's define a model that consists of a two-layer multi-layer perceptron (MLP) as below. ```python import colossalai import colossalai.nn as col_nn From e461bb974a1e64b64514cc9f17791a907790875f Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:32:51 +0800 Subject: [PATCH 11/17] Update README.md revert to resnet in line 42 --- examples/language/gpt/experiments/auto_parallel/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language/gpt/experiments/auto_parallel/README.md b/examples/language/gpt/experiments/auto_parallel/README.md index fb1b47802ed3..1c8b1c35109f 100644 --- a/examples/language/gpt/experiments/auto_parallel/README.md +++ b/examples/language/gpt/experiments/auto_parallel/README.md @@ -39,6 +39,6 @@ For simplicity, the input data is randomly generated here. ## Training ```bash -#Run the auto parallel resent example with 4 GPUs with a dummy dataset. +#Run the auto parallel resnet example with 4 GPUs with a dummy dataset. 
colossalai run --nproc_per_node 4 auto_parallel_with_gpt.py ``` From dc9bc1250866d12b49f9f9c3d72976cd1b02d965 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:55:10 +0800 Subject: [PATCH 12/17] Update reorder_graph.py revert to indice in line 7 --- colossalai/autochunk/reorder_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/autochunk/reorder_graph.py b/colossalai/autochunk/reorder_graph.py index dd5fc7499aa2..3b00d47fb955 100644 --- a/colossalai/autochunk/reorder_graph.py +++ b/colossalai/autochunk/reorder_graph.py @@ -4,7 +4,7 @@ class ReorderGraph(object): """ - Reorder node list and indices trace list + Reorder node list and indice trace list """ def __init__(self, trace_indice: TraceIndice, node_mgr: NodeMgr) -> None: From 9fde28ab35c034fdf8f589ee1c0fe509123bfcc0 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:57:31 +0800 Subject: [PATCH 13/17] Update p2p.py revert to megatron in line 94 --- colossalai/communication/p2p.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py index 782f3461990c..0200cd3c6553 100644 --- a/colossalai/communication/p2p.py +++ b/colossalai/communication/p2p.py @@ -91,7 +91,7 @@ def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = Non dtype: torch.dtype = None, scatter_gather_tensors: bool = False) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]: """ - Adapted from megaton.p2p_communication. + Adapted from megatron.p2p_communication. Communicate tensors between stages. Used as helper method in other communication methods that are used in pipeline schedule. 
Takes the following arguments: From 74b842dd29c01e79ed0ebda6c8d87e0abab6499a Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 17:58:40 +0800 Subject: [PATCH 14/17] Update initialize.py revert to torchrun in line 198 --- colossalai/initialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/initialize.py b/colossalai/initialize.py index bae4cb3bdf63..5d3f3e5530cb 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -195,7 +195,7 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = 'nccl', seed: int = 1024, verbose: bool = True): - """A wrapper for colossalai.launch for torch run or torch.distributed.launch by reading rank and world size + """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size from the environment variables set by PyTorch Args: From cfb3e6d15077ef3db05de5188f41adfc132afdf1 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 18:00:10 +0800 Subject: [PATCH 15/17] Update routers.py change to detailed in line 63 --- colossalai/nn/layer/moe/routers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/nn/layer/moe/routers.py b/colossalai/nn/layer/moe/routers.py index f1cd0f3574c7..193f15a7c7fb 100644 --- a/colossalai/nn/layer/moe/routers.py +++ b/colossalai/nn/layer/moe/routers.py @@ -60,7 +60,7 @@ def pop_routing_loss(self) -> torch.Tensor: class Top1Router(MoeRouter): """Top1 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] - for routing usage. More derailed function can be found in the paper about Switch Transformer + for routing usage. More detailed function can be found in the paper about Switch Transformer of Google. Args: capacity_factor_train (float, optional): Capacity factor in routing of training. 
From 9f3cba4a192c4de4daf0d77d52929d9d1db7cde5 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 18:01:50 +0800 Subject: [PATCH 16/17] Update routers.py change to detailed in line 146 --- colossalai/nn/layer/moe/routers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/nn/layer/moe/routers.py b/colossalai/nn/layer/moe/routers.py index 193f15a7c7fb..c5b8390bf047 100644 --- a/colossalai/nn/layer/moe/routers.py +++ b/colossalai/nn/layer/moe/routers.py @@ -143,7 +143,7 @@ def forward(self, inputs: torch.Tensor, use_kernel: bool = False, ep_group: Opti class Top2Router(MoeRouter): """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c] - for routing usage. More derailed function can be found in the paper about ViT-MoE. + for routing usage. More detailed function can be found in the paper about ViT-MoE. Args: capacity_factor_train (float, optional): Capacity factor in routing of training. capacity_factor_eval (float, optional): Capacity factor in routing of evaluation. From 117145eb97db3b0d4f6d0dbc2916b79ac3273126 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Tue, 25 Apr 2023 18:03:22 +0800 Subject: [PATCH 17/17] Update README.md revert random number in line 402 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e19e716a28e2..79f733122cb3 100644 --- a/README.md +++ b/README.md @@ -399,7 +399,7 @@ You may contact us or participate in the following ways: Thanks so much to all of our amazing contributors! - +