From 1a2b01d9f1ee19f9e5577d8171f09c178b06a8c3 Mon Sep 17 00:00:00 2001 From: digger yu Date: Thu, 11 May 2023 18:41:41 +0800 Subject: [PATCH 1/2] fix typo applications/ and colossalai/ date 5.11 --- applications/Chat/examples/community/peft/README.md | 2 +- applications/Chat/inference/README.md | 2 +- applications/Chat/inference/benchmark.py | 2 +- colossalai/auto_parallel/README.md | 4 ++-- .../auto_parallel/passes/runtime_preparation_pass.py | 4 ++-- colossalai/autochunk/autochunk_codegen.py | 6 +++--- colossalai/autochunk/trace_indice.py | 2 +- colossalai/checkpoint_io/index_file.py | 2 +- colossalai/checkpoint_io/utils.py | 2 +- colossalai/cli/check/check_installation.py | 8 ++++---- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/applications/Chat/examples/community/peft/README.md b/applications/Chat/examples/community/peft/README.md index eabb56fd8294..844bfd3d22c3 100644 --- a/applications/Chat/examples/community/peft/README.md +++ b/applications/Chat/examples/community/peft/README.md @@ -18,7 +18,7 @@ For SFT training, just call train_peft_sft.py Its arguments are almost identical to train_sft.py instead adding a new eval_dataset if you have a eval_dataset file. The data file is just a plain datafile, please check the format in the easy_dataset.py. For stage-3 rlhf training, call train_peft_prompts.py. -Its arguments are almost idential to train_prompts.py. The only difference is that I use text files to indicate the prompt and pretrained data file. The models are included in easy_models.py. Currently only bloom models are tested, but technically gpt2/opt/llama should be supported. +Its arguments are almost identical to train_prompts.py. The only difference is that I use text files to indicate the prompt and pretrained data file. The models are included in easy_models.py. Currently only bloom models are tested, but technically gpt2/opt/llama should be supported. # Dataformat Please refer the formats in test_sft.txt, test_prompts.txt, test_pretrained.txt. diff --git a/applications/Chat/inference/README.md b/applications/Chat/inference/README.md index 434677c98fa5..4848817e0fd1 100644 --- a/applications/Chat/inference/README.md +++ b/applications/Chat/inference/README.md @@ -75,7 +75,7 @@ E.g. you can set `export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`. Please ensure you have downloaded HF-format model weights of LLaMA models first. -Then you can follow [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa). This lib provides efficient CUDA kernels and weight convertion script. +Then you can follow [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa). This lib provides efficient CUDA kernels and weight conversion script. After installing this lib, we may convert the original HF-format LLaMA model weights to 4-bit version. 
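The benchmark.py hunk below corrects a misspelled keyword argument (`temparature` → `temperature`). This class of typo can fail silently rather than raise when the callee forwards extra keyword arguments, which is common in generation helpers. A minimal self-contained sketch of the failure mode — the `generate` function and its signature here are hypothetical, not the project's actual `evaluate`:

```python
def generate(prompt: str, temperature: float = 1.0, **gen_kwargs) -> str:
    # A misspelled 'temparature' lands in gen_kwargs and is never read,
    # so generation silently runs at the default temperature of 1.0.
    return f"{prompt} (temperature={temperature}, ignored={sorted(gen_kwargs)})"

print(generate("hello", temparature=0.2))
# -> hello (temperature=1.0, ignored=['temparature'])
```

If the callee accepts no `**kwargs`, Python instead raises `TypeError: got an unexpected keyword argument 'temparature'`, which is the easier failure to notice.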
diff --git a/applications/Chat/inference/benchmark.py b/applications/Chat/inference/benchmark.py index 59cd1eeea2aa..a8485f588705 100644 --- a/applications/Chat/inference/benchmark.py +++ b/applications/Chat/inference/benchmark.py @@ -123,7 +123,7 @@ def evaluate( start = time() for instruction in instructions: print(f"Instruction: {instruction}") - resp, tokens = evaluate(model, tokenizer, instruction, temparature=0.2, num_beams=1) + resp, tokens = evaluate(model, tokenizer, instruction, temperature=0.2, num_beams=1) total_tokens += tokens print(f"Response: {resp}") print('\n----------------------------\n') diff --git a/colossalai/auto_parallel/README.md b/colossalai/auto_parallel/README.md index 8e47e1bb0b4a..f011ec8ccbd7 100644 --- a/colossalai/auto_parallel/README.md +++ b/colossalai/auto_parallel/README.md @@ -16,8 +16,8 @@ A *symbolic profiler* for collecting computing and memory overhead related to st ### Solver **Solver** is designed to find the optimal execution plan for a given computation graph and cluster in two stages: -1) *Intra-op parallelism stage* is to find the plan with the minimum total execution time of all nodes with respect to the constraint of the memory budget. The optimaztion goal of intra-op parallelism solver is modified from Alpa 's intra-op parallelsim ILP solver. -2) *Activation checkpoint stage* is to search for the fastest execution plan that meets the memory budget on the computation graph after inserting the communication nodes by the intra-op parallelism stage. The algorithm to find optimial activation checkpoint is modified from Rotor . The reason we use two-stage optimization is that if the two tasks are formulated together, the solving time will be significantly increased, which will greatly affect the user experience of the system. On the contrary, solving in two hierarchical levels has many advantages. Firstly, compared with the computation graph with activation checkpointing, the original graph has fewer nodes, which can reduce the solving cost of intra-op parallelism solver. In addition, a more optimal solution can be found by adding the communication overhead into the activation checkpoint modeling. +1) *Intra-op parallelism stage* is to find the plan with the minimum total execution time of all nodes with respect to the constraint of the memory budget. The optimization goal of the intra-op parallelism solver is modified from Alpa's intra-op parallelism ILP solver. +2) *Activation checkpoint stage* is to search for the fastest execution plan that meets the memory budget on the computation graph after inserting the communication nodes by the intra-op parallelism stage. The algorithm to find the optimal activation checkpoint is modified from Rotor. The reason we use two-stage optimization is that if the two tasks are formulated together, the solving time will be significantly increased, which will greatly affect the user experience of the system. On the contrary, solving in two hierarchical levels has many advantages. Firstly, compared with the computation graph with activation checkpointing, the original graph has fewer nodes, which can reduce the solving cost of the intra-op parallelism solver. In addition, a more optimal solution can be found by adding the communication overhead into the activation checkpoint modeling. ### Generator **Generator** applies the searched execution plan to the computation graph and recompiles the computation graph to optimized PyTorch code.
It has *a series compile pass* to insert a communication node or do the kernel substitution as the intra-op parallelism solver required. Additionally, we implement a *code generation* feature to recognize the annotation from the activation checkpoint solver and inject the activation checkpoint block following annotation instructions. diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py index 08af846b221d..177f3765f5a0 100644 --- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py +++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py @@ -169,7 +169,7 @@ def _post_processing(node, size_processing_node): This function is used to process the dependency between the size node and its users after inserting the size_process_node. ''' - # store original node and processing node pair in node_pairs dictioanry + # store original node and processing node pair in node_pairs dictionary # It will be used to replace the original node with processing node in slice object node_pairs[node] = size_processing_node size_processing_node._meta_data = node._meta_data @@ -388,7 +388,7 @@ def module_params_sharding_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMes """ mod_graph = gm.graph nodes = tuple(mod_graph.nodes) - # This stream is created for overlaping the communication and computation. + # This stream is created for overlapping the communication and computation. reduction_stream = torch.cuda.Stream() def _add_hook_for_grad_communication(node, param, name=None): diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py index d0a467254d72..cc98c1570b4a 100644 --- a/colossalai/autochunk/autochunk_codegen.py +++ b/colossalai/autochunk/autochunk_codegen.py @@ -40,7 +40,7 @@ def _gen_chunk_slice_dim(chunk_dim: int, chunk_indice_name: str, shape: List) -> return new_shape -def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_ouput_dim: int, chunk_size=2) -> str: +def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_output_dim: int, chunk_size=2) -> str: """ Generate chunk loop start @@ -52,7 +52,7 @@ def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_oup Args: chunk_input (List[Node]): chunk input node chunk_output (Node): chunk output node - chunk_ouput_dim (int): chunk output node chunk dim + chunk_output_dim (int): chunk output node chunk dim chunk_size (int): chunk size. Defaults to 2. Returns: @@ -74,7 +74,7 @@ def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_oup input_node.name, input_node.name) out_shape = get_node_shape(chunk_output[0]) - chunk_shape = out_shape[chunk_ouput_dim[0]] + chunk_shape = out_shape[chunk_output_dim[0]] context += "chunk_size = %d\nfor chunk_idx in range(0, %d, chunk_size):\n" % (chunk_size, chunk_shape) return context diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py index c7fce4c8bee1..d56bf843f18d 100644 --- a/colossalai/autochunk/trace_indice.py +++ b/colossalai/autochunk/trace_indice.py @@ -18,7 +18,7 @@ class TraceIndice(object): dim(x1)=dim(x2)=dim(x3)=[a, b, c] This class will record every node's dims' indice, compute and source. 
- Attibutes: + Attributes: node_list (List) indice_trace_list (List): [{"indice": [...], "compute": [...], "source": [...]}, {...}] indice_view_list (Dict): not used for now diff --git a/colossalai/checkpoint_io/index_file.py b/colossalai/checkpoint_io/index_file.py index 15a6d09f3b5e..334ecbc04738 100644 --- a/colossalai/checkpoint_io/index_file.py +++ b/colossalai/checkpoint_io/index_file.py @@ -159,7 +159,7 @@ def get_all_param_names(self): def write_index_file(self, save_index_file): """ - Wriete index file. + Write index file. """ save_index_file = os.path.join(self.root_path, save_index_file) index = {"metadata": self.metadata, "weight_map": self.weight_map} diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 16e41631f0d5..ee4bd72e89ec 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -21,7 +21,7 @@ def calculate_tensor_size(tensor: torch.Tensor) -> float: If so, a new shard should be created. Args: - tenosr (torch.Tensor): the tensor to calculate size for. + tensor (torch.Tensor): the tensor to calculate size for. Returns: float: size of the tensor in MB. diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py index cb3dbbc09301..4a481f3bd122 100644 --- a/colossalai/cli/check/check_installation.py +++ b/colossalai/cli/check/check_installation.py @@ -31,7 +31,7 @@ def check_installation(): found_aot_cuda_ext = _check_aot_built_cuda_extension_installed() cuda_version = _check_cuda_version() torch_version, torch_cuda_version = _check_torch_version() - colossalai_verison, prebuilt_torch_version_required, prebuilt_cuda_version_required = _parse_colossalai_version() + colossalai_version, prebuilt_torch_version_required, prebuilt_cuda_version_required = _parse_colossalai_version() # if cuda_version is None, that means either # CUDA_HOME is not found, thus cannot compare the version compatibility @@ -57,7 +57,7 @@ def check_installation(): click.echo(f'#### Installation Report ####') click.echo(f'\n------------ Environment ------------') - click.echo(f"Colossal-AI version: {to_click_output(colossalai_verison)}") + click.echo(f"Colossal-AI version: {to_click_output(colossalai_version)}") click.echo(f"PyTorch version: {to_click_output(torch_version)}") click.echo(f"System CUDA version: {to_click_output(cuda_version)}") click.echo(f"CUDA version required by PyTorch: {to_click_output(torch_cuda_version)}") @@ -137,7 +137,7 @@ def _parse_colossalai_version(): # 1. X.X.X+torchX.XXcuXX.X (when colossalai is installed with CUDA extensions) # 2. X.X.X (when colossalai is not installed with CUDA extensions) # where X represents an integer. 
- colossalai_verison = colossalai.__version__.split('+')[0] + colossalai_version = colossalai.__version__.split('+')[0] try: torch_version_for_aot_build = colossalai.__version__.split('torch')[1].split('cu')[0] @@ -145,7 +145,7 @@ except: torch_version_for_aot_build = None cuda_version_for_aot_build = None - return colossalai_verison, torch_version_for_aot_build, cuda_version_for_aot_build + return colossalai_version, torch_version_for_aot_build, cuda_version_for_aot_build def _check_aot_built_cuda_extension_installed(): From 9e78ee53e04f3e1e15d8703e1c2cdefc55bb5d3f Mon Sep 17 00:00:00 2001 From: digger yu Date: Mon, 15 May 2023 14:32:26 +0800 Subject: [PATCH 2/2] fix typo colossalai/ --- .../tensor_shard/node_handler/node_handler.py | 2 +- .../node_handler/strategy/batch_norm_generator.py | 8 ++++---- .../node_handler/strategy/binary_elementwise_generator.py | 2 +- .../node_handler/strategy/strategy_generator.py | 4 ++-- .../auto_parallel/tensor_shard/solver/cost_graph.py | 4 ++-- .../auto_parallel/tensor_shard/solver/graph_analysis.py | 6 +++--- colossalai/auto_parallel/tensor_shard/solver/solver.py | 2 +- colossalai/testing/pytest_wrapper.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py index ab391ebfaf80..d3d09a9dcf65 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py @@ -75,7 +75,7 @@ def update_resharding_cost(self, strategy: ShardingStrategy) -> None: prev_strategy.get_sharding_spec_by_name(node_name) for prev_strategy in prev_strategy_vector ] - # create data structrure to store costs + # create data structure to store costs if node not in resharding_costs: resharding_costs[node] = [] diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py index 1f3812429fc2..79b69acb25b3 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py @@ -24,7 +24,7 @@ class BatchNormStrategyGenerator(StrategyGenerator): To keep the math consistency, there are two way to do BatchNorm if the input shards on batch dimension: 1. We gather the input partitions through batch dimension, then do the normal BatchNorm. - 2. We do the SyncBatchNorm on the each input partition seperately, the SyncBN op will help + 2. We do the SyncBatchNorm on each input partition separately, the SyncBN op will help us to keep the computing correctness. In this generator, both methods will be considered. """ @@ -212,7 +212,7 @@ def split_input_batch(self, mesh_dim_0): # set communication action # For SyncBN case, we don't need to do communication for weight and bias. - # TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation + # TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation # to SyncBN operation instead of inserting a communication node.
output_comm_action = self.get_communication_action( sharding_spec=sharding_spec_mapping["output"], @@ -250,7 +250,7 @@ def split_input_batch_1d(self, mesh_dim_0, mesh_dim_1): # set communication action # For SyncBN case, we don't need to do communication for gradients of weight and bias. - # TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation + # TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation # to SyncBN operation instead of inserting a communication node. output_comm_action = self.get_communication_action( sharding_spec=sharding_spec_mapping["output"], @@ -298,7 +298,7 @@ def split_input_both_dim(self, mesh_dim_0, mesh_dim_1): # set communication action # For SyncBN case, we don't need to do communication for gradients of weight and bias. - # TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation + # TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation # to SyncBN operation instead of inserting a communication node. output_comm_action = self.get_communication_action( sharding_spec=sharding_spec_mapping["output"], diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py index fd7f811c8972..d27cc046eaf3 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py @@ -51,7 +51,7 @@ def update_memory_cost(self, strategy: ShardingStrategy) -> ShardingStrategy: # compute fwd memory cost in bytes # as the elementwise ops are not memory-intensive - # we approximate the fwd memroy cost to be the output + # we approximate the fwd memory cost to be the output # and the backward memory cost to be grad of input and other input_bytes = self._compute_size_in_bytes(strategy, 'input') other_bytes = self._compute_size_in_bytes(strategy, 'other') diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py index 6d68521aaea7..d42429745c61 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py @@ -225,7 +225,7 @@ def _compute_size_in_bytes_helper(sharding_spec, meta_data): if isinstance(meta_data, torch.Tensor): element_bytes = _compute_size_in_bytes_helper(sharding_spec, meta_data) else: - # if meta_data is not a tensor, we count the memroy as 0 + # if meta_data is not a tensor, we count the memory as 0 element_bytes = 0 total_bytes += element_bytes @@ -233,7 +233,7 @@ def _compute_size_in_bytes_helper(sharding_spec, meta_data): if isinstance(op_data.data, torch.Tensor): total_bytes = _compute_size_in_bytes_helper(strategy.sharding_specs[op_data], op_data.data) else: - # if op_data.data is not a tensor, we count the memroy as 0 + # if op_data.data is not a tensor, we count the memory as 0 total_bytes = 0 return total_bytes diff --git a/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py b/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py index 74290453ca0c..1b2d3ad57407 100644 --- a/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py +++ 
b/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py @@ -9,7 +9,7 @@ class CostGraph: 1. To feed the quadratic resharding costs into solver, we need to linearize it. We build edge_cost in CostGraph, and it stored every combinations of strategies for a src-dst node pair in an 1D list. 2. To reduce the searching space, we merge computationally-trivial operators, such as - element-wise operators, transpose, and reduction, into their following nodes. The merging infomation will + element-wise operators, transpose, and reduction, into their following nodes. The merging information will be given by the StrategiesVector depending on the type of target node and following nodes. Argument: @@ -90,7 +90,7 @@ def _check_tensor_in_node(data): if self.simplify and strategies_vector.check_merge(): for followed_node in strategies_vector.predecessor_nodes: # we only merge node pairs which src node has a tensor element inside. - # This is necessay because the node without a tensor element inside will not + # This is necessary because the node without a tensor element inside will not # be assigned any strategy. if _check_tensor_in_node(followed_node._meta_data): self.merge_pair.append((followed_node, dst_node)) diff --git a/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py b/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py index be39a74cb237..171aa8b3399f 100644 --- a/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py +++ b/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py @@ -83,7 +83,7 @@ def graph(self) -> Graph: def liveness_analysis(self) -> List[LiveStage]: """ - Analyse the graph to obtain the variable liveness information. This function returns + Analyses the graph to obtain the variable liveness information. This function returns an ordered dictionary where the key is the compute stage ID and the value is a LivenessStage object. """ compute_nodes = self.graph.nodes @@ -91,7 +91,7 @@ def liveness_analysis(self) -> List[LiveStage]: # checked: record all variables created since the first stage # all: record the live variables only exist until the current stage. - # this can be different from the `checked list`` as some varialbes may be destroyed prior to this stage. + # this can be different from the `checked list` as some variables may be destroyed prior to this stage. # unique: record the unique live variables only exist until the current stage. # this is different from `all list` as some variables are duplicated. checked_variables = LiveVariableVector() @@ -103,7 +103,7 @@ def liveness_analysis(self) -> List[LiveStage]: # find new living variables # ############################# # detect whether the current op is an in-place op - # if it is an in-place op, we would deem it as a duplciate var + # if it is an in-place op, we would deem it as a duplicate var is_inplace = False if node.op == 'call_function': # check if this is an inplace op such as torch.nn.functional.relu(x, inplace=True) diff --git a/colossalai/auto_parallel/tensor_shard/solver/solver.py b/colossalai/auto_parallel/tensor_shard/solver/solver.py index f5c6663dce80..564c5f09220c 100644 --- a/colossalai/auto_parallel/tensor_shard/solver/solver.py +++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py @@ -44,7 +44,7 @@ def __init__(self, graph: The computing graph to be optimized. strategies_constructor: It will provide all the possible strategies for each node in the computing graph. cost_graph: A graph data structure to simplify the edge cost graph.
- graph_analyser: graph_analyser will analyses the graph to obtain the variable liveness information, which will be used to generate memory constraints. + graph_analyser: graph_analyser will analyse the graph to obtain the variable liveness information, which will be used to generate memory constraints. memory_budget: Memory constraint for the solution. solution_numbers: If solution_numbers is larger than one, solver will us a serious of solutions based on different memory budget. memory_increasing_coefficient: If solution_numbers is larger than one, we will use this coefficient to generate new memory budget. diff --git a/colossalai/testing/pytest_wrapper.py b/colossalai/testing/pytest_wrapper.py index a472eb3723ec..b264b009028a 100644 --- a/colossalai/testing/pytest_wrapper.py +++ b/colossalai/testing/pytest_wrapper.py @@ -33,7 +33,7 @@ def test_for_something(): assert isinstance(name, str) flag = os.environ.get(name.upper(), '0') - reason = f'Environment varialbe {name} is {flag}' + reason = f'Environment variable {name} is {flag}' if flag == '1': return pytest.mark.skipif(False, reason=reason) else: return pytest.mark.skipif(True, reason=reason)
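For context on the pytest_wrapper.py hunk above, here is a minimal self-contained sketch of the environment-variable-gated skip pattern it implements. This is a reconstruction from the diff context under stated assumptions, not the project's exact file: the `run_on_environment_flag` decorator name and the `NIGHTLY` flag are assumptions for illustration.

```python
import os

import pytest


def run_on_environment_flag(name: str):
    """Run the decorated test only when the env var NAME is set to '1'."""
    assert isinstance(name, str)
    flag = os.environ.get(name.upper(), '0')
    reason = f'Environment variable {name} is {flag}'
    # skipif(False) keeps the test; skipif(True) skips it with the given reason.
    return pytest.mark.skipif(flag != '1', reason=reason)


@run_on_environment_flag('NIGHTLY')  # hypothetical flag name
def test_for_something():
    assert 1 + 1 == 2
```

Running `NIGHTLY=1 pytest` executes the test; without the variable set, pytest reports it as skipped with the reason string above.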