2 changes: 1 addition & 1 deletion colossalai/_analyzer/fx/codegen.py
@@ -138,7 +138,7 @@ def emit_ckpt_func(body,
delete_unused_value_func,
ckpt_level=0,
in_ckpt=False):
"""Emit ckpt fuction in nested way
"""Emit ckpt function in nested way

Args:
body: forward code - in recursive calls, this part will be checkpoint
2 changes: 1 addition & 1 deletion colossalai/auto_parallel/offload/region.py
@@ -111,7 +111,7 @@ def copy_grad_to_region_slice(self, param: torch.nn.Parameter, data_slice: torch
Copy data slice to the memory space indexed by the input tensor in the region.

Args:
- param (torch.nn.Parameter): the param used to retrive meta information
+ param (torch.nn.Parameter): the param used to retrieve meta information
data_slice (torch.Tensor): the tensor to be copied to the region
"""

2 changes: 1 addition & 1 deletion colossalai/auto_parallel/offload/training_simulator.py
@@ -22,7 +22,7 @@ class TrainingSimulator(ABC):

Args:
region_list (List[Region]): represents the linearized DNN computing graph.
- comp_power (float): the NVIDIA GPU FP16 compuing power.
+ comp_power (float): the NVIDIA GPU FP16 computing power.
link_to_bw (Dict[str, Dict[float, float]]): communication links and the corresponding bandwidth.
"""

4 changes: 2 additions & 2 deletions colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -149,7 +149,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh

def _extract_target_dim(node):
'''
- A helper function to etract the target dimension from size node.
+ A helper function to extract the target dimension from size node.
There are two usages of torch.Tensor.size:
1. tensor.size()
2. tensor.size(dim)
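Both call patterns are easy to check in isolation; a minimal, runnable illustration in plain PyTorch:

    import torch

    x = torch.randn(2, 3)
    print(x.size())   # usage 1: tensor.size() -> torch.Size([2, 3]); no target dim
    print(x.size(1))  # usage 2: tensor.size(dim) -> 3; the target dim is explicit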
@@ -427,7 +427,7 @@ def _shard_param(param, target_sharding_spec):
if target_sharding_spec.dim_partition_dict != {}:
origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {})
setattr(param, 'sharding_spec', origin_sharding_spec)
- # TODO: build a ColoParamter class to manager the distributed parameters
+ # TODO: build a ColoParameter class to manager the distributed parameters
# we could use .data here, because all the operations just happen before the real training
# loop, so we don't need to track these operations in the autograd graph.
param = torch.nn.Parameter(
2 changes: 1 addition & 1 deletion colossalai/autochunk/autochunk_codegen.py
@@ -287,7 +287,7 @@ def emit_code_with_chunk(body: List[str],
body = _replace_new_tensor_like_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
# new tensor
body = _replace_new_tensor_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
- # reassgin reshape size
+ # reassign reshape size
body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"])
body[-1] = " " + body[-1]
delete_unused_value_func(node, body, chunk_inputs_names)
2 changes: 1 addition & 1 deletion colossalai/autochunk/estimate_memory.py
@@ -153,7 +153,7 @@ def estimate_chunk_inference_mem(self, node_list: List, chunk_infos: Dict = None

Returns:
act_memory_peak_log (List): peak memory of every node
- act_memory_after_node_log (List): memory after excuting every node
+ act_memory_after_node_log (List): memory after executing every node
active_node_list_log (List): active nodes of every node. active nodes refer to
nodes generated but not deleted.
"""
6 changes: 3 additions & 3 deletions colossalai/autochunk/search_chunk.py
@@ -16,7 +16,7 @@ class SearchChunk(object):
This is the core class for AutoChunk.

It defines the framework of the strategy of AutoChunk.
- Chunks will be selected one by one utill search stops.
+ Chunks will be selected one by one until search stops.

The chunk search is as follows:
1. find the peak memory node
@@ -73,7 +73,7 @@ def _init_trace(self) -> None:

def _find_peak_region(self, mem_peak: List) -> int:
"""
- find peak node, along with its neighbour nodes exceeds max mem
+ find peak node, along with its neighbor nodes exceeds max mem
"""
max_value = max(mem_peak)
max_idx = mem_peak.index(max_value)
@@ -118,7 +118,7 @@ def _search_max_chunk_region(self, active_node: List, peak_region: int, chunk_re
chunk_region_start (int)
chunk_region_end (int)
"""
- # check if peak node already in chunkinfo
+ # check if peak node already in chunk info
if chunk_regions is not None:
for i in chunk_regions:
if i["region"][0] < peak_region[0] <= i["region"][1] or \
2 changes: 1 addition & 1 deletion colossalai/autochunk/trace_flow.py
@@ -479,7 +479,7 @@ def check_region_start_end(self, start_node: Node, start_dim: int, start_idx: in
# check index source align
if not self.check_index_source(start_dim, start_node, start_idx, end_dim, end_node):
return False
- # check index copmute
+ # check index compute
if not self.check_index_compute(start_idx, end_dim, end_node, end_idx):
return False
return True
12 changes: 6 additions & 6 deletions colossalai/autochunk/trace_indice.py
@@ -8,7 +8,7 @@

class TraceIndice(object):
"""
- Trace all indice infomation for every node.
+ Trace all indice information for every node.

Indice is a logical concept. Equal dims can been treated as one indice.
eg. dim(x1) = [a, b, c]
@@ -153,7 +153,7 @@ def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None:

def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None:
"""
- inheirt indice from node without init
+ inherit indice from node without init
"""
if exclude == None:
exclude = []
@@ -301,7 +301,7 @@ def _assign_linear_indice(self, node: Node, node_idx: int) -> None:
def _assign_linear_indice(self, node: Node, node_idx: int) -> None:
"""
Assign indice for linear op.
- 1. copy trace from input node and change last indice accroding to weight
+ 1. copy trace from input node and change last indice according to weight
2. mark equal for input node last indice, weight first dim and bias dim.
3. inherit input's computation, mark computation for last dim.

@@ -360,7 +360,7 @@ def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
"""
Assign indice for matmul op.
- 1. copy trace from matmul_left and change last indice accroding to matmul_right. (assert they have same length)
+ 1. copy trace from matmul_left and change last indice according to matmul_right. (assert they have same length)
2. mark equal for input matmul_left -1 indice and matmul_right -2 dim.
3. inherit matmul_left and matmul_right computation, mark computation for last dim.

@@ -720,11 +720,11 @@ def _assign_view_reshape_indice(self, node: Node, node_idx: int) -> None:
Assign indice for view and reshape op.
1. get origin shape and target shape by meta info.
2. compute the real value of -1 in target shape.
- 3. determine changed dim, and assgin indice for generated dim.
+ 3. determine changed dim, and assign indice for generated dim.
4. log changed dim and generated dim for restore
5. inherit computation.
6. look into view list to see whether the view is associated with other,
- if so assgin equal dim according to previous view.
+ if so assign equal dim according to previous view.

Args:
node (node)
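As a side note, step 2 of the view/reshape docstring above (computing the real value of -1 in the target shape) is plain size arithmetic; a stock PyTorch illustration:

    import torch

    x = torch.randn(4, 6)  # origin shape holds 4 * 6 = 24 elements
    y = x.view(2, -1)      # -1 resolves to 24 // 2 = 12
    print(y.shape)         # torch.Size([2, 12])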
2 changes: 1 addition & 1 deletion colossalai/booster/booster.py
@@ -20,7 +20,7 @@
class Booster:
"""
Booster is a high-level API for training neural networks. It provides a unified interface for
- training with different precisio, accelerator, and plugin.
+ training with different precision, accelerator, and plugin.

Examples:
>>> colossalai.launch(...)
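The Booster example in this docstring is truncated in the diff; for orientation, a rough usage sketch (the plugin choice and the boost() return tuple are assumptions from typical usage and may differ by version; model, optimizer, criterion, dataloader, lr_scheduler stand for your usual torch training objects):

    import colossalai
    from colossalai.booster import Booster
    from colossalai.booster.plugin import TorchDDPPlugin

    colossalai.launch_from_torch(config={})
    booster = Booster(plugin=TorchDDPPlugin())
    # boost() wraps the raw training objects for the chosen precision/accelerator/plugin
    model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(
        model, optimizer, criterion, dataloader, lr_scheduler)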
8 changes: 4 additions & 4 deletions colossalai/checkpoint_io/checkpoint_io_base.py
@@ -71,7 +71,7 @@ def load_model(self,

Args:
model (nn.Module): model to be loaded.
- checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the
+ checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the
mainstream model zoos such as Hugging Face and TIMM. The checkpoint path can be:
1. a file path, e.g. 'model.pt'
2. a path to a json file which defines the index to the sharded checkpoint
@@ -127,7 +127,7 @@ def save_model(self,
1. a file path, e.g. 'model.pt'
2. a directory path to save the sharded checkpoint, e.g. './checkpoints/' when shard = True.
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
- multiple files. The model shards will be specificed by a `model.index.json` file. When shard = True, please ensure
+ multiple files. The model shards will be specified by a `model.index.json` file. When shard = True, please ensure
that the checkpoint path is a directory path instead of a file path.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
variant (str): If specified, weights are saved in the format pytorch_model.<variant>.bin. Default: None.
@@ -149,7 +149,7 @@ def load_optimizer(self, optimizer: Optimizer, checkpoint: str):

Args:
optimizer (Optimizer): optimizer to be loaded.
- checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the
+ checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the
"""
index_file_exists, index_file_path = has_index_file(checkpoint)

@@ -180,7 +180,7 @@ def save_optimizer(self,
2. a path to a json file which defines the index to the sharded checkpoint for the optimizer
3. a path to a folder containing a unique .index.json file for sharded checkpoint
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
- multiple files. The optimizer shards will be specificed by a `optimizer.index.json` file.
+ multiple files. The optimizer shards will be specified by a `optimizer.index.json` file.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
prefix (str): prefix for the optimizer checkpoint when shard = True. Default: None.
size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard is set to True.
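To make the shard semantics concrete, a hedged sketch of the save/load round trip described above (using GeneralCheckpointIO as the concrete implementation is an assumption, paths are hypothetical, and the exact keyword set may vary by version):

    from colossalai.checkpoint_io import GeneralCheckpointIO

    ckpt_io = GeneralCheckpointIO()
    # shard=True expects a directory path and writes multiple weight files
    # plus a model.index.json that maps weights to shards
    ckpt_io.save_model(model, './checkpoints/', shard=True, size_per_shard=1024)
    # load accepts a single file, an index json, or a folder containing one
    ckpt_io.load_model(model, './checkpoints/model.index.json')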
4 changes: 2 additions & 2 deletions colossalai/cli/check/check_installation.py
@@ -76,7 +76,7 @@ def check_installation():
click.echo("")
click.echo(f"Note:")
click.echo(
f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment varialbe CUDA_EXT=1 is set"
f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set"
)
click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime")

@@ -88,7 +88,7 @@ def check_installation():
click.echo(f"Note:")
click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment")
click.echo(
f" - PyTorch version mistach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
f" - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
)
click.echo(
f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch"
8 changes: 4 additions & 4 deletions colossalai/communication/p2p.py
@@ -103,10 +103,10 @@ def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = Non
previous rank.
recv_next (bool): boolean for whether tensor should be received from
next rank.
- recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defualts to None.
- recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defualts to None.
- prev_rank (int): the rank of the previous pipeline stage, defualts to None,
- next_rank (int): the rank of the next pipeline stage, defualts to None,
+ recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defaults to None.
+ recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defaults to None.
+ prev_rank (int): the rank of the previous pipeline stage, defaults to None,
+ next_rank (int): the rank of the next pipeline stage, defaults to None,
dtype (torch.dtype): data type of intermediate buffers, defaults to None
scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False

2 changes: 1 addition & 1 deletion colossalai/communication/p2p_v2.py
@@ -230,7 +230,7 @@ def recv_backward(next_rank: int = None) -> Any:
next_rank (int, optional): The rank of the source of the tensor.

Returns:
- Any: The input gradient tensor or gradident tensor list.
+ Any: The input gradient tensor or gradient tensor list.
"""
if gpc.is_pipeline_last_stage():
output_tensor_grad = None
2 changes: 1 addition & 1 deletion colossalai/context/moe_context.py
@@ -64,7 +64,7 @@ def setup(self, seed: int, use_kernel_optim: bool = True):
from colossalai.core import global_context as gpc
self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
assert self.world_size % self.max_ep_size == 0, \
"Maximum epxert parallel size must be a factor of the number of GPUs"
"Maximum expert parallel size must be a factor of the number of GPUs"
self.min_dp_size = self.world_size // self.max_ep_size

# Enabling kernel optimization may raise error in some cases
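The assertion whose message is fixed here encodes a simple divisibility rule; a toy check with hypothetical numbers:

    world_size = 8                    # total number of GPUs
    for max_ep_size in (1, 2, 4, 8):  # each is a factor of 8, so the assert passes
        assert world_size % max_ep_size == 0, \
            "Maximum expert parallel size must be a factor of the number of GPUs"
    # max_ep_size = 3 would trip the assertion: 8 % 3 != 0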
4 changes: 2 additions & 2 deletions colossalai/context/parallel_context.py
@@ -44,7 +44,7 @@ def __init__(self):
# load config from file
self._config = None

- # default 3D parallel args, will be overwritten during process group intialization
+ # default 3D parallel args, will be overwritten during process group initialization
self.world_size = 1
self.data_parallel_size = 1
self.pipeline_parallel_size = 1
@@ -264,7 +264,7 @@ def _add_world_size(self, parallel_mode: ParallelMode, world_size: int):
"""Adds world size for `parallel_mode`.

Args:
- parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode correponding to the process group
+ parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode corresponding to the process group
world_size (int): The world size to be added

Raises:
8 changes: 4 additions & 4 deletions colossalai/context/random/seed_manager.py
@@ -59,23 +59,23 @@ def set_mode(self, parallel_mode: ParallelMode):
self._current_mode = parallel_mode
torch.cuda.set_rng_state(self._seed_states[parallel_mode])

- def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
+ def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.

Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
seed (int): The seed to be added.
- overwrtie (bool, optional): Whether allows to overwrite the seed that has been set already
+ overwrite (bool, optional): Whether allows to overwrite the seed that has been set already

Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode`
or the seed for `parallel_mode` has been added.
"""
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
- if overwrtie is False:
+ if overwrite is False:
assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added'
elif parallel_mode in self._seed_states:
print(f"Warnning: {parallel_mode} seed has been overwritten.", flush=True)
print(f"Warning: {parallel_mode} seed has been overwritten.", flush=True)

current_state = torch.cuda.get_rng_state()
torch.cuda.manual_seed(seed)
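Unlike the pure docstring fixes elsewhere in this PR, this hunk renames a public keyword argument, so callers that passed it by name need updating as well; a minimal sketch (assuming SeedManager() takes no constructor arguments and a CUDA device is available):

    import torch
    from colossalai.context import ParallelMode
    from colossalai.context.random.seed_manager import SeedManager

    mgr = SeedManager()
    # the keyword is now `overwrite`; it was previously spelled `overwrtie`
    mgr.add_seed(ParallelMode.DATA, seed=1024, overwrite=True)
    mgr.set_mode(ParallelMode.DATA)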
2 changes: 1 addition & 1 deletion colossalai/fx/codegen/activation_checkpoint_codegen.py
@@ -305,7 +305,7 @@ def emit_ckpt_func(body,
delete_unused_value_func,
level=0,
in_ckpt=False):
"""Emit ckpt fuction in nested way
"""Emit ckpt function in nested way
Args:
body: forward code, in recursive calls, this part will be checkpoint
functions code
4 changes: 2 additions & 2 deletions colossalai/fx/passes/split_module.py
@@ -155,7 +155,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node
use_partition = partitions[use_partition_name]
use_partition.outputs.setdefault(def_node.name)

- # split nodes into parititons
+ # split nodes into partitions
for node in m.graph.nodes:
orig_nodes[node.name] = node

@@ -198,7 +198,7 @@ def record_output(def_node: torch.fx.node.Node, use_node: Optional[torch.fx.node
if len(sorted_partitions) != len(partitions):
raise RuntimeError("cycle exists between partitions!")

- # add placeholders to parititons
+ # add placeholders to partitions
for partition_name in sorted_partitions:
partition = partitions[partition_name]
for input in partition.inputs:
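The cycle check in the second hunk relies on a standard property of Kahn's algorithm: a topological sort of a DAG emits every node, so a shorter output implies a cycle. A self-contained sketch of that idea (illustrative only, not the actual ColossalAI code):

    from collections import deque

    def topo_sort(deps):
        # deps maps each partition name to the set of partitions it depends on
        indegree = {n: len(d) for n, d in deps.items()}
        users = {n: [m for m, d in deps.items() if n in d] for n in deps}
        queue = deque(n for n, deg in indegree.items() if deg == 0)
        order = []
        while queue:
            n = queue.popleft()
            order.append(n)
            for m in users[n]:
                indegree[m] -= 1
                if indegree[m] == 0:
                    queue.append(m)
        if len(order) != len(deps):
            raise RuntimeError("cycle exists between partitions!")
        return order

    print(topo_sort({"p0": set(), "p1": {"p0"}, "p2": {"p1"}}))  # ['p0', 'p1', 'p2']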
2 changes: 1 addition & 1 deletion colossalai/kernel/cuda_native/multihead_attention.py
@@ -111,7 +111,7 @@ class MultiHeadAttention(nn.Module):
Arguments:
hidden_size: Total dimension of hidden_size.
nhead: Number of parallel attention heads.
- batch_size: Batch Size for one foward
+ batch_size: Batch Size for one forward
max_seq_len: Max length of input sequence
dropout: Dropout probability
norm_first: perform LayerNorms before attention
2 changes: 1 addition & 1 deletion colossalai/nn/_ops/embedding_bag.py
@@ -88,7 +88,7 @@ def colo_embedding_bag(input_tensor: GeneralTensor,
assert isinstance(weight, ColoTensor)
input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group())

- # Handle differen parallel actions.
+ # Handle different parallel actions.

if not weight.has_compute_spec(): # No Model Parallel Applied
assert weight.is_replicate(), 'Invalid weight spec for native embedding op'