Merged
Changes from all commits
Commits
19 commits
39f1da3
fix typo colossalai/autochunk auto_parallel amp
digger-yu May 17, 2023
73d710a
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu May 19, 2023
dce5d3d
fix typo colossalai/auto_parallel nn utils etc.
digger-yu May 19, 2023
29c56d3
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu May 23, 2023
86ad586
fix typo colossalai/auto_parallel autochunk fx/passes etc.
digger-yu May 23, 2023
df52fee
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu May 24, 2023
9636e44
fix typo docs/
digger-yu May 24, 2023
750cd5e
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu May 24, 2023
6d4c219
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu May 24, 2023
767ee4e
change placememt_policy to placement_policy in docs/ and examples/
digger-yu May 24, 2023
0d46760
fix typo colossalai/ applications/
digger-yu May 24, 2023
6de1c6d
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu May 24, 2023
f5654c1
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu May 25, 2023
14ca05f
fix typo colossalai/cli fx kernel
digger-yu May 25, 2023
171f7de
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu Jun 2, 2023
b648738
fix typo colossalai/nn
digger-yu Jun 2, 2023
441ac19
revert change warmuped
digger-yu Jun 2, 2023
53839b9
Merge branch 'main' of https://github.com/digger-yu/ColossalAI
digger-yu Jun 5, 2023
3b28efa
fix typo colossalai/pipeline tensor nn
digger-yu Jun 5, 2023
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/cpu_adam.py
@@ -13,7 +13,7 @@
class CPUAdam(NVMeOptimizer):
"""Implements Adam algorithm.

- Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
+ Supports parameters updating on both GPU and CPU, depending on the device of parameters.
But the parameters and gradients should on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
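For readers skimming the diff, a minimal usage sketch of the optimizer this docstring belongs to, assuming the `torch.optim.Adam`-style constructor that `CPUAdam` exposes, with parameters and gradients kept on the same device as required above:

```python
import torch
import torch.nn as nn
from colossalai.nn.optimizer import CPUAdam

# Minimal sketch, assuming the torch.optim.Adam-style signature.
# Parameters and gradients both live on CPU, matching the rule above.
model = nn.Linear(16, 4)                        # parameters on CPU
optimizer = CPUAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 16)).sum()
loss.backward()                                 # gradients on CPU as well
optimizer.step()
```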
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/hybrid_adam.py
@@ -14,7 +14,7 @@
class HybridAdam(CPUAdam):
"""Implements Adam algorithm.

- Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
+ Supports parameters updating on both GPU and CPU, depending on the device of parameters.
But the parameters and gradients should on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
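`HybridAdam` subclasses `CPUAdam` (see the previous file), so construction follows the same assumed signature; a sketch with GPU-resident parameters, which the docstring above also allows (assumes a CUDA device is available):

```python
import torch
import torch.nn as nn
from colossalai.nn.optimizer import HybridAdam

# Sketch only: same constructor shape as CPUAdam, but parameters (and hence
# gradients) sit on the GPU.
model = nn.Linear(16, 4).cuda()
optimizer = HybridAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 16, device="cuda")).sum()
loss.backward()
optimizer.step()
```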
8 changes: 4 additions & 4 deletions colossalai/pipeline/pipelinable.py
@@ -83,7 +83,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
for k, v in kwargs.items():
if isinstance(v, torch.nn.Module):
v = self._layer_spec_dict[id(v)]
- # (lyl)TODO: analyse ColoTensor as well
+ # (lyl)TODO: analyze ColoTensor as well
modified_kwargs[k] = v

# keep track of the module children
@@ -117,7 +117,7 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
def to_layer_list(self, exec_seq=None):
"""
Create a layer spec list and func list with execution sequence given by user.
- If exec_seq is None, we will take the module initizing order as execution order.
+ If exec_seq is None, we will take the module initializing order as execution order.
"""

self._exec_seq = exec_seq
@@ -177,7 +177,7 @@ def to_layer_list(self, exec_seq=None):

def partition(self, num_chunks, pipeline_size, rank):
"""
- Partitioned model will be built respect to partion policy.
+ Partitioned model will be built respect to partition policy.
The real module instance will be built in this method.
"""
if isinstance(self._policy, str):
@@ -193,7 +193,7 @@ def partition(self, num_chunks, pipeline_size, rank):
self.customized_parts = customized_partition(self._exec_seq)
assert len(self.customized_parts) == gpc.get_world_size(
ParallelMode.PIPELINE
- ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partions is {len(self.customized_parts)}'
+ ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}'
parts = self.customized_parts[rank]
else:
raise ValueError("A string partition policy should be one of ['uniform', 'balanced', 'customized'].")
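Taken together, the `to_layer_list` and `partition` hunks outline the intended workflow of the pipelinable context; a hedged sketch of that call sequence (assumes the distributed environment was already initialized via `colossalai.launch`, and that the context exposes a `policy` setter as in the project tutorials):

```python
import torch.nn as nn
from colossalai.pipeline.pipelinable import PipelinableContext

# Sketch of the workflow the hunks above document.
pipelinable = PipelinableContext()
with pipelinable:
    # Layers built inside the context are recorded as layer specs.
    model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))

# exec_seq=None: module-initializing order is used as the execution order.
pipelinable.to_layer_list()
pipelinable.policy = "uniform"   # or 'balanced' / 'customized' (see above)
# Build the real module instance for this rank's stage of the pipeline.
stage_module = pipelinable.partition(num_chunks=1, pipeline_size=2, rank=0)
```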
10 changes: 5 additions & 5 deletions colossalai/pipeline/rpc/_pipeline_base.py
@@ -123,7 +123,7 @@ def __init__(self,
self.device = device
self._initialize_outstanding_range()

- # variable and const for context managment
+ # variable and const for context management
self.outstanding = 0
self.forward_times = 0
self.backward_times = 0
@@ -226,7 +226,7 @@ def sync_global_worker_rrefs(self, pp_rank_to_worker_rref: Dict[int, PyRRef]) ->
self.pp_rank_to_worker_rref = pp_rank_to_worker_rref

# for some schedule need the other worker's info to initialise partition (like Chimera)
- # construction of partition is executed after the registion of pp_rank_to_worker_rref
+ # construction of partition is executed after the registration of pp_rank_to_worker_rref
self._initialize_partition()

# res_use works for lifecycle counter,
@@ -418,7 +418,7 @@ def subscribe_producer(self, microbatch_id: int, forward_only: bool):
# On current PP middleware design for DAG, get_output_by_key used by _subscribe_producer
# can only be executed once for every producer-consumer stage pair, which is necessary
# to count the lifecycle of work_item. So, keeping the _subscribe_producer in the same
- # lock of work_item queue operation gurantees the consistency of lifecycle counter.
+ # lock of work_item queue operation guarantees the consistency of lifecycle counter.
work_item_from_producer = self._subscribe_producer(microbatch_id, forward_only)
self.work_list[key] = work_item_from_producer
self.work_list_condition_lock.notify_all()
@@ -460,7 +460,7 @@ def subscribe_consumer(self, microbatch_id: int):
# On current PP middleware design for DAG, get_output_by_key used by subscribe_consumer
# can only be executed once for every producer-consumer stage pair, which is necessary
# to count the lifecycle of work_item. So, keeping the subscribe_consumer in the same
- # lock of work_item queue operation gurantees the consistency of lifecycle counter.
+ # lock of work_item queue operation guarantees the consistency of lifecycle counter.
work_item_from_consumer = self._subscribe_consumer(microbatch_id)
self.work_list[key] = work_item_from_consumer
self.work_list_condition_lock.notify_all()
@@ -508,7 +508,7 @@ def _get_producer_consumer(self) -> None:
assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"

- # should be aranged in order, the order of the input of current forward
+ # should be arranged in order, the order of the input of current forward
self.producer_stage_ids = self.get_producer_stage_ids()
self.consumer_stage_ids = self.get_consumer_stage_ids()

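The two `guarantees` hunks in this file document the same invariant: the subscribe call and the work-list insertion must sit in one critical section so the per-item lifecycle counter stays consistent. A stripped-down sketch of that pattern in plain `threading` terms (a toy stand-in, not the actual worker class):

```python
import threading

class WorkerSketch:
    """Toy illustration of the lock discipline described above."""

    def __init__(self):
        self.work_list = {}
        self.work_list_condition_lock = threading.Condition()

    def subscribe_producer(self, key, fetch_once):
        # fetch_once (standing in for _subscribe_producer) may run only once
        # per producer-consumer pair, so it must stay inside the same lock
        # as the work_list insertion; splitting them could let another
        # thread observe an inconsistent lifecycle count.
        with self.work_list_condition_lock:
            self.work_list[key] = fetch_once(key)
            self.work_list_condition_lock.notify_all()
```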
6 changes: 3 additions & 3 deletions colossalai/pipeline/rpc/_pipeline_schedule.py
@@ -123,7 +123,7 @@ def _get_producer_consumer(self) -> None:
assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"

- # should be aranged in order, the order of the input of current forward
+ # should be arranged in order, the order of the input of current forward
self.producer_stage_ids = []
self.consumer_stage_ids = []

@@ -174,7 +174,7 @@ def _initialize_partition(self):
else:
# if it is down pipeline, create partition by origin method
co_up_pp_worker_rref = self.pp_rank_to_worker_rref[pp_rank - stage_num]
- # get the coresponding model state dict and wait for its init
+ # get the corresponding model state dict and wait for its init
state_dict = co_up_pp_worker_rref.rpc_sync().get_partition_state_dict()
super()._initialize_partition()
self.module_partition.load_state_dict(state_dict)
@@ -228,7 +228,7 @@ def _hook_before_step(self):
stage_num = self.actual_stage_num
co_pp_rank = (pp_rank + stage_num) % (2 * stage_num)

- # if currrent pp_rank is not the first to do step
+ # if current pp_rank is not the first to do step
# wait its previous pp_rank finish step
grads = self.get_parameter_gradients()

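The `co_pp_rank` expression visible in the last hunk pairs every rank of the up pipeline with its counterpart in the down pipeline; a standalone illustration of that mapping:

```python
# Illustration of the co-rank arithmetic from the hunk above: with stage_num
# stages per pipeline, ranks [0, stage_num) form the up pipeline and ranks
# [stage_num, 2 * stage_num) the down pipeline.
stage_num = 4
for pp_rank in range(2 * stage_num):
    co_pp_rank = (pp_rank + stage_num) % (2 * stage_num)
    print(f"pp_rank {pp_rank} <-> co_pp_rank {co_pp_rank}")
# pp_rank 0 pairs with 4, 1 with 5, ..., and 4 pairs back with 0.
```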
2 changes: 1 addition & 1 deletion colossalai/pipeline/utils.py
@@ -113,7 +113,7 @@ def _binary_search(weights, num):

def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
assert num_items % num_chunks == 0, \
"Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
"Layer length should be divided by the number of chunks, otherwise parameter method is recommended"

logger = get_dist_logger()
parts = [[] for _ in range(pipeline_parallel_size)]
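The assertion in this hunk states the contract of `partition_uniform`; a self-contained sketch of the split it implies (the real implementation also balances remainders, which is omitted here):

```python
def partition_uniform_sketch(num_items, pipeline_parallel_size, num_chunks):
    """Hedged sketch: cut num_items layers into num_chunks equal blocks and
    deal each block out contiguously across the pipeline stages."""
    assert num_items % num_chunks == 0, \
        "Layer length should be divided by the number of chunks"
    parts = [[] for _ in range(pipeline_parallel_size)]
    block = num_items // num_chunks
    chunk_size = block // pipeline_parallel_size
    for idx in range(num_chunks):
        base = idx * block
        for p in range(pipeline_parallel_size):
            start = base + p * chunk_size
            parts[p].append((start, start + chunk_size))
    return parts

# partition_uniform_sketch(8, 2, 1) -> [[(0, 4)], [(4, 8)]]
```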
2 changes: 1 addition & 1 deletion colossalai/tensor/d_tensor/comm_spec.py
@@ -28,7 +28,7 @@ class CommSpec:
to determine the buffer shape, and logical_process_axis

Argument:
- comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec.
+ comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
shard_dim(int, Optional): The shard_dim of the tensor will be sharded.
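For readers new to this file, the documented arguments map onto a small record; the following hypothetical dataclass only mirrors the docstring's fields for illustration (the real `CommSpec` is a full class, and `CollectiveCommPattern` is stood in for by a plain string):

```python
from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class CommSpecSketch:
    # Hypothetical mirror of the documented constructor arguments.
    comm_pattern: str                           # communication method used
    process_groups_dict: Dict[int, List]        # process groups to apply it on
    gather_dim: Optional[int] = None            # dim the tensor is gathered on
    shard_dim: Optional[int] = None             # dim the tensor is sharded on
    logical_process_axis: Optional[int] = None  # mesh axis the op runs along
```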
4 changes: 2 additions & 2 deletions colossalai/tensor/d_tensor/sharding_spec.py
@@ -41,7 +41,7 @@ def __repr__(self):

def _convert_str_to_shard_list(self, str_spec):
'''
- Conver str_spec into shard_list.
+ Convert str_spec into shard_list.

Argument:
str_spec(str): dim spec in str type.
@@ -58,7 +58,7 @@ def _convert_str_to_shard_list(self, str_spec):

def build_difference_2d_dict(self):
'''
- Build a difference maping for 2D device mesh case. It will be used to
+ Build a difference mapping for 2D device mesh case. It will be used to
compute the difference between DimSpec pairs.
'''

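Both hunks in this file touch the string-spec helpers; a hedged sketch of what `_convert_str_to_shard_list` does with specs such as `'R'`, `'S0'`, or `'S01'` (behavior inferred from the docstrings, not copied from the implementation):

```python
def convert_str_to_shard_list(str_spec):
    """Sketch: 'R' means replicated (no shard axes); 'S01' shards the
    dimension over logical mesh axes 0 and 1."""
    if str_spec == 'R':
        return []
    assert str_spec.startswith('S'), f'invalid dim spec: {str_spec}'
    return [int(axis) for axis in str_spec[1:]]

# convert_str_to_shard_list('R')   -> []
# convert_str_to_shard_list('S0')  -> [0]
# convert_str_to_shard_list('S01') -> [0, 1]
```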
2 changes: 1 addition & 1 deletion colossalai/tensor/param_op_hook.py
@@ -164,7 +164,7 @@ def _get_grad_args(*args):
for obj in args:
if _is_grad_tensor(obj):
return args, None
- # otherwise, the first arguement should be a tuple of grad tensors
+ # otherwise, the first argument should be a tuple of grad tensors
# if there is no grad tensor, the backward of PreFwdPostBwd can't be triggered
arg_zero = args[0]
if not isinstance(arg_zero, tuple):
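The corrected comment documents a fallback path: if no positional argument is itself a grad tensor, the first argument is expected to be a tuple of grad tensors. A condensed sketch of that dispatch (the grad-tensor test and the return shape for the tuple case are assumptions):

```python
import torch

def is_grad_tensor(obj):
    # Sketch: a tensor participates in autograd if it carries a grad_fn or
    # requires grad itself.
    return isinstance(obj, torch.Tensor) and (obj.grad_fn is not None or obj.requires_grad)

def get_grad_args_sketch(*args):
    for obj in args:
        if is_grad_tensor(obj):
            return args, None
    # otherwise, the first argument should be a tuple of grad tensors;
    # without any grad tensor the backward hook can't be triggered
    arg_zero = args[0]
    assert isinstance(arg_zero, tuple), 'expected a tuple of grad tensors'
    return arg_zero, args[1:]
```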
2 changes: 1 addition & 1 deletion colossalai/tensor/process_group.py
@@ -130,7 +130,7 @@ def set_cpu_groups(self):
@property
def has_cpu_groups(self) -> bool:
"""has_cpu_groups
- If cpu groups have been initailized.
+ If cpu groups have been initialized.

Returns:
bool: cpu process groups have been initialized or not.
6 changes: 3 additions & 3 deletions colossalai/tensor/shape_consistency.py
@@ -252,7 +252,7 @@ def get_all_all_to_all_spec(self, source_spec: ShardingSpec,
def get_all_shard_spec(self, source_spec: ShardingSpec, orig_cost_dict):
'''
Get all valid sharding specs from source_spec with single shard operation, and
- accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+ accumulate communication cost on origin cost which will finally be used in auto sharding solver.
For the sharding operation, we just care about legal sharding dimensions.

Argument:
@@ -386,7 +386,7 @@ def get_all_mix_gather_spec(self, source_spec: ShardingSpec,
def get_all_one_step_transform_spec(self, source_spec: ShardingSpec, orig_cost_dict) -> Dict[ShardingSpec, float]:
'''
Get all valid sharding specs from source_spec with one step transform, and
- accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+ accumulate communication cost on origin cost which will finally be used in auto sharding solver.
Note:
all-gather will eliminate a sharding dimension, all-to-all will keep sharding dimension same as before,
and shard will add a sharding dimension. Therefore, the result of above operations are mutual exclusive,
@@ -577,7 +577,7 @@ def shape_consistency(self, source_spec: ShardingSpec,
Step3:
Repeat above steps until the source spec transform to target spec.

- During finding the transform path, commucation cost will be accumulated, and it
+ During finding the transform path, communication cost will be accumulated, and it
will be finally used in auto parallel solver.

Additionally, to avoid repeating the path search in runtime, we cached all solved path
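The docstrings above describe a search over one-step transforms (all-gather removes a sharding dimension, all-to-all keeps the count, shard adds one) with communication cost accumulated along the path. The library's actual search strategy is not shown in this diff; the following Dijkstra-style sketch only illustrates the idea of a cheapest transform path:

```python
from heapq import heappop, heappush
from itertools import count

def cheapest_transform_path(source, target, one_step_specs):
    """Sketch: one_step_specs(spec) yields (next_spec, comm_cost) pairs,
    standing in for the get_all_*_spec generators above. Specs are assumed
    hashable; costs accumulate along the path as the docstrings describe."""
    tie = count()  # tie-breaker so the heap never compares specs directly
    frontier = [(0.0, next(tie), source, [source])]
    visited = set()
    while frontier:
        cost, _, spec, path = heappop(frontier)
        if spec == target:
            return path, cost
        if spec in visited:
            continue
        visited.add(spec)
        for nxt, step_cost in one_step_specs(spec):
            if nxt not in visited:
                heappush(frontier, (cost + step_cost, next(tie), nxt, path + [nxt]))
    raise ValueError('target spec unreachable from source spec')
```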
6 changes: 3 additions & 3 deletions colossalai/tensor/sharding_spec.py
@@ -45,7 +45,7 @@ def __repr__(self):

def _convert_str_to_shard_list(self, str_spec):
'''
- Conver str_spec into shard_list.
+ Convert str_spec into shard_list.

Argument:
str_spec(str): dim spec in str type.
@@ -62,7 +62,7 @@ def _convert_str_to_shard_list(self, str_spec):

def build_difference_2d_dict(self):
'''
- Build a difference maping for 2D device mesh case. It will be used to
+ Build a difference mapping for 2D device mesh case. It will be used to
compute the difference between DimSpec pairs.
'''

@@ -166,7 +166,7 @@ class ShardingSpec:
device_mesh(DeviceMesh): A logical view of a physical mesh.
entire_shape(torch.Size): The entire shape of tensor before sharded.
dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
- and the value of the key decribe which logical axis will be sharded in that dimension.
+ and the value of the key describe which logical axis will be sharded in that dimension.
sharding_sequence(List[_DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
'''

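The `dim_partition_dict` wording fixed in the last hunk is easiest to read alongside a concrete value; a hypothetical spec for a rank-4 tensor on a 2D logical mesh (illustrative values only, not taken from the PR):

```python
# Tensor dims 0 and 1 replicated; dim 2 sharded over logical mesh axis 0,
# dim 3 over logical mesh axis 1 -- the [R, R, S0, S1] view in the docstring.
dim_partition_dict = {2: [0], 3: [1]}
```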
2 changes: 1 addition & 1 deletion colossalai/tensor/utils.py
@@ -77,7 +77,7 @@ def shard_simulator(target_pair, legal_sharding_dims):

Argument:
target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
- and the second element decribes which logical axis will be sharded in that dimension.
+ and the second element describes which logical axis will be sharded in that dimension.
'''
_, shard_list = target_pair
shard_list_list = []