diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 5d3f3e5530cb..dc0df0517508 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -238,7 +238,7 @@ def initialize(model: nn.Module,
     loaded into gpc.config.
 
     Args:
-        model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
+        model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
         optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
             Your optimizer instance.
         criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
diff --git a/colossalai/zero/gemini/memory_tracer/utils.py b/colossalai/zero/gemini/memory_tracer/utils.py
index 6962c058110e..65f6ba775139 100644
--- a/colossalai/zero/gemini/memory_tracer/utils.py
+++ b/colossalai/zero/gemini/memory_tracer/utils.py
@@ -7,7 +7,7 @@ def colo_model_optimizer_usage(optim) -> Tuple[int, int]:
     """Trace the optimizer memory usage
 
     Args:
-        optim (ShardedOptimV2): an instance of ShardedOptimver
+        optim (ShardedOptimV2): an instance of ShardedOptimizer
 
     Returns:
         Tuple[int, int]: cuda/cpu memory usage in Byte
diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/zero/legacy/init_ctx/init_context.py
index a3fa46b38b5a..84e2d2f4f8e1 100644
--- a/colossalai/zero/legacy/init_ctx/init_context.py
+++ b/colossalai/zero/legacy/init_ctx/init_context.py
@@ -46,7 +46,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
     """A context to initialize model.
 
     1. Convert the model to fp16.
-    2. The paramaters of the module are adapted to type ShardedParameter.
+    2. The parameters of the module are adapted to type ShardedParameter.
     3. Shard the param and grad according to flags.
 
     Args:
diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
index be3842beb208..e7064277fb3c 100644
--- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py
@@ -69,7 +69,7 @@ class ShardedModelV2(nn.Module):
             If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
             Note that 'auto' policy can only work well when no other processes use CUDA during your training.
             Defaults to 'cuda'.
-        gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0.
+        gradient_predivide_factor (Optional[float], optional): Gradient is divided by this value before reduce-scatter. Defaults to 1.0.
         reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad.
             Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation.
             In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad).
@@ -205,7 +205,7 @@ def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
             exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
+            self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0])
            if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
@@ -385,7 +385,7 @@ def _save_grad(self, param: Parameter, grad: torch.Tensor):
 
                 # make parameters point to gradient
                 assert param.colo_attr.saved_grad.is_null(
-                ), 'Gradien accumulation is not supported when reuse_fp16_shard=True'
+                ), 'Gradient accumulation is not supported when reuse_fp16_shard=True'
 
                 param.colo_attr.grad_payload_reset(grad.data)
                 # release the memory of param
diff --git a/colossalai/zero/low_level/_utils.py b/colossalai/zero/low_level/_utils.py
index afc98e7a7f54..218f7603bc54 100644
--- a/colossalai/zero/low_level/_utils.py
+++ b/colossalai/zero/low_level/_utils.py
@@ -261,7 +261,7 @@ def sync_param(flat_tensor, tensor_list):
     share the same memory space. This function will update the tensor list so that
     they point to the same value.
 
-    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor lsit
+    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor list
     :param tensor_list: A list of tensors corresponding to the flattened tensor
     :type flat_tensor: torch.Tensor
     :type tensor_list: List[torch.Tensor]
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index d4d03e5b5fcd..ee03c0f0ae15 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -207,8 +207,8 @@ def __init__(
             for param in self._working_param_groups[group_id]:
                 self._param_store.set_param_reduction_state(param, False)
 
-        # intialize communication stream for
-        # communication-compuation overlapping
+        # initialize communication stream for
+        # communication-computation overlapping
         if self._overlap_communication:
             self._comm_stream = torch.cuda.Stream()
 
@@ -269,7 +269,7 @@ def _partition_param_list(self, param_list):
         params_per_rank = [[] for _ in range(self._world_size)]
         numel_per_rank = [0 for _ in range(self._world_size)]
 
-        # partititon the parameters in a greedy fashion
+        # partition the parameters in a greedy fashion
         sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True)
         for param in sorted_params:
             # allocate this parameter to the rank with
@@ -297,7 +297,7 @@ def _attach_reduction_hook(self):
             param_group = self._working_param_groups[group_id]
             for param in param_group:
                 if param.requires_grad:
-                    # determines the reduction destionation rank
+                    # determines the reduction destination rank
                     # this is only valid for stage 2
                     # dst_rank = None means using all-reduce
                     # else using reduce
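
Note on the gradient_predivide_factor docstring touched above: the snippet below is a minimal, hypothetical sketch (not ColossalAI's actual reduce path) of dividing a flat gradient by a pre-divide factor before reduce-scattering it across ranks. It assumes torch.distributed is already initialized and that the gradient's length is divisible by the world size; the helper name is made up for illustration.

import torch
import torch.distributed as dist

def predivide_and_reduce_scatter(flat_grad: torch.Tensor, factor: float) -> torch.Tensor:
    # hypothetical helper, for illustration only
    world_size = dist.get_world_size()
    if factor != 1.0:
        flat_grad.div_(factor)                      # pre-divide before communication
    shards = list(flat_grad.chunk(world_size))      # one equal-sized shard per rank
    out = torch.empty_like(shards[dist.get_rank()])
    dist.reduce_scatter(out, shards)                # each rank receives the reduced shard it owns
    return out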
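
Similarly, for the sync_param docstring in colossalai/zero/low_level/_utils.py, here is a small sketch of the flatten/unflatten pattern it describes: build one flat buffer, then re-point the original tensors at views of that buffer so both share storage. The two torch._utils helpers exist in PyTorch; the rest is illustrative rather than the library's own code.

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

tensor_list = [torch.randn(4), torch.randn(2, 3)]
flat = _flatten_dense_tensors(tensor_list)             # one contiguous buffer
views = _unflatten_dense_tensors(flat, tensor_list)    # same-shaped views into that buffer
for t, v in zip(tensor_list, views):
    t.data = v.data                                    # t now shares memory with flat
flat.add_(1.0)                                         # change is visible through every tensor in tensor_list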
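
Finally, the "partition the parameters in a greedy fashion" comment fixed in _partition_param_list refers to the usual least-loaded heuristic: sort parameters by element count in descending order, then always hand the next one to the rank that currently holds the fewest elements. A self-contained sketch under that reading (the function below is hypothetical, not the optimizer's own code):

import torch

def greedy_partition(param_list, world_size):
    # illustrative version of the greedy scheme described in the comment
    params_per_rank = [[] for _ in range(world_size)]
    numel_per_rank = [0] * world_size
    for p in sorted(param_list, key=lambda x: x.numel(), reverse=True):
        rank = numel_per_rank.index(min(numel_per_rank))   # least-loaded rank so far
        params_per_rank[rank].append(p)
        numel_per_rank[rank] += p.numel()
    return params_per_rank

params = [torch.nn.Parameter(torch.empty(n)) for n in (7, 5, 3, 2, 1)]
print([sum(p.numel() for p in bucket) for bucket in greedy_partition(params, 2)])   # -> [9, 9]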