Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions colossalai/trainer/_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ class Trainer:
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> # Beginning training progress
>>> timier = ...
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, timer=timier)
>>> trainer = Trainer(engine=engine, logger=logger, timer=timer)
>>> # add hooks you would like to use here.
>>> hook_list = []
>>> trainer.fit(
Expand All @@ -56,7 +56,7 @@ def __init__(
timer: MultiTimer = None,
logger: DistributedLogger = None,
):
# training-ralated params
# training-related params
self._engine = engine
self._max_epochs = 0
self._cur_epoch = 0
Expand Down Expand Up @@ -118,7 +118,7 @@ def _set_current_step(self, epoch: int):
self._cur_step = epoch * self._steps_per_epoch

def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
"""Call timer funciton with a given timer name.
"""Call timer function with a given timer name.

Args:
action (str): Function to be called on timer.
Expand Down
2 changes: 1 addition & 1 deletion colossalai/utils/data_sampler/data_parallel_sampler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# adpated from torch.utils.data.DistributedSampler
# adapted from torch.utils.data.DistributedSampler

import math
import random
Expand Down
2 changes: 1 addition & 1 deletion colossalai/utils/model/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def _init_subclass(cls, **kwargs):
cls.__init__ = preprocess_after(cls.__init__)

# Replace .__init__() for all existing subclasses of torch.nn.Module
# Excution self._post_init_method after the default init function.
# Execute self._post_init_method after the default init function.
substitute_init_recursively(torch.nn.modules.module.Module, _enable_class, set())

# holding on to the current __init__subclass__ for exit
Expand Down
8 changes: 4 additions & 4 deletions colossalai/utils/profiler/legacy/comm_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def append(s: str = None):
res.append(sep)

if self.warn_flag:
append("Warnning: there exists multiple communication operations in the same time. As a result, "
append("Warning: there exists multiple communication operations in the same time. As a result, "
"the profiling result is not accurate.")

if self.total_cuda_time == 0:
Expand All @@ -123,12 +123,12 @@ def append(s: str = None):
append("total number of calls: {}".format(self.total_count))
append("All events:")

seperation = '-' * 74
separation = '-' * 74
row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2

append(seperation)
append(separation)
append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls'))
append(seperation)
append(separation)

show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time)
for location, event in show_list:
Expand Down
6 changes: 3 additions & 3 deletions colossalai/utils/profiler/legacy/pcie_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,12 @@ def append(s: str = None):

append("Possible data transmission events in PCIE:")

seperation = '-' * 62
separation = '-' * 62
row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2

append(seperation)
append(separation)
append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls'))
append(seperation)
append(separation)

show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time)
for location, event in show_list:
Expand Down
4 changes: 2 additions & 2 deletions colossalai/utils/profiler/legacy/prof_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def _format_memory(nbytes):
return str(nbytes) + ' B'


def _format_bandwidth(volme: float or int, time_us: int):
def _format_bandwidth(volume: float or int, time_us: int):
sec_div_mb = (1000.0 / 1024.0)**2
mb_per_sec = volme / time_us * sec_div_mb
mb_per_sec = volume / time_us * sec_div_mb

if mb_per_sec >= 1024.0:
return '{:.3f} GB/s'.format(mb_per_sec / 1024.0)
Expand Down
4 changes: 2 additions & 2 deletions colossalai/utils/rank_recorder/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Rank Recorder
This is a useful tool to get the records of certain functions in each rank. The records of each rank will dump into a json file after the end of multiple process program. You can parse and visualise the json file easily.
This is a useful tool to get the records of certain functions in each rank. The records of each rank will be dumped into a JSON file when the multi-process program ends. You can parse and visualize the JSON file easily.

Before using the tool, make sure that dist.is_initialized() returns true before the program exits.

Expand All @@ -20,7 +20,7 @@ with recorder(record_name, current_rank) as r:
```

## Example
This is a demo to display kernel select in cuda and visualise the cost of several procedures in each rank.
This is a demo to display kernel select in cuda and visualize the cost of several procedures in each rank.

```python
import time
Expand Down
4 changes: 2 additions & 2 deletions colossalai/utils/rank_recorder/rank_recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def merge_recode(self):
with open(self.export_name + '.json', 'w', encoding='utf-8') as f:
json.dump(recoders, f, ensure_ascii=False)

def visualise_record(self):
def visualize_record(self):
with open(self.export_name + '.json', 'r', encoding='utf-8') as f:
records = json.load(f)
records = dict(records)
Expand Down Expand Up @@ -171,7 +171,7 @@ def exit_worker(self):
if rank == 1:
# take the base time of rank 0 as standard
self.merge_recode()
self.visualise_record()
self.visualize_record()


recorder = Recorder()
Expand Down
2 changes: 1 addition & 1 deletion colossalai/zero/gemini/chunk/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data_slice: torch.Ten
Copy data slice to the memory space indexed by the input tensor in the chunk.

Args:
tensor (torch.Tensor): the tensor used to retrive meta information
tensor (torch.Tensor): the tensor used to retrieve meta information
data_slice (torch.Tensor): the tensor to be copied to the chunk
"""
# sanity check
Expand Down
2 changes: 1 addition & 1 deletion colossalai/zero/gemini/chunk/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data: torch.Tensor) -
Copy data to the chunk.

Args:
tensor (torch.Tensor): the tensor used to retrive meta information
tensor (torch.Tensor): the tensor used to retrieve meta information
data (torch.Tensor): the tensor to be copied to the chunk
"""
chunk = self.tensor_chunk_map[tensor]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self, chunk_manager: ChunkManager, memstats: Optional[MemStats] = N
# override
def record_model_data_volume(self) -> None:
"""
record model data volumn on cuda and cpu.
record model data volume on cuda and cpu.
"""
if self._start_flag and not self.use_outside_memstats:
cuda_mem = self._chunk_manager.total_mem['cuda']
Expand Down
4 changes: 2 additions & 2 deletions colossalai/zero/gemini/memory_tracer/memory_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def clear(self):

class AsyncMemoryMonitor(MemoryMonitor):
"""
An Async Memory Monitor runing during computing. Sampling memory usage of the current GPU
An Async Memory Monitor running during computing. Sampling memory usage of the current GPU
at interval of `1/(10**power)` sec.

The idea comes from Runtime Memory Tracer of PatrickStar
Expand All @@ -67,7 +67,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
async_mem_monitor.save('log.pkl')

Args:
power (int, optional): the power of time interva. Defaults to 10.
power (int, optional): the power of time interval. Defaults to 10.

.. _PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
https://arxiv.org/abs/2108.05818
Expand Down
2 changes: 1 addition & 1 deletion colossalai/zero/gemini/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def get_static_torch_model(zero_ddp_model,
zero_ddp_model (ZeroDDP): a zero ddp model
device (torch.device): the device of the final torch model
dtype (torch.dtype): the dtype of the final torch model
only_rank_0 (bool): if True, only rank0 has the coverted torch model
only_rank_0 (bool): if True, only rank0 has the converted torch model

Returns:
torch.nn.Module: a static torch model used for saving checkpoints or numeric checks
Expand Down
4 changes: 2 additions & 2 deletions colossalai/zero/legacy/gemini/ophooks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def register_ophooks_recursively(module: torch.nn.Module,
ophook_list: List[BaseOpHook],
name: str = "",
filter_fn: Optional[Callable] = None):
r"""Recursilvely register pre/post hooks for all submodules in the module in FWD and BWD."""
r"""Recursively register pre/post hooks for all submodules in the module in FWD and BWD."""
assert isinstance(module, torch.nn.Module)
assert isinstance(ophook_list, (list, tuple))
assert len(ophook_list) > 0, 'expected at least 1 hook in the argument ophook_list but found 0'
Expand All @@ -103,7 +103,7 @@ def register_ophooks_recursively(module: torch.nn.Module,
if len(list(module.parameters(recurse=False))) == 0:
return

# return from flitered module
# return from filtered module
if filter_fn is not None and filter_fn(module):
return

Expand Down
2 changes: 1 addition & 1 deletion colossalai/zero/legacy/gemini/tensor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor], t
move a tensor to the target_device
Args:
t (Union[StatefulTensor, torch.Tensor]): the tensor to be moved
target_device: a traget device, if type is int, it the index of cuda card.
target_device: a target device; if its type is int, it is the index of the cuda card.
"""
if not isinstance(target_device, torch.device):
target_device = torch.device(f'cuda:{target_device}')
Expand Down