2 changes: 1 addition & 1 deletion applications/Chat/coati/dataset/reward_dataset.py
@@ -6,7 +6,7 @@
from .utils import is_rank_0


# Dahaos/rm-static
# Dahoas/rm-static
class RmStaticDataset(Dataset):
"""
Dataset for reward model
@@ -155,7 +155,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li
Convert the sharding spec from the logical shape to the physical shape.
"""
# create multiple sharding strategies for the inputs
# as input can be multi-dimensinal and the partition dim is only 2D,
# as input can be multi-dimensional and the partition dim is only 2D,
# we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output
strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy,
input_name=str(
@@ -221,7 +221,7 @@ def post_process(self, strategy: ShardingStrategy):
Convert the sharding spec from the logical shape to the physical shape.
"""
# create multiple sharding strategies for the inputs
# as input can be multi-dimensinal and the partition dim is only 2D,
# as input can be multi-dimensional and the partition dim is only 2D,
# we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output
strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy,
input_name=str(
@@ -23,7 +23,7 @@ def _update_sharding_spec_for_transposed_weight_for_linear(strategy: ShardingStr
weight_name: str) -> ShardingStrategy:
"""
This function is a helper function used by both module node handler and function node handler. This function will
convert the sharding spec for the transposed weight to the correct partititon spec.
convert the sharding spec for the transposed weight to the correct partition spec.

Args:
strategy (ShardingStrategy): the strategy generated by the strategy generator.
@@ -197,7 +197,7 @@ def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, Li
strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name='weight')

# create multiple sharding strategies for the inputs
# as input can be multi-dimensinal and the partition dim is only 2D,
# as input can be multi-dimensional and the partition dim is only 2D,
# we need to map the partition at dim 0 to one of the first few dimensions of the input
strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy,
input_name=str(self.node.args[0]),
@@ -267,7 +267,7 @@ def post_process(self, strategy: ShardingStrategy):
strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy,
weight_name=str(self.node.args[1]))
# create multiple sharding strategies for the inputs
# as input can be multi-dimensinal and the partition dim is only 2D,
# as input can be multi-dimensional and the partition dim is only 2D,
# we need to map the partition at dim 0 to one of the first few dimensions of the input
strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy,
input_name=str(self.node.args[0]),
@@ -48,8 +48,8 @@ def get_matmul_type(input_dim: int, other_dim: int):
Determine which type of matmul operation should be executed for the given tensor dimensions.

Args:
input_dim (int): the number of dimensions for the input tenosr
other_dim (int): the number of dimensions for the other tenosr
input_dim (int): the number of dimensions for the input tensor
other_dim (int): the number of dimensions for the other tensor
"""
if input_dim == 1 and other_dim == 1:
matmul_type = MatMulType.DOT
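Only the 1D-by-1D branch is visible in this hunk. For orientation, the dispatch over operand ranks follows the same rules as `torch.matmul`; the sketch below is a hypothetical illustration of that rule, not this file's actual code, and the `MV`/`MM`/`BMM` member names are assumptions (only `DOT` appears in the diff).

```python
from enum import Enum

# Hypothetical sketch of rank-based matmul dispatch, following torch.matmul semantics.
# Only the DOT case is confirmed by the diff above; the other member names are assumed.
class MatMulKind(Enum):
    DOT = 0   # 1D x 1D: inner product
    MV = 1    # 2D x 1D: matrix-vector product
    MM = 2    # 1D x 2D or 2D x 2D: matrix-matrix product (a 1D input gets a leading 1 prepended)
    BMM = 3   # any operand with more than 2 dims: broadcast batched matmul

def classify_matmul(input_dim: int, other_dim: int) -> MatMulKind:
    if input_dim == 1 and other_dim == 1:
        return MatMulKind.DOT
    if input_dim > 2 or other_dim > 2:
        return MatMulKind.BMM
    if input_dim == 2 and other_dim == 1:
        return MatMulKind.MV
    return MatMulKind.MM
```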
@@ -268,13 +268,13 @@ def _update_sharding_spec(key, strategy, physical_batch_dim):
dim_partition_dict = sharding_spec.dim_partition_dict
entire_shape = sharding_spec.entire_shape

# upddate the dimension index for the matrix dimensions
# update the dimension index for the matrix dimensions
if 2 in dim_partition_dict:
dim_partition_dict[len(self.batch_dims_before_view) + 1] = dim_partition_dict.pop(2)
if 1 in dim_partition_dict:
dim_partition_dict[len(self.batch_dims_before_view)] = dim_partition_dict.pop(1)

# map the logical batch dim to phyiscal batch dim
# map the logical batch dim to physical batch dim
if 0 in dim_partition_dict:
batch_dim_shard = dim_partition_dict.pop(0)
dim_partition_dict[physical_batch_dim] = batch_dim_shard
@@ -414,7 +414,7 @@ def _get_logical_shape_for_dot(self):

def _get_logical_shape_for_mm(self):
"""
We need to handle the input tensor for a matrix-matrix multiplcation as the input
We need to handle the input tensor for a matrix-matrix multiplication as the input
tensor can be a 1D or 2D tensor. If it is a 1D tensor, 1 will be prepended to its shape
(e.g. [4] -> [1, 4]).
"""
@@ -212,7 +212,7 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
return self.strategies_vector

def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, List[ShardingStrategy]]:
# tranform the strategy generated
# transform the strategy generated
# e.g. to process the sharding strategy for the transposed weights
return strategy

2 changes: 1 addition & 1 deletion colossalai/auto_parallel/tensor_shard/utils/factory.py
@@ -30,7 +30,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
"""

if isinstance(input_, Node):
assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data'
assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data'
meta_tensor = input_._meta_data
assert meta_tensor is not None, "The given node's _meta_data attribute is None"
shape = meta_tensor.shape
12 changes: 6 additions & 6 deletions colossalai/auto_parallel/tensor_shard/utils/reshape.py
@@ -6,12 +6,12 @@

class PreviousStatus(Enum):
"""
This class shows the status of previous comparision.
This class shows the status of previous comparison.
"""
RESET = 0
# ORIGIN means the dimension size of original tensor is larger in the previous comparision.
# ORIGIN means the dimension size of original tensor is larger in the previous comparison.
ORIGIN = 1
# TGT means the dimension size of target tensor is larger in the previous comparision.
# TGT means the dimension size of target tensor is larger in the previous comparison.
TGT = 2


@@ -91,7 +91,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D
tgt_index += 1

if previous_label == PreviousStatus.TGT:
# if the target dimension size is larger in the previous comparision, which means
# if the target dimension size is larger in the previous comparison, which means
# the origin dimension size has already accumulated larger than target dimension size, so
# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
@@ -111,7 +111,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D
origin_index += 1

if previous_label == PreviousStatus.ORIGIN:
# if the origin element is larger in the previous comparision, which means
# if the origin element is larger in the previous comparison, which means
# the target element has already accumulated larger than origin element, so
# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
@@ -139,7 +139,7 @@ def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
Rule:
For a sharded dimension of input tensor, if it is not the minimum element of the input tuple,
the function will return false.
To illustrate this issue, there are two cases to analyse:
To illustrate this issue, there are two cases to analyze:
1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal
operation without distributed tensor.
2. sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape
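The second case is cut off by the collapsed diff, but the rule can be illustrated with a small, self-contained example in plain PyTorch (not the library's own helper); the shapes and the two-way split below are hypothetical.

```python
import torch

# Why a sharded dim must be the minimum element of the input tuple for a reshape
# to keep its sharding: consider reshaping a (4, 2) tensor to (8,), i.e. the
# mapping (0, 1) -> (0,), with the tensor split across two "devices".
x = torch.arange(8).reshape(4, 2)

# Case A: shard dim 0 (the minimum element of (0, 1)). Each shard can be reshaped
# locally and the results concatenate back into the global reshape.
shards_dim0 = [x[:2].reshape(-1), x[2:].reshape(-1)]
assert torch.equal(torch.cat(shards_dim0), x.reshape(-1))

# Case B: shard dim 1 (not the minimum element). Local reshapes interleave elements
# that belong to different shards, so the sharding cannot be kept through the reshape.
shards_dim1 = [x[:, :1].reshape(-1), x[:, 1:].reshape(-1)]
assert not torch.equal(torch.cat(shards_dim1), x.reshape(-1))
```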
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/cpu_adam.py
@@ -13,7 +13,7 @@
class CPUAdam(NVMeOptimizer):
"""Implements Adam algorithm.

Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
But the parameters and gradients should on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
8 changes: 4 additions & 4 deletions colossalai/nn/optimizer/hybrid_adam.py
@@ -13,19 +13,19 @@
class HybridAdam(NVMeOptimizer):
"""Implements Adam algorithm.

Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
But the parameters and gradients should on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
* Parameters on GPU and gradients on CPU is **not** allowed.

`HybriadAdam` requires CUDA extensions which can be built during installation or runtime.
`HybridAdam` requires CUDA extensions which can be built during installation or runtime.

This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.

* For parameters updating on CPU, it uses CPUAdam.
* For parameters updating on GPU, it uses FusedAdam.
* Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.
* Hybrid precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.

:class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
or ``torch.optim.Adam`` with ``adamw_mode=False``
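Since the docstring describes `HybridAdam` as a drop-in replacement for `torch.optim.AdamW` (or `torch.optim.Adam` with `adamw_mode=False`), a minimal usage sketch might look like the following; the model, learning rate, and batch shape are illustrative placeholders, not values from this PR.

```python
import torch
from colossalai.nn.optimizer import HybridAdam

# Illustrative model and hyperparameters; HybridAdam is used exactly like AdamW.
model = torch.nn.Linear(128, 64).cuda()
optimizer = HybridAdam(model.parameters(), lr=1e-3)
# optimizer = HybridAdam(model.parameters(), lr=1e-3, adamw_mode=False)  # plain Adam behaviour

out = model(torch.randn(32, 128, device='cuda'))
out.sum().backward()
optimizer.step()
optimizer.zero_grad()
```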
@@ -131,7 +131,7 @@ def step(self, closure=None, div_scale: float = -1):
assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda"
assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda"

# record the state by gruop and update at once
# record the state by group and update at once
g_l.append(p.grad.data)
p_l.append(p.data)
m_l.append(state['exp_avg'])
6 changes: 3 additions & 3 deletions colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py
@@ -20,8 +20,8 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
return
torch.cuda.current_stream().wait_stream(stream)
# As mentioned in https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html,
# PyTorch uses the "caching allocator" for memroy allocation for tensors. When a tensor is
# freed, its memory is likely to be reused by newly constructed tenosrs. By default,
# PyTorch uses the "caching allocator" for memory allocation for tensors. When a tensor is
# freed, its memory is likely to be reused by newly constructed tensors. By default,
# this allocator traces whether a tensor is still in use by only the CUDA stream where it
# was created. When a tensor is used by additional CUDA streams, we need to call record_stream
# to tell the allocator about all these streams. Otherwise, the allocator might free the
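The pattern this comment describes uses only standard PyTorch stream APIs; a minimal sketch (not code from this PR) looks like this:

```python
import torch

# Copy data on a side stream, then order the default stream after it and tell the
# caching allocator that the tensor is also in use on the default stream.
copy_stream = torch.cuda.Stream()
src = torch.randn(1024).pin_memory()

with torch.cuda.stream(copy_stream):
    dst = src.to('cuda', non_blocking=True)         # allocated and written on copy_stream

torch.cuda.current_stream().wait_stream(copy_stream)   # synchronize stream order
dst.record_stream(torch.cuda.current_stream())          # prevent premature reuse of dst's memory
```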
@@ -294,7 +294,7 @@ def print_comm_stats(self):
print(
f"CPU->CUDA BWD {self._cpu_to_cuda_numel * self.elem_size_in_byte / 1e6 / elapsed} MB/s {self._cpu_to_cuda_numel / 1e6} M elem"
)
print(f'cpu_to_cuda_elpase {elapsed} sec')
print(f'cpu_to_cuda_elapse {elapsed} sec')

for k, v in self._elapsed_dict.items():
print(f'{k}: {v}')
2 changes: 1 addition & 1 deletion colossalai/utils/common.py
@@ -324,7 +324,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
norm_type = float(norm_type)

# Parameters can be on CPU or CUDA
# If parameters are on CPU, disable CUDA kernerls
# If parameters are on CPU, disable CUDA kernels

# Calculate norm.
if norm_type == inf:
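The hunk cuts off inside the norm computation. For reference, the standard total-gradient-norm calculation this branch leads into (the same rule used by `torch.nn.utils.clip_grad_norm_`) is sketched below; it is not the function's actual body.

```python
from math import inf

# A sketch of computing the total gradient norm across a parameter list.
def total_grad_norm(parameters, norm_type: float = 2.0) -> float:
    grads = [p.grad.detach() for p in parameters if p.grad is not None]
    if norm_type == inf:
        # inf-norm: the largest absolute gradient entry across all parameters
        return max(g.abs().max().item() for g in grads)
    # p-norm: accumulate per-parameter norms and take the p-th root of the sum
    return sum(g.norm(norm_type).item() ** norm_type for g in grads) ** (1.0 / norm_type)
```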
12 changes: 6 additions & 6 deletions colossalai/utils/tensor_detector/readme.md
@@ -46,7 +46,7 @@ detector.detect()

I have made some comments on the right of the output for your understanding.

Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memery Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.
Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memory Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.

**The order of print is not equal to the order the tensor creates, but they are really close.**

@@ -61,7 +61,7 @@ Note that the total `Mem` of all the tensors and parameters is not equal to `Tot
+ mlp.2.bias cuda:0 (32,) True torch.float32 128 B
------------------------------------------------------------------------------------------------------------
Detect Location: "test_tensor_detector.py" line 27
Totle GPU Memery Allocated on cuda:0 is 4.5 KB
Total GPU Memory Allocated on cuda:0 is 4.5 KB
------------------------------------------------------------------------------------------------------------


@@ -72,7 +72,7 @@ Totle GPU Memery Allocated on cuda:0 is 4.5 KB
+ Tensor cuda:0 (32,) True torch.float32 128 B # output
------------------------------------------------------------------------------------------------------------
Detect Location: "test_tensor_detector.py" line 30
Totle GPU Memery Allocated on cuda:0 is 5.5 KB
Total GPU Memory Allocated on cuda:0 is 5.5 KB
------------------------------------------------------------------------------------------------------------


@@ -82,7 +82,7 @@ Totle GPU Memery Allocated on cuda:0 is 5.5 KB
+ Tensor cuda:0 () True torch.float32 4 B # loss
------------------------------------------------------------------------------------------------------------
Detect Location: "test_tensor_detector.py" line 32
Totle GPU Memery Allocated on cuda:0 is 6.0 KB
Total GPU Memory Allocated on cuda:0 is 6.0 KB
------------------------------------------------------------------------------------------------------------


@@ -103,7 +103,7 @@ Totle GPU Memery Allocated on cuda:0 is 6.0 KB
- Tensor cuda:0 (8,) True torch.float32 32 B # deleted activation
------------------------------------------------------------------------------------------------------------
Detect Location: "test_tensor_detector.py" line 34
Totle GPU Memery Allocated on cuda:0 is 10.0 KB
Total GPU Memory Allocated on cuda:0 is 10.0 KB
------------------------------------------------------------------------------------------------------------


@@ -117,7 +117,7 @@ Totle GPU Memery Allocated on cuda:0 is 10.0 KB
+ Tensor cuda:0 (32,) False torch.float32 128 B
------------------------------------------------------------------------------------------------------------
Detect Location: "test_tensor_detector.py" line 36
Totle GPU Memery Allocated on cuda:0 is 14.0 KB
Total GPU Memory Allocated on cuda:0 is 14.0 KB
------------------------------------------------------------------------------------------------------------
```
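A usage sketch pieced together from this readme and the attributes visible in the diff (`include_cpu`, `show_info`, an optional `nn.Module`, and the `detect()` call shown in the hunk header above); the exact constructor signature and import path are assumptions, not taken from this PR.

```python
import torch
import torch.nn as nn
from colossalai.utils.tensor_detector import TensorDetector  # assumed import path

model = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 32)).cuda()
detector = TensorDetector(show_info=True, include_cpu=False, module=model)  # assumed signature

x = torch.randn(8, 64, device='cuda')
loss = model(x).sum()
detector.detect()     # snapshot: parameters plus newly created tensors
loss.backward()
detector.detect()     # after backward: gradients appear, freed activations show as deleted
```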

8 changes: 4 additions & 4 deletions colossalai/utils/tensor_detector/tensor_detector.py
@@ -55,7 +55,7 @@ def get_tensor_mem(self, tensor):
return self.mem_format(memory_size)

def mem_format(self, real_memory_size):
# format the tensor memory into a reasonal magnitude
# format the tensor memory into a reasonable magnitude
if real_memory_size >= 2**30:
return str(real_memory_size / (2**30)) + ' GB'
if real_memory_size >= 2**20:
@@ -71,7 +71,7 @@ def collect_tensors_state(self):
if (not self.include_cpu) and obj.device == torch.device('cpu'):
continue
self.detected.append(id(obj))
# skip paramters we had added in __init__ when module is an instance of nn.Module for the first epoch
# skip parameters we had added in __init__ when module is an instance of nn.Module for the first epoch
if id(obj) not in self.tensor_info:

name = type(obj).__name__
@@ -84,7 +84,7 @@ def collect_tensors_state(self):
name = par_name + ' (with grad)'
else:
# with no grad attached
# there will be no new paramters created during running
# there will be no new parameters created during running
# so it must be in saved_tensor_info
continue
# we can also marked common tensors as tensor(with grad)
@@ -155,7 +155,7 @@ def print_tensors_state(self):
if device == torch.device('cpu'):
continue
gpu_mem_alloc = self.mem_format(torch.cuda.memory_allocated(device))
self.info += f"Totle GPU Memery Allocated on {device} is {gpu_mem_alloc}\n"
self.info += f"Total GPU Memory Allocated on {device} is {gpu_mem_alloc}\n"
self.info += LINE
self.info += '\n\n'
if self.show_info: