Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
04e5272
Merge pull request #1 from hpcaitech/main
Cypher30 Jul 14, 2022
75618b3
Merge pull request #2 from hpcaitech/main
Cypher30 Jul 15, 2022
3e4620c
Merge pull request #3 from hpcaitech/main
Cypher30 Jul 20, 2022
cf24049
Merge remote-tracking branch 'upstream/main' into main
Jul 20, 2022
3d223b6
Merge remote-tracking branch 'upstream/main' into main
Jul 21, 2022
644115c
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 22, 2022
d995ade
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 25, 2022
bba2dbe
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 26, 2022
05ca628
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 26, 2022
0a967da
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 6, 2022
0637c0d
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 8, 2022
74a6227
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 10, 2022
e550490
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 10, 2022
2d7f5d9
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 11, 2022
b62e870
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 12, 2022
b4b0974
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 15, 2022
65c20de
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 16, 2022
1660bfc
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 17, 2022
6eb0ad0
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 20, 2022
56df059
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 26, 2022
480e932
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 30, 2022
0fa66ee
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 30, 2022
1d013b0
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 31, 2022
5774db2
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 5, 2022
e8ff699
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 6, 2022
855c728
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 7, 2022
2c113ea
Merge branch 'main' of github.com:Cypher30/ColossalAI into main
Sep 8, 2022
838ba70
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 13, 2022
cacec2b
Merge branch 'main' of github.com:Cypher30/ColossalAI into main
Sep 13, 2022
5ed6ef0
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 14, 2022
668af30
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 14, 2022
df79772
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 15, 2022
7b6a0fc
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 20, 2022
c30022e
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 23, 2022
df20f4d
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 26, 2022
2d5a6a0
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 29, 2022
07d27a6
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 3, 2022
dc68ba9
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 5, 2022
929e7d3
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 6, 2022
90aa46a
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 11, 2022
40363da
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 13, 2022
fe3fca5
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 25, 2022
956156e
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 26, 2022
cb20212
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 26, 2022
744a775
Merge branch 'hpcaitech:main' into main
Cypher30 Nov 10, 2022
1629a90
Merge branch 'hpcaitech:main' into main
Cypher30 Nov 10, 2022
f0558e3
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 1, 2022
bb7bd4a
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 21, 2022
26de8e5
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 23, 2022
83a1418
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 30, 2022
0802b94
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 30, 2022
16ea2c7
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 31, 2022
44df1f9
Merge branch 'hpcaitech:main' into main
Cypher30 Jan 2, 2023
fcd117c
Merge branch 'hpcaitech:main' into main
Cypher30 Jan 9, 2023
f20106e
Merge branch 'hpcaitech:main' into main
Cypher30 Feb 2, 2023
c388340
[autoparallel] matmul metainfo
Feb 6, 2023
54f70ca
Merge branch 'hpcaitech:main' into feature/matmul_metainfo
Cypher30 Feb 6, 2023
5b59fb0
[auto_parallel] remove unused print
Feb 7, 2023
8c57228
[tests] skip test_matmul_handler when torch version is lower than 1.12.0
Feb 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 234 additions & 1 deletion colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import reduce
from typing import Callable, Dict, List, Tuple, Union

import torch
Expand All @@ -16,7 +17,7 @@

from ..registry import meta_register

__all__ = ['linear_meta_info']
__all__ = ['linear_meta_info', 'matmul_meta_info']


@meta_register.register(torch.nn.functional.linear)
Expand Down Expand Up @@ -170,3 +171,235 @@ def linear_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
fwd_out = [torch.zeros_like(output_tensor, device='meta')]

return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out


@meta_register.register(torch.matmul)
def matmul_meta_info(
    *args, **kwargs
) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]:
    """torch.matmul meta info generator.

    The cost model depends on the ranks of the two operands. The cases handled are:

    1. Vector-vector multiplication (dot) => no temp memory. Forward memory cost is the single output element
       (could be neglected), backward memory cost is the size of the two input vectors.
    2. Matrix-vector multiplication => if the first input is the matrix, no temp memory is needed: forward memory
       cost is the size of the output tensor and backward memory cost is the size of the two inputs. If the first
       input is the vector, forward memory cost is still the size of the output tensor, but the backward phase
       additionally needs a temp memory of the same size as the input matrix (for its transpose) on top of the
       gradients of the two inputs.
    3. Batched matrix-vector multiplication => if the first input is the batched matrix, no temp memory: forward
       memory cost is the size of the output tensor and backward memory cost is the size of the two inputs. If
       the second input is the batched matrix, matmul allocates memory for the gradient of the batched matrix in
       the forward phase (a new tensor without the former batches is created), so forward memory cost is the
       output tensor plus a tensor as large as the batched matrix. The backward phase needs a temp memory of the
       same size as the batched matrix and allocates the gradient of the input vector; the gradient of the
       batched matrix reuses the memory allocated during the forward phase.
    4. Matrix-matrix multiplication => no temp memory, forward memory cost is the size of the output tensor,
       backward memory cost is the size of the two inputs.
    5. Batched matrix-matrix multiplication => if the first input is the batched matrix, no temp memory: forward
       memory cost is the size of the output tensor and backward memory cost is the size of the two inputs. If
       the second input is the batched matrix, the forward phase allocates memory for the output and for the
       gradient of the second input and uses a temp memory of the same size as the output; the backward phase
       allocates memory for the gradient of the first input and uses a temp memory as big as the output plus the
       second input.
    6. Batched matrix-batched matrix multiplication => if the two inputs have the same batch dimensions, no temp
       memory: forward memory cost is the size of the output, backward memory cost is the size of the two
       inputs. If the two inputs have different batch dimensions, the forward phase allocates memory for the
       expanded inputs (so that the batch dimensions match) and the output; the backward phase uses a temp
       memory of the size of the two expanded inputs, allocates the gradients of the two inputs and discards the
       expanded inputs allocated during the forward phase.

    Args:
        *args: operand descriptors. ``args[0]`` and ``args[1]`` are the two matmul inputs and ``args[-1]`` is the
            output; the tensor of each one is read from its ``.data`` attribute.
        **kwargs: accepted for interface compatibility, not used by this function.

    Returns:
        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]:
        compute cost, memory cost, forward inputs, forward buffers (always empty) and forward outputs.

    Raises:
        ValueError: if the ranks of the two inputs match none of the cases above (e.g. a 0-dimensional input).
    """
    # Get input and output tensors
    input_tensors = [args[0].data, args[1].data]
    output_tensors = [args[-1].data]

    lhs, rhs = input_tensors
    out = output_tensors[0]
    lhs_ndim = len(lhs.shape)
    rhs_ndim = len(rhs.shape)

    # Dispatch on the ranks of the two operands
    if lhs_ndim == 1 and rhs_ndim == 1:
        # Dot
        fwd_compute_cost = flop_mapping[torch.ops.aten.dot.default](input_tensors, output_tensors)
        # Each of the two gradients is an element-wise scaling of the other vector, hence the factor 2.
        # The inputs are passed as a list like in every other flop counter call of this function
        # (the original passed the bare vector `input_tensors[0]`).
        # NOTE(review): presumably the counter reads `inputs[0].shape` -- confirm against the flop counter.
        bwd_compute_cost = flop_mapping[torch.ops.aten.mul.Tensor](input_tensors, output_tensors) * 2

        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors), parameter=0, temp=0, buffer=0)

    elif lhs_ndim >= 2 and rhs_ndim == 1:
        # gemv case 1: matrix-vector multiplication
        # &
        # batched gemv case 1: batched matrix-vector multiplication

        # fold all leading dimensions of the (batched) matrix and of the output into one
        lhs_2d = lhs.reshape(-1, lhs.shape[-1])
        out_1d = out.reshape(-1)

        fwd_compute_cost = flop_mapping[torch.ops.aten.mv.default]([lhs_2d, rhs], output_tensors)

        bwd_compute_cost = (flop_mapping[torch.ops.aten.mul.Tensor]([out_1d, rhs], output_tensors) +
                            flop_mapping[torch.ops.aten.mv.default]([lhs_2d.transpose(0, 1), out_1d], output_tensors))

        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors), parameter=0, temp=0, buffer=0)

    elif lhs_ndim == 1 and rhs_ndim == 2:
        # gemv case 2: vector-matrix multiplication
        fwd_compute_cost = flop_mapping[torch.ops.aten.mv.default](input_tensors, output_tensors)

        bwd_compute_cost = (flop_mapping[torch.ops.aten.mul.Tensor]([out, lhs], output_tensors) +
                            flop_mapping[torch.ops.aten.mv.default]([rhs, out], output_tensors))

        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors),
                                  parameter=0,
                                  temp=activation_size(rhs),
                                  buffer=0)

    elif lhs_ndim == 1 and rhs_ndim >= 3:
        # batched gemv case 2: vector-batched matrix multiplication

        # view the batched matrix as one tall 2D matrix whose columns match the vector length
        rhs_2d = rhs.transpose(-2, -1).reshape(-1, rhs.shape[-2])
        out_1d = out.reshape(-1)

        fwd_compute_cost = flop_mapping[torch.ops.aten.mv.default]([rhs_2d, lhs], [out_1d])

        bwd_compute_cost = (flop_mapping[torch.ops.aten.mul.Tensor]([out_1d, lhs], output_tensors) +
                            flop_mapping[torch.ops.aten.mv.default]([rhs_2d.transpose(0, 1), out_1d], output_tensors))

        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors + [rhs]))
        bwd_mem_cost = MemoryCost(activation=activation_size(lhs),
                                  parameter=0,
                                  temp=activation_size(rhs),
                                  buffer=0)

    elif lhs_ndim >= 2 and rhs_ndim == 2:
        # gemm & batched gemm case 1: batched matrix-matrix multiplication

        # fold all leading dimensions of the first input and of the output into one
        lhs_2d = lhs.reshape(-1, lhs.shape[-1])
        out_2d = out.reshape(-1, out.shape[-1])

        fwd_compute_cost = flop_mapping[torch.ops.aten.mm.default]([lhs_2d, rhs], [out_2d])

        bwd_compute_cost = (flop_mapping[torch.ops.aten.mm.default]([lhs_2d.transpose(0, 1), out_2d], [rhs]) +
                            flop_mapping[torch.ops.aten.mm.default]([out_2d, rhs.transpose(0, 1)], [lhs_2d]))

        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors), parameter=0, temp=0, buffer=0)
        bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors), parameter=0, temp=0, buffer=0)

    elif lhs_ndim == 2 and rhs_ndim >= 3:
        # batched gemm case 2: matrix-batched matrix multiplication

        # the cost is counted on the transposed problem, with the batched operand folded into a 2D matrix
        rhs_2d = rhs.transpose(-2, -1).reshape(-1, rhs.shape[-2])
        out_2d = out.transpose(-2, -1).reshape(-1, out.shape[-2])

        fwd_compute_cost = flop_mapping[torch.ops.aten.mm.default]([rhs_2d, lhs.transpose(0, 1)],
                                                                   [out.transpose(-2, -1)])

        bwd_compute_cost = (flop_mapping[torch.ops.aten.mm.default]([out_2d.transpose(0, 1), rhs_2d], [lhs]) +
                            flop_mapping[torch.ops.aten.mm.default]([out_2d, lhs], [rhs_2d]))

        fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors) + activation_size(rhs),
                                  temp=activation_size(output_tensors))
        bwd_mem_cost = MemoryCost(activation=activation_size(lhs),
                                  parameter=0,
                                  temp=activation_size(rhs) + activation_size(output_tensors))

    elif lhs_ndim >= 3 and rhs_ndim >= 3:
        # Batched matrix-batched matrix multiplication
        # The batch dimensions are "the same" only if both inputs have the same rank and identical leading
        # sizes; slices of different length never compare equal, so one comparison covers both conditions.
        _is_batch_dims_same = lhs.shape[:-2] == rhs.shape[:-2]

        # retrieve dimensions
        input_dim_00 = lhs.shape[-2]
        input_dim_01 = lhs.shape[-1]
        input_dim_10 = rhs.shape[-2]
        input_dim_11 = rhs.shape[-1]
        output_dim_0 = out.shape[-2]
        output_dim_1 = out.shape[-1]

        out_3d = out.reshape(-1, output_dim_0, output_dim_1)

        if _is_batch_dims_same:
            # Case 1: batch dimensions are the same

            # Forward compute cost: C = A * B
            fwd_compute_cost = flop_mapping[torch.ops.aten.bmm.default](
                [lhs.reshape(-1, input_dim_00, input_dim_01),
                 rhs.reshape(-1, input_dim_10, input_dim_11)], [out_3d])

            # Backward compute cost: dB = A^T * dC, dA = dC * B^T
            bwd_compute_cost = (flop_mapping[torch.ops.aten.bmm.default](
                [lhs.transpose(-2, -1).reshape(-1, input_dim_01, input_dim_00), out_3d],
                [rhs.reshape(-1, input_dim_11, input_dim_10)]) + flop_mapping[torch.ops.aten.bmm.default](
                    [out_3d, rhs.transpose(-2, -1).reshape(-1, input_dim_11, input_dim_10)],
                    [lhs.reshape(-1, input_dim_00, input_dim_01)]))

            fwd_mem_cost = MemoryCost(activation=activation_size(output_tensors))
            bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors))

        else:
            # Case 2: batch dimensions are different
            # Build meta tensors standing for the inputs after their batch dimensions have been expanded
            # to the batch dimensions of the output.
            batch_dims = out.shape[:-2]
            batch_size = reduce(lambda x, y: x * y, batch_dims)
            extended_input_0 = torch.rand(batch_size, input_dim_00, input_dim_01, device="meta")
            extended_input_1 = torch.rand(batch_size, input_dim_10, input_dim_11, device="meta")
            extended_inputs = [extended_input_0, extended_input_1]

            # Forward compute cost: C = A * B
            fwd_compute_cost = flop_mapping[torch.ops.aten.bmm.default](extended_inputs, [out_3d])

            # Backward compute cost: dB = A^T * dC, dA = dC * B^T
            bwd_compute_cost = (flop_mapping[torch.ops.aten.bmm.default](
                [extended_input_0.transpose(-2, -1), out_3d],
                [extended_input_1]) + flop_mapping[torch.ops.aten.bmm.default](
                    [out_3d, extended_input_1.transpose(-2, -1)], [extended_input_0]))

            fwd_mem_cost = MemoryCost(activation=activation_size([out, extended_input_0, extended_input_1]))
            # the expanded inputs allocated in the forward phase are released during backward,
            # which is why their size is subtracted from the backward activation
            bwd_mem_cost = MemoryCost(activation=activation_size(input_tensors) - activation_size(extended_inputs),
                                      temp=activation_size(extended_inputs))

    else:
        # Without this branch an unsupported rank combination would only surface later as an
        # UnboundLocalError on `fwd_compute_cost`.
        raise ValueError(f'matmul_meta_info does not support input shapes {lhs.shape} and {rhs.shape}')

    # compute cost
    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)

    # memory cost
    total_cost = MemoryCost(activation=fwd_mem_cost.activation + bwd_mem_cost.activation,
                            parameter=fwd_mem_cost.parameter + bwd_mem_cost.parameter,
                            temp=fwd_mem_cost.temp + bwd_mem_cost.temp,
                            buffer=fwd_mem_cost.buffer + bwd_mem_cost.buffer)

    memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_cost)

    # store fwd_in, fwd_buffer, fwd_out
    fwd_in = input_tensors
    fwd_buffer = []
    fwd_out = output_tensors

    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from ..sharding_strategy import OperationData, OperationDataType, ShardingStrategy
from ..utils import recover_sharding_spec_for_broadcast_shape
from .node_handler import NodeHandler
from .node_handler import MetaInfoNodeHandler, NodeHandler
from .registry import operator_registry
from .strategy import (
BatchedMatMulStrategyGenerator,
Expand Down Expand Up @@ -326,7 +326,7 @@ def _get_bmm_logical_shape(input_shape, other_shape, transforms):

@operator_registry.register(torch.matmul)
@operator_registry.register(torch.Tensor.matmul)
class MatMulHandler(NodeHandler):
class MatMulHandler(MetaInfoNodeHandler):
"""
The MatMulHandler is a node handler which handles the sharding strategy generation for the matmul operation.
According to https://pytorch.org/docs/stable/generated/torch.matmul.html, the operations will vary depending on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from colossalai.auto_parallel.tensor_shard.utils import check_sharding_spec_validity
from colossalai.device.device_mesh import DeviceMesh
from colossalai.logging import get_dist_logger
from colossalai.tensor.shape_consistency import ShapeConsistencyManager

from .strategy import StrategyGenerator
Expand Down Expand Up @@ -266,6 +267,10 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
# attach metainfos to the handler
setattr(self, "metainfo_vector", metainfo_vector)

else:
logger = get_dist_logger()
logger.warning(f'The target function {target} is not patched yet, ')

return self.strategies_vector


Expand Down Expand Up @@ -317,4 +322,8 @@ def register_strategy(self, compute_resharding_cost: bool = True) -> StrategiesV
# attach metainfos to the handler
setattr(self, "metainfo_vector", metainfo_vector)

else:
logger = get_dist_logger()
logger.warning(f'The target function {target} is not patched yet')

return self.strategies_vector
27 changes: 25 additions & 2 deletions colossalai/fx/profiler/opcount.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,28 @@ def matmul_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
# Inputs contains the shapes of two matrices.
input_shapes = [v.shape for v in inputs]
assert len(input_shapes) == 2, input_shapes
assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes

# There are three cases: 1) gemm, 2) gemv, 3) dot
if all(len(shape) == 2 for shape in input_shapes):
# gemm
assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes
elif all(len(shape) == 1 for shape in input_shapes):
# dot
assert input_shapes[0][0] == input_shapes[1][0], input_shapes

# expand shape
input_shapes[0] = torch.Size([1, input_shapes[0][0]])
input_shapes[1] = torch.Size([input_shapes[1][0], 1])
else:
# gemv
if len(input_shapes[0]) == 1:
assert input_shapes[0][0] == input_shapes[1][-2], input_shapes
input_shapes.reverse()
else:
assert input_shapes[1][0] == input_shapes[0][-1], input_shapes

# expand the shape of the vector to [batch size, 1]
input_shapes[-1] = torch.Size([input_shapes[-1][-1], 1])
flops = reduce(operator.mul, input_shapes[0]) * input_shapes[-1][-1]
return flops

Expand Down Expand Up @@ -204,8 +225,10 @@ def zero_flop_jit(*args):

if version.parse(torch.__version__) >= version.parse('1.12.0'):
flop_mapping = {
# gemm
# gemm, gemv and dot
aten.mm.default: matmul_flop_jit,
aten.mv.default: matmul_flop_jit,
aten.dot.default: matmul_flop_jit,
aten.matmul.default: matmul_flop_jit,
aten.addmm.default: addmm_flop_jit,
aten.bmm.default: bmm_flop_jit,
Expand Down
Loading