Merged
61 commits
04e5272
Merge pull request #1 from hpcaitech/main
Cypher30 Jul 14, 2022
75618b3
Merge pull request #2 from hpcaitech/main
Cypher30 Jul 15, 2022
3e4620c
Merge pull request #3 from hpcaitech/main
Cypher30 Jul 20, 2022
cf24049
Merge remote-tracking branch 'upstream/main' into main
Jul 20, 2022
3d223b6
Merge remote-tracking branch 'upstream/main' into main
Jul 21, 2022
644115c
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 22, 2022
d995ade
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 25, 2022
bba2dbe
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 26, 2022
05ca628
Merge branch 'hpcaitech:main' into main
Cypher30 Jul 26, 2022
0a967da
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 6, 2022
0637c0d
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 8, 2022
74a6227
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 10, 2022
e550490
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 10, 2022
2d7f5d9
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 11, 2022
b62e870
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 12, 2022
b4b0974
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 15, 2022
65c20de
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 16, 2022
1660bfc
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 17, 2022
6eb0ad0
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 20, 2022
56df059
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 26, 2022
480e932
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 30, 2022
0fa66ee
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 30, 2022
1d013b0
Merge branch 'hpcaitech:main' into main
Cypher30 Aug 31, 2022
5774db2
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 5, 2022
e8ff699
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 6, 2022
855c728
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 7, 2022
2c113ea
Merge branch 'main' of github.com:Cypher30/ColossalAI into main
Sep 8, 2022
838ba70
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 13, 2022
cacec2b
Merge branch 'main' of github.com:Cypher30/ColossalAI into main
Sep 13, 2022
5ed6ef0
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 14, 2022
668af30
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 14, 2022
df79772
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 15, 2022
7b6a0fc
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 20, 2022
c30022e
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 23, 2022
df20f4d
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 26, 2022
2d5a6a0
Merge branch 'hpcaitech:main' into main
Cypher30 Sep 29, 2022
07d27a6
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 3, 2022
dc68ba9
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 5, 2022
929e7d3
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 6, 2022
90aa46a
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 11, 2022
40363da
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 13, 2022
fe3fca5
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 25, 2022
956156e
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 26, 2022
cb20212
Merge branch 'hpcaitech:main' into main
Cypher30 Oct 26, 2022
744a775
Merge branch 'hpcaitech:main' into main
Cypher30 Nov 10, 2022
1629a90
Merge branch 'hpcaitech:main' into main
Cypher30 Nov 10, 2022
f0558e3
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 1, 2022
bb7bd4a
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 21, 2022
26de8e5
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 23, 2022
83a1418
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 30, 2022
0802b94
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 30, 2022
16ea2c7
Merge branch 'hpcaitech:main' into main
Cypher30 Dec 31, 2022
44df1f9
Merge branch 'hpcaitech:main' into main
Cypher30 Jan 2, 2023
fcd117c
Merge branch 'hpcaitech:main' into main
Cypher30 Jan 9, 2023
f20106e
Merge branch 'hpcaitech:main' into main
Cypher30 Feb 2, 2023
e7cf700
Merge branch 'hpcaitech:main' into main
Cypher30 Feb 8, 2023
4f6bd1c
Merge branch 'hpcaitech:main' into main
Cypher30 Feb 15, 2023
afe135d
Merge branch 'hpcaitech:main' into main
Cypher30 Feb 16, 2023
a7a254d
[autoparallel] tanh meta information
Feb 16, 2023
e37062d
[autoparallel] remove redundant code
Feb 16, 2023
f0416e7
[autoparallel] patch meta information of torch.nn.Dropout
Feb 16, 2023
181 changes: 71 additions & 110 deletions colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
@@ -1,124 +1,85 @@
-from typing import List, Tuple
+from typing import Callable, List, Tuple
 
 import torch
 
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
 from colossalai.fx.profiler.memory_utils import activation_size
-from colossalai.fx.profiler.opcount import flop_mapping
+from colossalai.fx.profiler.opcount import elementwise_flop_counter
 
 from ..registry import meta_register
 
-__all__ = ["relu_meta_info"]
+__all__ = ["elementwise_meta_info"]
 
 
-@meta_register.register(torch.nn.ReLU)
-def relu_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
-    """torch.nn.ReLU metainfo generator
-    The aten graph of torch.nn.ReLU is
-    graph():
-    %input_2 : [#users=1] = placeholder[target=placeholder](default=)
-    %relu_default : [#users=2] = call_function[target=torch.ops.aten.relu.default](args = (%input_2,), kwargs = {})
-    %zeros_like_default : [#users=1] = call_function[target=torch.ops.aten.zeros_like.default](args = (%relu_default,), kwargs = {dtype: None, layout: None, device: None, pin_memory: None})
-    %detach_default : [#users=1] = call_function[target=torch.ops.aten.detach.default](args = (%relu_default,), kwargs = {})
-    %threshold_backward_default : [#users=1] = call_function[target=torch.ops.aten.threshold_backward.default](args = (%zeros_like_default, %detach_default, None), kwargs = {})
-    %detach_default_1 : [#users=1] = call_function[target=torch.ops.aten.detach.default](args = (%threshold_backward_default,), kwargs = {})
-    %detach_default_2 : [#users=0] = call_function[target=torch.ops.aten.detach.default](args = (%detach_default_1,), kwargs = {})
-
-    Returns:
-        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
-    """
-
-    input_tensor = args[0].data
-    output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
-    is_inplace = kwargs.get("inplace", False)
-
-    # construct input args for forward
-    fwd_in_args = [input_tensor]
-
-    # construct input args for backward
-    bwd_in_args = [output_tensor]
-
-    # calculate cost
-    # the fwd op with compute cost is relu.default
-    # the bwd op with compute cost is threshold_backward
-
-    # calculate compute cost
-    fwd_compute_cost = flop_mapping[torch.ops.aten.relu.default](fwd_in_args, (output_tensor,))
-    bwd_compute_cost = flop_mapping[torch.ops.aten.threshold_backward.default](bwd_in_args, (input_tensor,))
-    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
-
-    # calculate memory cost
-    # NOTE: the inplace ReLU don't have forward memory cost
-    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
-    fwd_memory_cost = MemoryCost(
-        activation=activation_size(input_tensor) if is_inplace else activation_size([output_tensor, input_tensor]),
-        parameter=0,
-        temp=0,
-        buffer=0)
-
-    bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor), parameter=0, temp=0, buffer=0)
-
-    # total cost is the sum of forward and backward cost
-    total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
-                            parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter)
-
-    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
-
-    # store fwd_in, fwd_buffer, fwd_out
-    # NOTE: It might seems a little bit weird here, we just want to align it with the older version
-    # of MetaInfoProp. In the future we might modify this part to make it clearer.
-    fwd_in = []
-    fwd_buffer = [torch.zeros_like(output_tensor, device='meta')]
-    fwd_out = [torch.zeros_like(output_tensor, device='meta')]
-
-    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
-
-
-@meta_register.register(torch.nn.Softmax)
-@meta_register.register(torch.nn.functional.softmax)
-def softmax_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
-    """torch.nn.Softmax metainfo generator
-    Returns:
-        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
-    """
-    input_tensor = next(
-        filter(
-            lambda x:
-            (x.type == OperationDataType.ARG or x.type == OperationDataType.PARAM) and x.name != 'softmax_dim',
-            args)).data
-    output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
-    softmax_dim = next(filter(lambda x: x.name == 'softmax_dim', args)).data
-
-    # calculate cost
-
-    # calculate compute cost
-    fwd_compute_cost = flop_mapping[torch.ops.aten._softmax.default]([input_tensor], [output_tensor])
-    bwd_compute_cost = flop_mapping[torch.ops.aten._softmax_backward_data.default]([output_tensor], [input_tensor])
-
-    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
-
-    # calculate memory cost
-    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
-    fwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, output_tensor]),
-                                 parameter=0,
-                                 temp=0,
-                                 buffer=0)
-    bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor),
-                                 parameter=0,
-                                 temp=activation_size(input_tensor),
-                                 buffer=0)
-
-    # total cost is the sum of forward and backward cost
-    total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
-                            parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter,
-                            temp=fwd_memory_cost.temp + bwd_memory_cost.temp,
-                            buffer=fwd_memory_cost.buffer + bwd_memory_cost.buffer)
-
-    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
-
-    # store fwd_in, fwd_buffer, fwd_out
-    fwd_in = []
-    fwd_buffer = [torch.zeros_like(output_tensor, device='meta')]
-    fwd_out = [torch.zeros_like(output_tensor, device='meta')]
-
-    return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+def elementwise_meta_info(temp_mem_scale: float = 0, buffer_mem_scale: float = 0) -> Callable:
+    """This is a function to create the meta information generator for elementwise operations
+
+    Args:
+        temp_mem_scale (float, optional): temp memory scaling factor for backward. Defaults to 0.
+        buffer_mem_scale (float, optional): buffer memory scaling factor for forward. Defaults to 0.
+
+    Returns:
+        Callable: meta information generator
+    """
+
+    def meta_func(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+        input_tensor = next(
+            filter(
+                lambda x:
+                (x.type == OperationDataType.ARG or x.type == OperationDataType.PARAM) and x.name != 'softmax_dim',
+                args)).data
+        output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
+        is_inplace = 1 if kwargs.get('inplace', False) else 0
+
+        flop_counter = elementwise_flop_counter(1, 0)
+        # calculate compute cost
+        fwd_compute_cost = flop_counter([input_tensor], [output_tensor])
+        bwd_compute_cost = flop_counter([output_tensor], [input_tensor])
+
+        compute_cost = TrainCycleItem(fwd=fwd_compute_cost,
+                                      bwd=bwd_compute_cost,
+                                      total=fwd_compute_cost + bwd_compute_cost)
+
+        # calculate memory cost
+        # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
+        # NOTE: if in_place is True, we will not create a new tensor in forward
+        fwd_memory_cost = MemoryCost(activation=activation_size(input_tensor) * (2 - is_inplace),
+                                     parameter=0,
+                                     temp=0,
+                                     buffer=activation_size(input_tensor) * buffer_mem_scale)
+
+        # temp_mem_scale is for situation like softmax backward
+        # the buffer will be removed during backward phase
+        bwd_memory_cost = MemoryCost(
+            activation=activation_size(input_tensor) - activation_size(input_tensor) * buffer_mem_scale,
+            parameter=0,
+            temp=activation_size(input_tensor) * temp_mem_scale + activation_size(input_tensor) * buffer_mem_scale,
+            buffer=0)
+
+        # total cost is the sum of forward and backward cost
+        total_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
+                                parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter,
+                                temp=fwd_memory_cost.temp + bwd_memory_cost.temp,
+                                buffer=fwd_memory_cost.buffer + bwd_memory_cost.buffer)
+
+        memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_cost)
+
+        # store fwd_in, fwd_buffer, fwd_out
+        fwd_in = []
+        fwd_buffer = [torch.zeros_like(output_tensor, device='meta')]
+        fwd_out = [torch.zeros_like(output_tensor, device='meta')]
+
+        return compute_cost, memory_cost, fwd_in, fwd_buffer, fwd_out
+
+    return meta_func
+
+
+# register meta information
+# (0, 0)
+meta_register.register([torch.nn.ReLU, torch.nn.functional.relu, torch.tanh])(elementwise_meta_info(0, 0))
+
+# (1, 0)
+meta_register.register([torch.nn.Softmax, torch.nn.functional.softmax])(elementwise_meta_info(1, 0))
+
+# (0, 0.25) for dropout, the buffer is in bool type so that the buffer memory cost is 0.25 times of input tensor
+meta_register.register([torch.nn.Dropout, torch.nn.functional.dropout])(elementwise_meta_info(0, 0.25))
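
To make the registered (temp_mem_scale, buffer_mem_scale) pairs concrete, the standalone sketch below reproduces meta_func's byte arithmetic outside of ColossalAI. The estimate_memory helper and the printed numbers are illustrative only, assuming a non-inplace op on a float32 input of shape (256, 1024), the same shape used in the test below.

# Standalone sketch of meta_func's memory arithmetic (illustrative, not part of the PR).
def estimate_memory(input_bytes: int, temp_mem_scale: float, buffer_mem_scale: float, inplace: bool = False):
    is_inplace = 1 if inplace else 0
    # forward: input + a fresh output (unless inplace), plus an optional buffer (e.g. dropout's bool mask)
    fwd = dict(activation=input_bytes * (2 - is_inplace), temp=0, buffer=input_bytes * buffer_mem_scale)
    # backward: the gradient, minus the buffer freed during backward; saved tensors count as temp
    bwd = dict(activation=input_bytes * (1 - buffer_mem_scale),
               temp=input_bytes * (temp_mem_scale + buffer_mem_scale),
               buffer=0)
    return fwd, bwd

input_bytes = 256 * 1024 * 4    # float32 tensor of shape (256, 1024): 1 MiB

for name, scales in [("relu/tanh", (0, 0)), ("softmax", (1, 0)), ("dropout", (0, 0.25))]:
    print(name, estimate_memory(input_bytes, *scales))
# dropout's bool mask costs 1 byte per element, i.e. 0.25x a float32 tensor, hence buffer_mem_scale=0.25
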
@@ -17,51 +17,15 @@
 from tests.test_auto_parallel.test_tensor_shard.test_metainfo.utils import mem_test_for_node_strategy, print_results
 
 
-def _ReLU_module_mem_test(rank, world_size, port):
-    """This function is for ReLU memory test
-    Test and print real memory cost and estimated, this test will not be executed except with the tag AUTO_PARALLEL
-
-    Args:
-    Args:
-        rank: device rank
-        bias: indicate whether conv module need bias
-        world_size: number of devices
-        port: port for initializing process group
-    """
-    disable_existing_loggers()
-    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    model = nn.Sequential(nn.ReLU()).cuda()
-    input = torch.rand(4, 128, 64, 64).cuda()
-    input.requires_grad = True
-    physical_mesh_id = torch.arange(0, 4)
-    mesh_shape = (2, 2)
-    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
-
-    # index of target node in computation graph
-    node_index = 1
-    # total number of target node strategies
-    strategy_number = 1
-    mem_test_for_node_strategy(rank=rank,
-                               model=model,
-                               device_mesh=device_mesh,
-                               node_index=node_index,
-                               strategy_number=strategy_number,
-                               input_args=[input],
-                               meta_arg_names=['input'])
-
-
-@run_on_environment_flag(name='AUTO_PARALLEL')
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_ReLU_meta_concrete_info_match():
-    world_size = 4
-    run_func_module = partial(_ReLU_module_mem_test, world_size=world_size, port=free_port())
-    mp.spawn(run_func_module, nprocs=world_size)
-
-
 @pytest.mark.skipif(torch.__version__ < '1.12.0', reason="need pytorch 1.12.0 or higher for aten level operations")
-def test_sofmax_meta_info():
-    meta_func = meta_register.get(torch.nn.functional.softmax)
+@parameterize('func', [
+    torch.nn.functional.softmax,
+    torch.nn.functional.relu,
+    torch.tanh,
+    torch.nn.functional.dropout,
+])
+def test_activation_meta_info(func):
+    meta_func = meta_register.get(func)
     # construct meta tensors
     input_tensor = torch.rand(256, 1024, device="meta")
     output_tensor = torch.rand(256, 1024, device="meta")
@@ -87,7 +51,7 @@ def test_sofmax_meta_info():
     # fwd
     torch.cuda.reset_peak_memory_stats()
    mem_stamp0 = torch.cuda.memory_allocated()
-    output_real_tensor = torch.nn.functional.softmax(input_real_tensor, dim=softmax_dim)
+    output_real_tensor = func(input_real_tensor)
     fwd_allocated = torch.cuda.memory_allocated() - mem_stamp0
     fwd_peak = torch.cuda.max_memory_allocated() - mem_stamp0
 
@@ -104,5 +68,4 @@ def test_sofmax_meta_info():
 
 
 if __name__ == '__main__':
-    # test_ReLU_meta_concrete_info_match()
-    test_sofmax_meta_info()
+    test_activation_meta_info()
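
Note that ColossalAI's parameterize decorator, unlike pytest.mark.parametrize, wraps the test in a plain callable that loops over the supplied values, which is why the __main__ block can call test_activation_meta_info() with no argument. Below is a minimal sketch of that behavior, assuming this simplified single-parameter form; the real colossalai.testing.parameterize is more general.

from functools import wraps

# Minimal single-parameter sketch of a parameterize-style decorator (illustrative only).
def parameterize(arg_name, values):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for value in values:    # run the body once per supplied value
                func(*args, **{**kwargs, arg_name: value})
        return wrapper
    return decorator

@parameterize('func', [abs, round])
def demo(func):
    print(func.__name__, func(-1.5))

demo()    # prints results for abs, then round
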