Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.10.1-11.3.0
image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 40
steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build_gpu_8.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
- name: Unit Testing
run: |
gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -gt "100" ] && PYTHONPATH=$PWD pytest tests
[ "$gpu_used" -le "100" ] && PYTHONPATH=$PWD pytest tests
env:
DATA: /data/scratch/cifar-10

45 changes: 27 additions & 18 deletions .github/workflows/compatibility_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,14 @@ name: Compatibility Test
on:
workflow_dispatch:
inputs:
version:
type: choice
description: version for testing
default: 'all'
torch_version:
type: string
description: torch version, separated by comma
required: true
cuda_version:
type: string
description: cuda version, separated by comma
required: true
options:
- all
- pytorch-cuda:1.9.0-11.1.1 # python 3.8
- pytorch-cuda:1.8.1-11.1.1 # python 3.8
- pytorch-cuda:1.7.1-11.0.3 # python 3.8
- pytorch-cuda:1.6.0-10.2 # python 3.6


jobs:
matrix_preparation:
Expand All @@ -24,12 +20,25 @@ jobs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- id: set-matrix
env:
TORCH_VERSIONS: ${{ inputs.torch_version }}
CUDA_VERSIONS: ${{ inputs.cuda_version }}
run: |
[ "${{github.event.inputs.version}}" != "" ] && matrix="[\"frankleeeee/${{github.event.inputs.version}}\"]"
[ "${{github.event.inputs.version}}" == "" ] || [ "${{github.event.inputs.version}}" == "all" ] && \
matrix="[\"frankleeeee/pytorch-cuda:1.9.0-11.1.1\", \"frankleeeee/pytorch-cuda:1.8.1-11.1.1\", \"frankleeeee/pytorch-cuda:1.7.1-11.0.3\", \"frankleeeee/pytorch-cuda:1.6.0-10.2\"]"
echo $matrix
echo "::set-output name=matrix::{\"container\":$(echo $matrix)}"
IFS=','
DOCKER_IMAGE=()

for tv in $TORCH_VERSIONS
do
for cv in $CUDA_VERSIONS
do
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"")
done
done

container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
container="[${container}]"
echo "$container"
echo "::set-output name=matrix::{\"container\":$(echo "$container")}"

build:
name: Test for PyTorch Compatibility
Expand All @@ -46,16 +55,16 @@ jobs:
steps:
- name: Install dependencies
run: |
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install -U pip setuptools wheel --user
- uses: actions/checkout@v2
- name: Install Colossal-AI
run: |
pip install -r requirements/requirements.txt
pip install -r requirements/requirements-test.txt
pip install -v --no-cache-dir .
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
PYTHONPATH=$PWD pytest tests
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
41 changes: 27 additions & 14 deletions .github/workflows/release_bdist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,15 @@ name: Release bdist wheel
on:
workflow_dispatch:
inputs:
torch_version:
type: string
description: torch version, separated by comma
required: true
default: "all"
cuda_version:
type: choice
description: CUDA Version
default: 'all'
type: string
description: cuda version, separated by comma
required: true
options:
- all
- "11.3"
- "11.1"
- "10.2"
github_ref:
type: string
description: Branch or Tag
Expand All @@ -27,12 +26,24 @@ jobs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- id: set-matrix
env:
TORCH_VERSIONS: ${{ inputs.torch_version }}
CUDA_VERSIONS: ${{ inputs.cuda_version }}
run: |
[ "${{github.event.inputs.cuda_version}}" != "" ] && matrix="[\"hpcaitech/cuda-conda:${{github.event.inputs.cuda_version}}\"]"
[ "${{github.event.inputs.cuda_version}}" == "" ] || [ "${{github.event.inputs.cuda_version}}" == "all" ] && \
matrix="[\"hpcaitech/cuda-conda:11.3\", \"hpcaitech/cuda-conda:11.1\", \"hpcaitech/cuda-conda:10.2\"]"
echo $matrix
echo "::set-output name=matrix::{\"container\":$(echo $matrix)}"
echo $TORCH_VERSIONS
echo $CUDA_VERSIONS
IFS=','
DOCKER_IMAGE=()

for cv in $CUDA_VERSIONS
do
DOCKER_IMAGE+=("\"hpcaitech/cuda-conda:${cv}\"")
done

container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
container="[${container}]"
echo "$container"
echo "::set-output name=matrix::{\"container\":$(echo "$container")}"

build:
name: Release bdist wheels
Expand Down Expand Up @@ -62,7 +73,9 @@ jobs:
- name: Build bdist wheel
run: |
pip install beautifulsoup4 requests packaging
python ./build_colossalai_wheel.py
python ./build_colossalai_wheel.py --torch_version $TORCH_VERSIONS
env:
TORCH_VERSIONS: ${{ inputs.torch_version }}
- name: 🚀 Deploy
uses: garygrossgarten/github-action-scp@release
with:
Expand Down
37 changes: 18 additions & 19 deletions .github/workflows/scripts/build_colossalai_wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

def parse_args():
    """
    Parse the command-line options of the wheel build script.

    Returns:
        argparse.Namespace: holds ``torch_version`` (a comma-separated string,
        ``'all'`` when the option is omitted) and ``nightly`` (bool).
    """
    parser = argparse.ArgumentParser()
    # Default to 'all' so that omitting --torch_version never leaves the option as None;
    # main() compares this value against 'all' and otherwise calls .split(',') on it.
    parser.add_argument('--torch_version', type=str, default='all',
                        help="torch versions to build for, separated by comma; 'all' applies no version filter")
    parser.add_argument('--nightly', action='store_true',
                        help='whether this build is for nightly release, if True, will only build on the latest PyTorch version and Python 3.8')
    return parser.parse_args()
Expand Down Expand Up @@ -81,29 +82,27 @@ def main():
args = parse_args()
wheel_info = all_wheel_info()

if args.nightly:
latest_torch_version = list(wheel_info.keys())
# filter wheels on condition
all_torch_versions = list(wheel_info.keys())
def _compare_version(a, b):
if version.parse(a) > version.parse(b):
return 1
else:
return -1

def _compare_version(a, b):
if version.parse(a) > version.parse(b):
return 1
else:
return -1
all_torch_versions.sort(key=cmp_to_key(_compare_version))

latest_torch_version.sort(key=cmp_to_key(_compare_version))

if args.nightly:
# only keep the latest version
for key in latest_torch_version[:-1]:
for key in all_torch_versions[:-1]:
wheel_info.pop(key)

# we only keep python 3.8 for nightly release
for torch_version, cuda_versioned_info in wheel_info.items():
for cuda_version, python_versioned_info in cuda_versioned_info.items():
python_versions = list(python_versioned_info.keys())

for key in python_versions:
if key != '3.8':
python_versioned_info.pop(key)
elif args.torch_version != 'all':
torch_versions = args.torch_version.split(',')
# only keep the torch versions specified
for key in all_torch_versions:
if key not in torch_versions:
wheel_info.pop(key)

build_colossalai(wheel_info)

if __name__ == '__main__':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,19 @@ def handle_gradient(self):
# Pack the buckets.
for param in self._model.parameters():
group = getattr(param, 'pipeline_shared_module_pg', None)
if param.requires_grad and param.grad is not None and group is not None:
if param.requires_grad and group is not None and (
(hasattr(param, 'colo_attr') and not param.colo_attr.saved_grad.is_null())
or param.grad is not None):
tp = param.data.type()
buckets[group][tp].append(param)

# For each bucket, all-reduce and copy all-reduced grads.
for group, group_buckets in buckets.items():
for tp, bucket in group_buckets.items():
grads = [param.grad.data for param in bucket]
grads = [
param.colo_attr.grad_payload if hasattr(param, 'colo_attr') else param.grad.data
for param in bucket
]
coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())
dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
Expand Down
34 changes: 33 additions & 1 deletion colossalai/fx/passes/adding_split_node_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ def pipe_split():


def balanced_split_pass(gm: torch.fx.GraphModule, pp_size: int):
# TODO(lyl): balanced policy V2, split module by node size(weight+bias+output)
"""
In balanced_split_pass, we split module by the size of parameters(weights+bias).
"""
mod_graph = gm.graph
total_param_amount = 0
for param in mod_graph.owning_module.parameters():
Expand Down Expand Up @@ -39,6 +41,36 @@ def balanced_split_pass(gm: torch.fx.GraphModule, pp_size: int):
return gm


def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int):
    """
    In balanced_split_pass_v2, we split module by the size of nodes(weights+bias+outputs).

    Args:
        gm: graph module to annotate; ``pipe_split`` call_function nodes are inserted in place.
        pp_size: number of partitions wanted; at most ``pp_size - 1`` split nodes are inserted.

    Returns:
        The same ``gm`` after recompilation, or the result of ``balanced_split_pass``
        when the per-node meta info is not available.
    """
    mod_graph = gm.graph
    # Snapshot the nodes once: split nodes are inserted into the graph below, and walking
    # the live node list would also visit those new nodes, which carry no ``node_size``.
    nodes = list(mod_graph.nodes)

    # To use balanced_split_pass_v2, we need to run the meta_info_prop interpreter first.
    # If nodes don't have meta info (or the graph has no nodes at all), this pass
    # falls back to the normal balanced split pass.
    if not nodes or 'tensor_meta' not in nodes[0].meta:
        return balanced_split_pass(gm, pp_size)

    total_element_size = sum(node.node_size for node in nodes)
    partition_size = total_element_size // pp_size

    accumulate_node_size = 0
    for node in nodes:
        # Once only one partition is left, no further split is needed.
        if pp_size <= 1:
            break
        accumulate_node_size += node.node_size
        if accumulate_node_size >= partition_size:
            # Close the current partition right after this node and start a new one.
            accumulate_node_size = 0
            pp_size -= 1
            with mod_graph.inserting_after(node):
                mod_graph.create_node('call_function', pipe_split)
    gm.recompile()
    return gm


def uniform_split_pass(gm: torch.fx.GraphModule, pp_size: int):
mod_graph = gm.graph
valid_children_size = 0
Expand Down
21 changes: 19 additions & 2 deletions colossalai/fx/passes/meta_info_prop.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ class MetaInfoProp(torch.fx.Interpreter):

def run_node(self, n: Node) -> Any:
result = super().run_node(n)

found_tensor = False

def extract_tensor_meta(obj):
Expand All @@ -83,7 +82,25 @@ def extract_tensor_meta(obj):
n.meta['tensor_meta'] = meta
else:
n.meta['tensor_meta'] = TensorMetadata(None, None, False, None, 0)

# counting the total size of node outputs
total_node_size = 0
if isinstance(n.meta['tensor_meta'], TensorMetadata):
total_node_size += n.meta['tensor_meta'].numel
else:
for element in n.meta['tensor_meta']:
assert isinstance(
element, TensorMetadata
), f"``n.meta['tensor_meta']`` should be either TensorMetadata or a tuple of TensorMetadata."
total_node_size += element.numel
# counting the total size of parameters
total_param_size = 0
if n.op == 'call_module':
target_module = n.graph.owning_module.get_submodule(n.target)
for param in target_module.parameters():
total_param_size += param.numel()

total_node_size += total_param_size
n.node_size = total_node_size
n.meta['type'] = type(result)
return result

Expand Down
Loading