Cypher30 · Cypher30 · Jul 15, 2022 · Jul 14, 2022 · Jul 14, 2022 · Jul 14, 2022
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -14,7 +14,7 @@ jobs:
         contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:1.10.1-11.3.0
+      image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 40
     steps:

diff --git a/.github/workflows/build_gpu_8.yml b/.github/workflows/build_gpu_8.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Unit Testing
         run: |
           gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
-          [ "$gpu_used" -gt "100" ] && PYTHONPATH=$PWD pytest tests
+          [ "$gpu_used" -le "100" ] && PYTHONPATH=$PWD pytest tests
         env:
           DATA: /data/scratch/cifar-10
 
diff --git a/.github/workflows/compatibility_test.yml b/.github/workflows/compatibility_test.yml
@@ -3,18 +3,14 @@ name: Compatibility Test
 on:
   workflow_dispatch:
     inputs:
-      version:
-        type: choice
-        description: version for testing
-        default: 'all'
+      torch_version:
+        type: string
+        description: torch version, separated by comma
+        required: true
+      cuda_version:
+        type: string
+        description: cuda version, separated by comma
         required: true
-        options:
-        - all
-        - pytorch-cuda:1.9.0-11.1.1 # python 3.8
-        - pytorch-cuda:1.8.1-11.1.1 # python 3.8
-        - pytorch-cuda:1.7.1-11.0.3 # python 3.8
-        - pytorch-cuda:1.6.0-10.2 # python 3.6
-
 
 jobs:
   matrix_preparation:
@@ -24,12 +20,25 @@ jobs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
     - id: set-matrix
+      env:
+        TORCH_VERSIONS: ${{ inputs.torch_version }}
+        CUDA_VERSIONS: ${{ inputs.cuda_version }}
       run: |
-        [ "${{github.event.inputs.version}}" != "" ] && matrix="[\"frankleeeee/${{github.event.inputs.version}}\"]"
-        [ "${{github.event.inputs.version}}" == "" ] || [ "${{github.event.inputs.version}}" == "all" ] && \
-          matrix="[\"frankleeeee/pytorch-cuda:1.9.0-11.1.1\", \"frankleeeee/pytorch-cuda:1.8.1-11.1.1\", \"frankleeeee/pytorch-cuda:1.7.1-11.0.3\", \"frankleeeee/pytorch-cuda:1.6.0-10.2\"]"
-        echo $matrix
-        echo "::set-output name=matrix::{\"container\":$(echo $matrix)}"
+        IFS=','
+        DOCKER_IMAGE=()
+
+        for tv in $TORCH_VERSIONS
+        do
+            for cv in $CUDA_VERSIONS
+            do
+                DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"")
+            done
+        done
+
+        container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
+        container="[${container}]"
+        echo "$container"
+        echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
 
   build:
     name: Test for PyTorch Compatibility
@@ -46,16 +55,16 @@ jobs:
     steps:
       - name: Install dependencies
         run: |
-          pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
           pip install -U pip setuptools wheel --user
       - uses: actions/checkout@v2
       - name: Install Colossal-AI      
         run: |
           pip install -r requirements/requirements.txt
-          pip install -r requirements/requirements-test.txt
           pip install -v --no-cache-dir .
+          pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |
           PYTHONPATH=$PWD pytest tests
         env:
           DATA: /data/scratch/cifar-10
+          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/release_bdist.yml b/.github/workflows/release_bdist.yml
@@ -3,16 +3,15 @@ name: Release bdist wheel
 on:
   workflow_dispatch:
     inputs:
+      torch_version:
+        type: string
+        description: torch version, separated by comma
+        required: true
+        default: "all"
       cuda_version:
-        type: choice
-        description: CUDA Version
-        default: 'all'
+        type: string
+        description: cuda version, separated by comma
         required: true
-        options:
-        - all
-        - "11.3"
-        - "11.1"
-        - "10.2"
       github_ref:
         type: string
         description: Branch or Tag
@@ -27,12 +26,24 @@ jobs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
     - id: set-matrix
+      env:
+        TORCH_VERSIONS: ${{ inputs.torch_version }}
+        CUDA_VERSIONS: ${{ inputs.cuda_version }}
       run: |
-        [ "${{github.event.inputs.cuda_version}}" != "" ] && matrix="[\"hpcaitech/cuda-conda:${{github.event.inputs.cuda_version}}\"]"
-        [ "${{github.event.inputs.cuda_version}}" == "" ] || [ "${{github.event.inputs.cuda_version}}" == "all" ] && \
-          matrix="[\"hpcaitech/cuda-conda:11.3\", \"hpcaitech/cuda-conda:11.1\", \"hpcaitech/cuda-conda:10.2\"]"
-        echo $matrix
-        echo "::set-output name=matrix::{\"container\":$(echo $matrix)}"
+        echo $TORCH_VERSIONS
+        echo $CUDA_VERSIONS
+        IFS=','
+        DOCKER_IMAGE=()
+
+        for cv in $CUDA_VERSIONS
+        do
+            DOCKER_IMAGE+=("\"hpcaitech/cuda-conda:${cv}\"")
+        done
+
+        container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
+        container="[${container}]"
+        echo "$container"
+        echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
 
   build:
     name: Release bdist wheels
@@ -62,7 +73,9 @@ jobs:
       - name: Build bdist wheel
         run: |
           pip install beautifulsoup4 requests packaging
-          python ./build_colossalai_wheel.py
+          python ./build_colossalai_wheel.py --torch_version $TORCH_VERSIONS
+        env:
+          TORCH_VERSIONS: ${{ inputs.torch_version }}
       - name: 🚀 Deploy
         uses: garygrossgarten/github-action-scp@release
         with:

diff --git a/.github/workflows/scripts/build_colossalai_wheel.py b/.github/workflows/scripts/build_colossalai_wheel.py
@@ -15,6 +15,7 @@
 
 def parse_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument('--torch_version', type=str, default='all', help='comma-separated torch versions to build for, or "all" to build for every version')
     parser.add_argument('--nightly', action='store_true', 
         help='whether this build is for nightly release, if True, will only build on the latest PyTorch version and Python 3.8')
     return parser.parse_args()
@@ -81,29 +82,27 @@ def main():
     args = parse_args()
     wheel_info = all_wheel_info()
 
-    if args.nightly:
-        latest_torch_version = list(wheel_info.keys())
+    # filter wheels on condition
+    all_torch_versions = list(wheel_info.keys())
+    def _compare_version(a, b):
+        if version.parse(a) > version.parse(b):
+            return 1
+        else:
+            return -1
 
-        def _compare_version(a, b):
-            if version.parse(a) > version.parse(b):
-                return 1
-            else:
-                return -1
+    all_torch_versions.sort(key=cmp_to_key(_compare_version))
 
-        latest_torch_version.sort(key=cmp_to_key(_compare_version))
-
+    if args.nightly:
         # only keep the latest version
-        for key in latest_torch_version[:-1]:
+        for key in all_torch_versions[:-1]:
             wheel_info.pop(key)
-
-        # we only keep python 3.8 for nightly release
-        for torch_version, cuda_versioned_info in wheel_info.items():
-            for cuda_version, python_versioned_info in cuda_versioned_info.items():
-                python_versions = list(python_versioned_info.keys())
-
-                for key in python_versions:
-                    if key != '3.8':
-                        python_versioned_info.pop(key)
+    elif args.torch_version != 'all':
+        torch_versions = args.torch_version.split(',')
+        # only keep the torch versions specified
+        for key in all_torch_versions:
+            if key not in torch_versions:
+                wheel_info.pop(key)
+
     build_colossalai(wheel_info)
 
 if __name__ == '__main__':

diff --git a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
@@ -33,14 +33,19 @@ def handle_gradient(self):
             # Pack the buckets.
             for param in self._model.parameters():
                 group = getattr(param, 'pipeline_shared_module_pg', None)
-                if param.requires_grad and param.grad is not None and group is not None:
+                if param.requires_grad and group is not None and (
+                    (hasattr(param, 'colo_attr') and not param.colo_attr.saved_grad.is_null())
+                        or param.grad is not None):
                     tp = param.data.type()
                     buckets[group][tp].append(param)
 
             # For each bucket, all-reduce and copy all-reduced grads.
             for group, group_buckets in buckets.items():
                 for tp, bucket in group_buckets.items():
-                    grads = [param.grad.data for param in bucket]
+                    grads = [
+                        param.colo_attr.grad_payload if hasattr(param, 'colo_attr') else param.grad.data
+                        for param in bucket
+                    ]
                     coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())
                     dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
                     for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):

diff --git a/colossalai/fx/passes/adding_split_node_pass.py b/colossalai/fx/passes/adding_split_node_pass.py
@@ -10,7 +10,9 @@ def pipe_split():
 
 
 def balanced_split_pass(gm: torch.fx.GraphModule, pp_size: int):
-    # TODO(lyl): balanced policy V2, split module by node size(weight+bias+output)
+    """
+    In balanced_split_pass, we split module by the size of parameters(weights+bias).
+    """
     mod_graph = gm.graph
     total_param_amount = 0
     for param in mod_graph.owning_module.parameters():
@@ -39,6 +41,36 @@ def balanced_split_pass(gm: torch.fx.GraphModule, pp_size: int):
     return gm
 
 
+def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int):
+    """
+    In balanced_split_pass_v2, we split module by the size of nodes(weights+bias+outputs).
+    """
+    mod_graph = gm.graph
+    # To use balanced_split_pass_v2, the MetaInfoProp interpreter must be run on the graph first.
+    # If the nodes don't carry meta info, this pass falls back to the normal balanced split pass.
+    check_node = list(mod_graph.nodes)[0]    # only the first node is probed for meta info
+    if 'tensor_meta' not in check_node.meta:
+        return balanced_split_pass(gm, pp_size)
+
+    total_element_size = 0
+    for node in mod_graph.nodes:
+        total_element_size += node.node_size
+
+    partition_size = total_element_size // pp_size    # size threshold that closes a partition
+    accumulate_node_size = 0
+    for node in mod_graph.nodes:
+        if pp_size <= 1:    # the remaining nodes all belong to the last partition
+            break
+        accumulate_node_size += node.node_size
+        if accumulate_node_size >= partition_size:
+            accumulate_node_size = 0
+            pp_size -= 1
+            with mod_graph.inserting_after(node):
+                mod_graph.create_node('call_function', pipe_split)    # split marker placed after ``node``
+    gm.recompile()
+    return gm
+
+
 def uniform_split_pass(gm: torch.fx.GraphModule, pp_size: int):
     mod_graph = gm.graph
     valid_children_size = 0

diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py
@@ -67,7 +67,6 @@ class MetaInfoProp(torch.fx.Interpreter):
 
     def run_node(self, n: Node) -> Any:
         result = super().run_node(n)
-
         found_tensor = False
 
         def extract_tensor_meta(obj):
@@ -83,7 +82,25 @@ def extract_tensor_meta(obj):
             n.meta['tensor_meta'] = meta
         else:
             n.meta['tensor_meta'] = TensorMetadata(None, None, False, None, 0)
-
+        # counting the total size of node outputs
+        total_node_size = 0
+        if isinstance(n.meta['tensor_meta'], TensorMetadata):
+            total_node_size += n.meta['tensor_meta'].numel
+        else:
+            for element in n.meta['tensor_meta']:
+                assert isinstance(
+                    element, TensorMetadata
+                ), f"``n.meta['tensor_meta']`` should be either TensorMetadata or a tuple of TensorMetadata."
+                total_node_size += element.numel
+        # counting the total size of parameters
+        total_param_size = 0
+        if n.op == 'call_module':
+            target_module = n.graph.owning_module.get_submodule(n.target)
+            for param in target_module.parameters():
+                total_param_size += param.numel()
+
+        total_node_size += total_param_size
+        n.node_size = total_node_size
         n.meta['type'] = type(result)
         return result