From 91e83c1d8706dfe8c8c2e57fb775733726485805 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 10:37:41 +0800 Subject: [PATCH 01/12] [workflow] refactored the example check workflow --- ...eekly_check.yml => auto_example_check.yml} | 52 +++++++++++-------- ...example.yml => dispatch_example_check.yml} | 37 ++++++------- .../example_checks/check_dispatch_inputs.py | 27 ++++++++++ .../check_example_weekly.py} | 9 ++-- .../detect_changed_example.py} | 11 ++-- .../workflows/scripts/input_check_example.py | 23 -------- 6 files changed, 83 insertions(+), 76 deletions(-) rename .github/workflows/{changed_file_trigger_examples_check_and_weekly_check.yml => auto_example_check.yml} (70%) rename .github/workflows/{workflow_dispatch_example.yml => dispatch_example_check.yml} (60%) create mode 100644 .github/workflows/scripts/example_checks/check_dispatch_inputs.py rename .github/workflows/scripts/{weekly_check_example.py => example_checks/check_example_weekly.py} (76%) rename .github/workflows/scripts/{changed_example.py => example_checks/detect_changed_example.py} (52%) delete mode 100644 .github/workflows/scripts/input_check_example.py diff --git a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml b/.github/workflows/auto_example_check.yml similarity index 70% rename from .github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml rename to .github/workflows/auto_example_check.yml index 2b7ec31252e4..adc8a46f43c5 100644 --- a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml +++ b/.github/workflows/auto_example_check.yml @@ -1,7 +1,7 @@ name: Test Example on: pull_request: - # So only the changes in examples folder will trigger jobs below. + # any change in the examples folder will trigger check for the corresponding example. 
paths: - 'examples/**' # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00 @@ -17,12 +17,14 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' runs-on: ubuntu-latest outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + matrix: ${{ steps.setup-matrix.outputs.matrix }} + anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }} name: Check out all files steps: - uses: actions/checkout@v3 with: - fetch-depth: 2 + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} - name: Get all changed example files id: changed-files uses: tj-actions/changed-files@v35 @@ -30,21 +32,27 @@ jobs: with: since_last_remote_commit: true - name: setup matrix - id: set-matrix + id: setup-matrix run: | changedFileName="" for file in ${{ steps.changed-files.outputs.all_changed_files }}; do changedFileName="${file}:${changedFileName}" done echo "$changedFileName was changed" - res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName` - echo "All changed files are $res" - loc=$( IFS=',' ; echo "${res[*]}" ) - echo "$loc" - echo "::set-output name=matrix::{\"loc\":$(echo "$loc")}" + res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName` + echo "All changed examples are $res" + + if [ "$res" = "[]" ]; then + echo "anyChanged=false" >> $GITHUB_OUTPUT + echo "matrix=null" >> $GITHUB_OUTPUT + else + dirs=$( IFS=',' ; echo "${res[*]}" ) + echo "anyChanged=true" >> $GITHUB_OUTPUT + echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT + fi # If no file is changed, it will prompt an error and shows the matrix do not have value. - check-all-changed-files: + check-changed-example: # Add this condition to avoid executing this job if the trigger event is workflow_dispatch. 
if: | github.event.pull_request.draft == false && @@ -57,18 +65,16 @@ jobs: matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ steps: - uses: actions/checkout@v3 - with: - fetch-depth: 2 - name: Install dependancies run: | - pip install -r ./requirements/requirements.txt pip install colossalai - name: List all changed example files run: | - res=${{ matrix.loc }} - cd "${PWD}/examples/${res}" + example_dir=${{ matrix.directory }} + cd "${PWD}/examples/${example_dir}" bash test_ci.sh # This is for all files' weekly check. Specifically, this job is to find all the directories. @@ -80,17 +86,17 @@ jobs: name: Prepare Directory List for All files runs-on: ubuntu-latest outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + matrix: ${{ steps.setup-matrix.outputs.matrix }} steps: - name: 📚 Checkout uses: actions/checkout@v3 - name: setup matrix - id: set-matrix + id: setup-matrix run: | - res=`python .github/workflows/scripts/weekly_check_example.py` + res=`python .github/workflows/scripts/example_checks/check_example_weekly.py` all_loc=$( IFS=',' ; echo "${res[*]}" ) - echo "$all_loc" - echo "::set-output name=matrix::{\"all_loc\":$(echo "$all_loc")}" + echo "Found the examples: $all_loc" + echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT weekly_check: if: | @@ -113,7 +119,7 @@ jobs: pip install colossalai - name: Traverse all files run: | - dir=${{ matrix.all_loc }} - echo "${dir} is current directory" - cd "${PWD}/examples/${dir}" + example_dir=${{ matrix.diretory }} + echo "Testing ${example_dir} now" + cd "${PWD}/examples/${example_dir}" bash test_ci.sh diff --git a/.github/workflows/workflow_dispatch_example.yml b/.github/workflows/dispatch_example_check.yml similarity index 60% rename from .github/workflows/workflow_dispatch_example.yml rename to .github/workflows/dispatch_example_check.yml index 
d9d5769109a3..90cc44ab7163 100644 --- a/.github/workflows/workflow_dispatch_example.yml +++ b/.github/workflows/dispatch_example_check.yml @@ -8,7 +8,7 @@ on: required: true jobs: - manual_check_matrix_preparation: + matrix_preparation: if: | github.event.pull_request.draft == false && github.base_ref == 'main' && @@ -16,31 +16,24 @@ jobs: name: Check the examples user want runs-on: ubuntu-latest outputs: - matrix: ${{ steps.set-matrix-1.outputs.matrix }} + matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Get manual directories - id: set-matrix-1 + - name: Set up matrix for inputs + id: set-matrix env: check_dir: ${{ inputs.example_directory }} run: | - all_mannual_check_dir=() - for cdi in $check_dir - do - all_mannual_check_dir+=("\"${cdi}\"") - done - man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" ) - res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc` - echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist." 
- if [ res == -1 ];then - exit(1) + res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir` + if [ "$res" == "failure" ];then + exit -1 fi - man_loc="[${man_loc}]" - echo "$man_loc" - echo "::set-output name=matrix::{\"man_loc\":$(echo "$man_loc")}" + dirs="[${check_dir}]" + echo "Testing examples in $dirs" + echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT - manual_check: + test_example: if: | github.event.pull_request.draft == false && github.base_ref == 'main' && @@ -52,16 +45,16 @@ jobs: matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ steps: - name: 📚 Checkout uses: actions/checkout@v3 - name: Install the requirements run: | - pip install -r ./requirements/requirements.txt pip install colossalai - - name: Traverse all files + - name: Test the example run: | - dir=${{ matrix.man_loc }} - echo "${dir} is current directory" + dir=${{ matrix.directory }} + echo "Testing ${dir} now" cd "${PWD}/examples/${dir}" bash test_ci.sh diff --git a/.github/workflows/scripts/example_checks/check_dispatch_inputs.py b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py new file mode 100644 index 000000000000..04d2063ec5fc --- /dev/null +++ b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py @@ -0,0 +1,27 @@ +import argparse +import os + + +def check_inputs(input_list): + for path in input_list: + real_path = os.path.join('examples', path) + if not os.path.exists(real_path): + return False + return True + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fileNameList', type=str, help="List of file names") + args = parser.parse_args() + name_list = args.fileNameList.split(",") + is_correct = check_inputs(name_list) + + if is_correct: + print('success') + else: + print('failure') + + +if __name__ == '__main__': + 
main() diff --git a/.github/workflows/scripts/weekly_check_example.py b/.github/workflows/scripts/example_checks/check_example_weekly.py similarity index 76% rename from .github/workflows/scripts/weekly_check_example.py rename to .github/workflows/scripts/example_checks/check_example_weekly.py index dfedc46287f2..941e90901f3d 100644 --- a/.github/workflows/scripts/weekly_check_example.py +++ b/.github/workflows/scripts/example_checks/check_example_weekly.py @@ -5,9 +5,9 @@ def show_files(path, all_files): # Traverse all the folder/file in current directory file_list = os.listdir(path) # Determine the element is folder or file. If file, pass it into list, if folder, recurse. - for file in file_list: + for file_name in file_list: # Get the abs directory using os.path.join() and store into cur_path. - cur_path = os.path.join(path, file) + cur_path = os.path.join(path, file_name) # Determine whether folder if os.path.isdir(cur_path): show_files(cur_path, all_files) @@ -26,9 +26,8 @@ def main(): for file_loc in contents: split_loc = file_loc.split('/') # must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not. 
- if len(split_loc) - split_loc.index('examples') >= 3: - tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)] - re_loc = join(tmp_loc, '/') + if len(split_loc) >= 4: + re_loc = '/'.join(split_loc[1:3]) if re_loc not in all_loc: all_loc.append(re_loc) print(all_loc) diff --git a/.github/workflows/scripts/changed_example.py b/.github/workflows/scripts/example_checks/detect_changed_example.py similarity index 52% rename from .github/workflows/scripts/changed_example.py rename to .github/workflows/scripts/example_checks/detect_changed_example.py index ac2f0864eb72..df4fd67368fc 100644 --- a/.github/workflows/scripts/changed_example.py +++ b/.github/workflows/scripts/example_checks/detect_changed_example.py @@ -3,14 +3,19 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument('--fileNameList', type=str) + parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files") args = parser.parse_args() name_list = args.fileNameList.split(":") folder_need_check = set() for loc in name_list: - # Find only the sub-folder of 'example' folder + # Find only the sub-sub-folder of 'example' folder + # the examples folder structure is like + # - examples + # - area + # - application + # - file if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4: - folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2]) + folder_need_check.add('/'.join(loc.split("/")[1:3])) # Output the result using print. Then the shell can get the values. 
print(list(folder_need_check)) diff --git a/.github/workflows/scripts/input_check_example.py b/.github/workflows/scripts/input_check_example.py deleted file mode 100644 index 5602d8f0904a..000000000000 --- a/.github/workflows/scripts/input_check_example.py +++ /dev/null @@ -1,23 +0,0 @@ -import argparse -import os - - -def detect_correct(loc_li): - for loc in loc_li: - real_loc = 'examples/' + eval(loc) - if not os.path.exists(real_loc): - return -1 - return 1 - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--fileNameList', type=str) - args = parser.parse_args() - name_list = args.fileNameList.split(",") - result = detect_correct(name_list) - print(result) - - -if __name__ == '__main__': - main() From 9d57960df5e301f3f7785b5bb4023ed62e9b224f Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 10:40:17 +0800 Subject: [PATCH 02/12] polish code --- examples/tutorial/hybrid_parallel/test_ci.sh | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 examples/tutorial/hybrid_parallel/test_ci.sh diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh new file mode 100644 index 000000000000..c56a09bd899a --- /dev/null +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +colossalai run --nproc_per_node 4 train.py --config config.py -s \ No newline at end of file From 7b5595bf6cddc544a684709153da6a77d7fee710 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 10:50:40 +0800 Subject: [PATCH 03/12] polish code --- .github/workflows/auto_example_check.yml | 15 +++++++-------- .github/workflows/dispatch_example_check.yml | 6 +++--- examples/tutorial/hybrid_parallel/test_ci.sh | 8 +++++++- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml index adc8a46f43c5..eb5265cbfccc 100644 --- a/.github/workflows/auto_example_check.yml +++ 
b/.github/workflows/auto_example_check.yml @@ -19,7 +19,7 @@ jobs: outputs: matrix: ${{ steps.setup-matrix.outputs.matrix }} anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }} - name: Check out all files + name: Detect changed example files steps: - uses: actions/checkout@v3 with: @@ -58,7 +58,7 @@ jobs: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' - name: Test each changed example files + name: Test the changed example needs: detect-changed-example runs-on: [self-hosted, gpu] strategy: @@ -68,10 +68,10 @@ jobs: options: --gpus all --rm -v /data/scratch/examples-data:/data/ steps: - uses: actions/checkout@v3 - - name: Install dependancies + - name: Install Colossal-AI run: | - pip install colossalai - - name: List all changed example files + pip install -v . + - name: Test the example run: | example_dir=${{ matrix.directory }} cd "${PWD}/examples/${example_dir}" @@ -113,10 +113,9 @@ jobs: steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Install the requirements + - name: Install Colossal-AI run: | - pip install -r ./requirements/requirements.txt - pip install colossalai + pip install -v . - name: Traverse all files run: | example_dir=${{ matrix.diretory }} diff --git a/.github/workflows/dispatch_example_check.yml b/.github/workflows/dispatch_example_check.yml index 90cc44ab7163..ea11e79980ba 100644 --- a/.github/workflows/dispatch_example_check.yml +++ b/.github/workflows/dispatch_example_check.yml @@ -20,7 +20,7 @@ jobs: steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Set up matrix for inputs + - name: Set up matrix id: set-matrix env: check_dir: ${{ inputs.example_directory }} @@ -49,9 +49,9 @@ jobs: steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Install the requirements + - name: Install Colossal-AI run: | - pip install colossalai + pip install -v . 
- name: Test the example run: | dir=${{ matrix.directory }} diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index c56a09bd899a..c8d8813ee4f0 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -1,3 +1,9 @@ #!/bin/bash -colossalai run --nproc_per_node 4 train.py --config config.py -s \ No newline at end of file +colossalai run --nproc_per_node 4 train.py --config config.py -s + +ret=$? +if [ $ret -ne 0 ]; then + "This example failed, please fix the bugs above." + exit -1 +fi \ No newline at end of file From 3e8ca8fa1c4bec732f4a66327d6433d232797c3b Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 10:52:39 +0800 Subject: [PATCH 04/12] polish code --- examples/tutorial/hybrid_parallel/requirements.txt | 1 + examples/tutorial/hybrid_parallel/test_ci.sh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/examples/tutorial/hybrid_parallel/requirements.txt b/examples/tutorial/hybrid_parallel/requirements.txt index 137a69e80498..dbf6aaf3e4e2 100644 --- a/examples/tutorial/hybrid_parallel/requirements.txt +++ b/examples/tutorial/hybrid_parallel/requirements.txt @@ -1,2 +1,3 @@ colossalai >= 0.1.12 torch >= 1.8.1 +titans \ No newline at end of file diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index c8d8813ee4f0..86412367c13b 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -1,5 +1,7 @@ #!/bin/bash +pip install -r requirements.txt + colossalai run --nproc_per_node 4 train.py --config config.py -s ret=$? 
From 75c98616f8e392122e32550a1baea0bfd16a85dc Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 10:54:11 +0800 Subject: [PATCH 05/12] polish code --- examples/tutorial/hybrid_parallel/test_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index 86412367c13b..a3c048594ddf 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -1,5 +1,5 @@ #!/bin/bash - +set -e pip install -r requirements.txt colossalai run --nproc_per_node 4 train.py --config config.py -s From fd3e0c92df8ba3f44f7b7b6055353f7f8b4f4692 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 10:57:04 +0800 Subject: [PATCH 06/12] polish code --- .github/workflows/auto_example_check.yml | 4 ++++ .github/workflows/dispatch_example_check.yml | 2 ++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml index eb5265cbfccc..8f580a0c77af 100644 --- a/.github/workflows/auto_example_check.yml +++ b/.github/workflows/auto_example_check.yml @@ -76,6 +76,8 @@ jobs: example_dir=${{ matrix.directory }} cd "${PWD}/examples/${example_dir}" bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 # This is for all files' weekly check. Specifically, this job is to find all the directories. 
matrix_preparation: @@ -122,3 +124,5 @@ jobs: echo "Testing ${example_dir} now" cd "${PWD}/examples/${example_dir}" bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/dispatch_example_check.yml b/.github/workflows/dispatch_example_check.yml index ea11e79980ba..f6bef5732542 100644 --- a/.github/workflows/dispatch_example_check.yml +++ b/.github/workflows/dispatch_example_check.yml @@ -58,3 +58,5 @@ jobs: echo "Testing ${dir} now" cd "${PWD}/examples/${dir}" bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 From 21f85e26b994147c0a764cafbb05ec94ee6e753a Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 10:58:02 +0800 Subject: [PATCH 07/12] polish code --- examples/tutorial/hybrid_parallel/test_ci.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index a3c048594ddf..c7430da5303c 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -1,11 +1,6 @@ #!/bin/bash -set -e +set -euxo pipefail + pip install -r requirements.txt colossalai run --nproc_per_node 4 train.py --config config.py -s - -ret=$? -if [ $ret -ne 0 ]; then - "This example failed, please fix the bugs above." 
- exit -1 -fi \ No newline at end of file From e62fc9acd45c09d9463a780f738a4cbc6f2745fd Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 11:03:58 +0800 Subject: [PATCH 08/12] polish code --- .github/workflows/auto_example_check.yml | 2 ++ .github/workflows/dispatch_example_check.yml | 1 + examples/tutorial/hybrid_parallel/config.py | 4 ++-- examples/tutorial/hybrid_parallel/test_ci.sh | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml index 8f580a0c77af..63d9638dee2c 100644 --- a/.github/workflows/auto_example_check.yml +++ b/.github/workflows/auto_example_check.yml @@ -66,6 +66,7 @@ jobs: container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ + timeout-minutes: 10 steps: - uses: actions/checkout@v3 - name: Install Colossal-AI @@ -112,6 +113,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + timeout-minutes: 10 steps: - name: 📚 Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/dispatch_example_check.yml b/.github/workflows/dispatch_example_check.yml index f6bef5732542..e0333422f50d 100644 --- a/.github/workflows/dispatch_example_check.yml +++ b/.github/workflows/dispatch_example_check.yml @@ -46,6 +46,7 @@ jobs: container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ + timeout-minutes: 10 steps: - name: 📚 Checkout uses: actions/checkout@v3 diff --git a/examples/tutorial/hybrid_parallel/config.py b/examples/tutorial/hybrid_parallel/config.py index 2450ab1c7a72..ac273c305006 100644 --- a/examples/tutorial/hybrid_parallel/config.py +++ b/examples/tutorial/hybrid_parallel/config.py @@ -6,8 +6,8 @@ BATCH_SIZE = 256 LEARNING_RATE = 3e-3 WEIGHT_DECAY = 0.3 -NUM_EPOCHS = 10 -WARMUP_EPOCHS = 3 +NUM_EPOCHS = 2 +WARMUP_EPOCHS = 1 # model 
config IMG_SIZE = 224 diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index c7430da5303c..6bf7c1217e41 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -1,6 +1,6 @@ #!/bin/bash set -euxo pipefail -pip install -r requirements.txt +# pip install -r requirements.txt colossalai run --nproc_per_node 4 train.py --config config.py -s From 7dc4defeed333318e49925605cfe12f4c634c2bd Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 11:11:20 +0800 Subject: [PATCH 09/12] polish code --- .github/workflows/auto_example_check.yml | 2 +- examples/tutorial/hybrid_parallel/test_ci.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml index 63d9638dee2c..7f1e357e33e8 100644 --- a/.github/workflows/auto_example_check.yml +++ b/.github/workflows/auto_example_check.yml @@ -86,7 +86,7 @@ jobs: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule' - name: Prepare Directory List for All files + name: Prepare matrix for weekly check runs-on: ubuntu-latest outputs: matrix: ${{ steps.setup-matrix.outputs.matrix }} diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index 6bf7c1217e41..60b18be5f7aa 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -3,4 +3,5 @@ set -euxo pipefail # pip install -r requirements.txt -colossalai run --nproc_per_node 4 train.py --config config.py -s +python -c "import titans" +# colossalai run --nproc_per_node 4 train.py --config config.py -s From 27b02c07d50728c53fff24fc020c0e8903ef2d19 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 11:13:01 +0800 Subject: [PATCH 10/12] polish 
code --- examples/tutorial/hybrid_parallel/test_ci.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index 60b18be5f7aa..e513f6cd4b4a 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -3,5 +3,4 @@ set -euxo pipefail # pip install -r requirements.txt -python -c "import titans" -# colossalai run --nproc_per_node 4 train.py --config config.py -s +torchrun --standalone --nproc_per_node 4 train.py --config config.py -s From ac8ac461ad4430de64be68db329d886003dd8513 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 11:17:53 +0800 Subject: [PATCH 11/12] polish code --- examples/tutorial/hybrid_parallel/test_ci.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index e513f6cd4b4a..8860b72a2fb3 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -1,6 +1,5 @@ #!/bin/bash set -euxo pipefail -# pip install -r requirements.txt - +pip install -r requirements.txt torchrun --standalone --nproc_per_node 4 train.py --config config.py -s From 4fedc8362aa7ba482b74a9f15369d5c335d37c02 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 10 Jan 2023 11:20:31 +0800 Subject: [PATCH 12/12] polish code --- examples/tutorial/hybrid_parallel/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py index 0f2a207cb172..2a8576db747b 100644 --- a/examples/tutorial/hybrid_parallel/train.py +++ b/examples/tutorial/hybrid_parallel/train.py @@ -98,9 +98,9 @@ def main(): root = os.environ.get('DATA', '../data') if args.synthetic: # if we use synthetic dataset - # we train for 30 steps and eval for 10 steps per epoch - train_dataloader 
= DummyDataloader(length=30, batch_size=gpc.config.BATCH_SIZE) - test_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE) + # we train for 10 steps and eval for 5 steps per epoch + train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE) + test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE) else: train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)