From 8337779cd1ef40ab23aad219e5ac92444d410086 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Fri, 3 Feb 2023 10:31:15 +0800 Subject: [PATCH 1/2] [workflow] fixed example check workflow --- .github/workflows/README.md | 107 ++++++++++-------- .../workflows/example_check_on_dispatch.yml | 64 +++++++++++ .github/workflows/example_check_on_pr.yml | 91 +++++++++++++++ .../workflows/example_check_on_schedule.yml | 57 ++++++++++ 4 files changed, 269 insertions(+), 50 deletions(-) create mode 100644 .github/workflows/example_check_on_dispatch.yml create mode 100644 .github/workflows/example_check_on_pr.yml create mode 100644 .github/workflows/example_check_on_schedule.yml diff --git a/.github/workflows/README.md b/.github/workflows/README.md index cda6a3139a1b..980f7b5701ce 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -6,13 +6,14 @@ - [Table of Contents](#table-of-contents) - [Overview](#overview) - [Workflows](#workflows) - - [Checks on Pull Requests](#checks-on-pull-requests) - - [Regular Checks](#regular-checks) + - [Code Style Check](#code-style-check) + - [Unit Test](#unit-test) + - [Example Test](#example-test) + - [Dispatch Example Test](#dispatch-example-test) + - [Compatibility Test](#compatibility-test) + - [Compatibility Test](#compatibility-test-1) - [Release](#release) - - [Manual Dispatch](#manual-dispatch) - [Release bdist wheel](#release-bdist-wheel) - - [Dispatch Example Test](#dispatch-example-test) - - [Compatibility Test](#compatibility-test) - [User Friendliness](#user-friendliness) - [Configuration](#configuration) - [Progress Log](#progress-log) @@ -26,25 +27,54 @@ In the section below, we will dive into the details of different workflows avail ## Workflows -### Checks on Pull Requests +Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow. +I will provide the details of each workflow below. 
+ +### Code Style Check + +| Workflow Name | File name | Description | +| --------------------------- | ------------------------------ | ---------------------------------------------------------------------------------------------------------- | +| `Pre-commit` | `pre_commit.yml` | This workflow runs pre-commit checks for code style consistency for PRs. | +| `Report pre-commit failure` | `report_precommit_failure.yml` | This workflow will put up a comment in the PR to explain the pre-commit failure and the remedy if `Pre-commit` fails. | + +### Unit Test + +| Workflow Name | File name | Description | +| ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Build` | `build.yml` | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. | +| `Build on 8 GPUs` | `build_gpu_8.yml` | This workflow will run the unit tests every day with 8 GPUs. | +| `Report test coverage` | `report_test_coverage.yml` | This workflow will put up a comment to report the test coverage results when `Build` is done. | + +### Example Test + +| Workflow Name | File name | Description | +| -------------------------- | ------------------------------- | --------------------------------------------------------------------------- | +| `Test Example on PR` | `example_check_on_pr.yml` | The example will be automatically tested if its files are changed in the PR. | +| `Test Example on Schedule` | `example_check_on_schedule.yml` | This workflow will test all examples every Sunday. | +| `Test Example on Dispatch` | `example_check_on_dispatch.yml` | Manually test a specified example. | + +#### Dispatch Example Test + +Parameters: +- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by comma. For example, language/gpt, images/vit. 
+ Inputting only `language` or only `gpt` does not work. + +### Compatibility Test -| Workflow Name | File name | Description | -| --------------------------- | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Build` | `build.yml` | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. | -| `Pre-commit` | `pre_commit.yml` | This workflow runs pre-commit checks for code style consistency. | -| `Report pre-commit failure` | `report_precommit_failure.yml` | This PR will put up a comment in the PR to explain the precommit failure and remedy. This is executed when `Pre-commit` is done | -| `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results. This is executed when `Build` is completed. | -| `Test example` | `auto_example_check.yml` | The example will be automatically tested if its files are changed in the PR | +| Workflow Name | File name | Description | +| ---------------------------- | -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `Compatibility Test` | `auto_compatibility_test.yml` | This workflow will check the compatibility of Colossal-AI against PyTorch and CUDA specified in `.compatibility` every Sunday. | +| `Auto Compatibility Test` | `auto_compatibility_test.yml` | Check Colossal-AI's compatibility when `version.txt` is changed in a PR. | +| `Dispatch Compatiblity Test` | `dispatch_compatiblity_test.yml` | Test PyTorch and Python Compatibility. 
+ | -### Regular Checks -| Workflow Name | File name | Description | -| ----------------------- | ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Test example` | `auto_example_check.yml` | This workflow will test all examples every Sunday | -| `Compatibility Test` | `auto_compatibility_test.yml` | This workflow will check the compatiblity of Colossal-AI against PyTorch and CUDA every Sunday. The PyTorch and CUDA versions are specified in `.compatibility`. | -| `Build on 8 GPUs` | `build_gpu_8.yml` | This workflow will run the unit tests everyday with 8 GPUs. | -| `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. | -| `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. | +#### Compatibility Test + +Parameters: +- `torch version`: torch versions to test against; multiple versions are supported but must be separated by commas. The default value is `all`, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels). +- `cuda version`: cuda versions to test against; multiple versions are supported but must be separated by commas. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda). + +> It only tests the compatibility of the main branch. + ### Release @@ -56,18 +86,8 @@ In the section below, we will dive into the details of different workflows avail | `Release Docker` | `release_docker.yml` | Build and release the Docker image to DockerHub. Triggered when the change of `version.txt` is merged. | | `Release bdist wheel` | `release_bdist.yml` | Build binary wheels with pre-built PyTorch extensions. 
Manually dispatched. See more details in the next section. | | `Auto Release bdist wheel` | `auto_release_bdist.yml` | Build binary wheels with pre-built PyTorch extensions.Triggered when the change of `version.txt` is merged. Build specificatons are stored in `.bdist.json` | -| `Auto Compatibility Test` | `auto_compatibility_test.yml` | Check Colossal-AI's compatiblity against the PyTorch and CUDA version specified in `.compatibility`. Triggered when `version.txt` is changed in a PR. | - -### Manual Dispatch +| `Release bdist wheel` | `release_bdist.yml` | Build binary wheels with pre-built PyTorch extensions. | -| Workflow Name | File name | Description | -| ---------------------------- | -------------------------------- | ------------------------------------------------------ | -| `Release bdist wheel` | `release_bdist.yml` | Build binary wheels with pre-built PyTorch extensions. | -| `Dispatch Example Test` | `dispatch_example_check.yml` | Manually test a specified example. | -| `Dispatch Compatiblity Test` | `dispatch_compatiblity_test.yml` | Test PyTorch and Python Compatibility. | - -Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow. -I will provide the details of each workflow below. #### Release bdist wheel @@ -76,26 +96,13 @@ Parameters: - `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda). - `ref`: input the branch or tag name to build the wheel for this ref. -#### Dispatch Example Test - -parameters: -- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work. 
- - -#### Compatibility Test - -Parameters: -- `torch version`:torch version to test against, multiple versions are supported but must be separated by comma. The default is value is all, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels). -- `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda). - -> It only test the compatiblity of the main branch - - ### User Friendliness -| Workflow Name | File name | Description | -| ----------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | -| `issue-translate` | `translate_comment.yml` | This workflow is triggered when a new issue comment is created. The comment will be translated into English if not written in English. | +| Workflow Name | File name | Description | +| ----------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| `issue-translate` | `translate_comment.yml` | This workflow is triggered when a new issue comment is created. The comment will be translated into English if not written in English. | +| `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. | +| `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. 
+ | ## Configuration
diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml
new file mode 100644
index 000000000000..620d4771af55
--- /dev/null
+++ b/.github/workflows/example_check_on_dispatch.yml
@@ -0,0 +1,64 @@
+name: Test Example on Dispatch
+on:
+  workflow_dispatch:
+    inputs:
+      example_directory:
+        type: string
+        description: example directory, separated by comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
+        required: true
+
+jobs:
+  matrix_preparation:
+    # The pull_request context is empty on workflow_dispatch, so gate on the repository and event instead.
+    if: |
+      github.repository == 'hpcaitech/ColossalAI' &&
+      github.event_name == 'workflow_dispatch'
+    name: Check the examples user want
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - name: 📚 Checkout
+        uses: actions/checkout@v3
+      - name: Set up matrix
+        id: set-matrix
+        env:
+          check_dir: ${{ inputs.example_directory }}
+        run: |
+          res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir`
+          # NOTE(review): assumes the validation script prints "failure" for an unknown directory - confirm.
+          if [ "$res" == "failure" ]; then exit 1; fi
+          # JSON-quote every comma-separated entry so that fromJson can parse the matrix in the next job.
+          dirs=$(python -c 'import json, sys; print(json.dumps([d.strip(" \"") for d in sys.argv[1].split(",") if d.strip(" \"")]))' "$check_dir")
+          echo "Testing examples in $dirs"
+          echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
+
+  test_example:
+    # Run only for manual dispatches in the upstream repository.
+    if: |
+      github.repository == 'hpcaitech/ColossalAI' &&
+      github.event_name == 'workflow_dispatch'
+    name: Manually check example files
+    needs: matrix_preparation
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/examples-data:/data/
+    timeout-minutes: 10
+    steps:
+      - name: 📚 Checkout
+        uses: actions/checkout@v3
+      - name: Install Colossal-AI
+        run: |
+          pip install -v .
+      - name: Test the example
+        run: |
+          dir=${{ matrix.directory }}
+          echo "Testing ${dir} now"
+          cd "${PWD}/examples/${dir}"
+          bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
new file mode 100644
index 000000000000..723bce568a55
--- /dev/null
+++ b/.github/workflows/example_check_on_pr.yml
@@ -0,0 +1,91 @@
+name: Test Example on PR
+on:
+  pull_request:
+    # any change in the examples folder will trigger check for the corresponding example.
+    paths:
+      - 'examples/**'
+
+jobs:
+  # This is for changed example files detect and output a matrix containing all the corresponding directory name.
+  detect-changed-example:
+    if: |
+      github.event.pull_request.draft == false &&
+      github.base_ref == 'main' &&
+      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.setup-matrix.outputs.matrix }}
+      anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
+    name: Detect changed example files
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Locate base commit
+        id: locate-base-sha
+        run: |
+          curBranch=$(git rev-parse --abbrev-ref HEAD)
+          commonCommit=$(git merge-base origin/main $curBranch)
+          echo $commonCommit
+          echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
+
+      - name: Get all changed example files
+        id: changed-files
+        uses: tj-actions/changed-files@v35
+        with:
+          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
+
+      - name: setup matrix
+        id: setup-matrix
+        run: |
+          changedFileName=""
+          for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
+            changedFileName="${file}:${changedFileName}"
+          done
+          echo "$changedFileName was changed"
+          res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
+          echo "All changed examples are $res"
+
+          if [ "$res" = "[]" ]; then
+            echo "anyChanged=false" >> $GITHUB_OUTPUT
+            echo "matrix=null" >> $GITHUB_OUTPUT
+          else
+            dirs=$( IFS=',' ; echo "${res[*]}" )
+            echo "anyChanged=true" >> $GITHUB_OUTPUT
+            echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
+          fi
+
+  # If no file is changed, it will prompt an error and shows the matrix do not have value.
+  check-changed-example:
+    # Add this condition to avoid executing this job if the trigger event is workflow_dispatch.
+    if: |
+      github.event.pull_request.draft == false &&
+      github.base_ref == 'main' &&
+      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' &&
+      needs.detect-changed-example.outputs.anyChanged == 'true'
+    name: Test the changed example
+    needs: detect-changed-example
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/examples-data:/data/
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install Colossal-AI
+        run: |
+          pip install -v .
+
+      - name: Test the example
+        run: |
+          example_dir=${{ matrix.directory }}
+          cd "${PWD}/examples/${example_dir}"
+          bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
\ No newline at end of file
diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml
new file mode 100644
index 000000000000..07424ecbede2
--- /dev/null
+++ b/.github/workflows/example_check_on_schedule.yml
@@ -0,0 +1,57 @@
+name: Test Example on Schedule
+on:
+  # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
+  schedule:
+    - cron: '0 16 * * 6'
+
+jobs:
+  # This is for all files' weekly check. Specifically, this job is to find all the directories.
+  matrix_preparation:
+    if: |
+      github.repository == 'hpcaitech/ColossalAI' &&
+      github.event_name == 'schedule'
+    name: Prepare matrix for weekly check
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.setup-matrix.outputs.matrix }}
+    steps:
+      - name: 📚 Checkout
+        uses: actions/checkout@v3
+
+      - name: setup matrix
+        id: setup-matrix
+        run: |
+          res=`python .github/workflows/scripts/example_checks/check_example_weekly.py`
+          all_loc=$( IFS=',' ; echo "${res[*]}" )
+          echo "Found the examples: $all_loc"
+          echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT
+
+  weekly_check:
+    if: |
+      github.repository == 'hpcaitech/ColossalAI' &&
+      github.event_name == 'schedule'
+    name: Weekly check all examples
+    needs: matrix_preparation
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+    timeout-minutes: 10
+    steps:
+      - name: 📚 Checkout
+        uses: actions/checkout@v3
+
+      - name: Install Colossal-AI
+        run: |
+          pip install -v .
+
+      - name: Traverse all files
+        run: |
+          example_dir=${{ matrix.directory }}
+          echo "Testing ${example_dir} now"
+          cd "${PWD}/examples/${example_dir}"
+          bash test_ci.sh
+        env:
+          NCCL_SHM_DISABLE: 1
From 8a771234b5d864a6e302d9ebbece38baf9b743c1 Mon Sep 17 00:00:00 2001
From: FrankLeeeee
Date: Fri, 3 Feb 2023 10:44:12 +0800
Subject: [PATCH 2/2] polish yaml

---
 .github/workflows/example_check_on_pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
index 723bce568a55..ebc2a277c1de 100644
--- a/.github/workflows/example_check_on_pr.yml
+++ b/.github/workflows/example_check_on_pr.yml
@@ -88,4 +88,4 @@ jobs:
           cd "${PWD}/examples/${example_dir}"
           bash test_ci.sh
         env:
-          NCCL_SHM_DISABLE: 1
\ No newline at end of file
+          NCCL_SHM_DISABLE: 1