diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0dc1262b008c..d67cd9fcc9ac 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -3,6 +3,7 @@ - [ ] I have created an issue for this PR for traceability - [ ] The title follows the standard format: `[doc/gemini/tensor/...]: A concise description` - [ ] I have added relevant tags if possible for us to better distinguish different PRs +- [ ] I have installed pre-commit: `pip install pre-commit && pre-commit install` ## 🚨 Issue number diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 2cad504f3391..5bdadca783b3 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -117,7 +117,7 @@ jobs: cd TensorNVMe conda install cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - name: Store TensorNVMe Cache run: | diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 3ff19b37b4bf..e560d0c004b1 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -44,7 +44,7 @@ jobs: cd TensorNVMe conda install cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 if: steps.check-avai.outputs.avai == 'true' diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 76493880651c..95a94c27bfd5 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -66,7 +66,7 @@ jobs: cd TensorNVMe apt update && apt install -y cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index f582b30907bf..aef4816efcfe 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -60,7 +60,7 @@ jobs: cd TensorNVMe apt update && apt install -y cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 3348b51ecc6e..3dc8a5a328a6 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -56,7 +56,7 @@ jobs: cd TensorNVMe apt update && apt install -y cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index 24e726b4f16d..d877b06cee1c 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -46,7 +46,7 @@ jobs: matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/examples-data:/data/ + options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 15 steps: - name: 📚 Checkout @@ -60,5 +60,3 @@ jobs: echo "Testing ${dir} now" cd "${PWD}/examples/${dir}" bash test_ci.sh - env: - NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index 728f059c1bb3..6170628e10a9 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -78,7 +78,7 @@ jobs: matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/examples-data:/data/ + options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 20 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }} @@ -95,5 +95,3 @@ jobs: example_dir=${{ matrix.directory }} cd "${PWD}/examples/${example_dir}" bash test_ci.sh - env: - NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index efb131a864cb..2588ac8243d9 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -35,6 +35,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 10 steps: - name: 📚 Checkout @@ -50,8 +51,6 @@ jobs: echo "Testing ${example_dir} now" cd "${PWD}/examples/${example_dir}" bash test_ci.sh - env: - NCCL_SHM_DISABLE: 1 - name: Notify Lark id: message-preparation diff --git a/.github/workflows/post_commit.yml b/.github/workflows/post_commit.yml deleted file mode 100644 index 1bbc0d2f5c34..000000000000 --- a/.github/workflows/post_commit.yml +++ /dev/null @@ -1,97 +0,0 @@ -name: post-commit - -on: - pull_request: - types: - - closed - -jobs: - # this job will run after a PR is merged to run pre-commit on any changed file - # so that the user does not need to learn pre-commit and pre-commit can still - # be auto-executed by the workflow - pre-commit: - runs-on: ubuntu-latest - if: github.event.pull_request.merged == true && github.repository == 'hpcaitech/ColossalAI' - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.sha }} - - # the PR branch and the hpcaitech/colossal-ai main branch - # must share a common commit, we need to locate that commit, - # which is the commit checked-out or forked when the PR branch is created - # such that we can look for files changed since that commit - - name: Locate base commit - id: locate-base-sha - run: | - curBranch=$(git rev-parse --abbrev-ref HEAD) - commonCommit=$(git merge-base origin/main $curBranch) - echo $commonCommit - echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT - - - name: Find the changed files - id: find-changed-files - uses: tj-actions/changed-files@v35 - with: - base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }} - - - name: List all changed files - run: | - for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do - echo "$file was changed" - done - - # check out the main branch - - uses: actions/checkout@v2 - with: - ref: 'main' - - - uses: actions/setup-python@v3 - - - name: Cache pre-commit hooks - uses: actions/cache@v3 - with: - path: ~/.cache/pre-commit - key: ${{ runner.os }}-pre-commit-hooks - - - name: Set up pre-commit - run: | - pip install pre-commit - pre-commit install - - # run pre-commit on changed files - - name: Run Pre-commit - run: | - for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do - pre-commit run --files $file || true - done - - # create commit for pre-commit - # when all files are well formatted, there is no need to create a commit - # therefore, this step will produce an error, which should be allowed - - name: Create commits - id: commit - continue-on-error: true - run: | - git config --global user.name 'github-actions' - git config --global user.email 'github-actions@github.com' - git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }} - git add -A - git commit -am "[format] applied code formatting on changed files in pull request ${{ github.event.pull_request.number }}" - - # create pull request - - name: Create Pull Request - if: steps.commit.outcome == 'success' - id: cpr - uses: peter-evans/create-pull-request@v4 - with: - branch: pre-commit-${{ github.event.pull_request.number }} - title: "[format] applied code formatting on changed files in PR ${{ github.event.pull_request.number }}" - - - name: Enable Auto-merge for the New PR - if: steps.commit.outcome == 'success' - uses: peter-evans/enable-pull-request-automerge@v2 - with: - pull-request-number: ${{ steps.cpr.outputs.pull-request-number }} - merge-method: squash diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index bb0ceb4a8296..ba997f144cd7 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -19,8 +19,8 @@ jobs: runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb - timeout-minutes: 30 + options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb + timeout-minutes: 60 defaults: run: shell: bash @@ -28,26 +28,35 @@ jobs: - name: Checkout ColossalAI uses: actions/checkout@v2 + - name: Install Colossal-AI + run: | + BUILD_EXT=1 pip install -v -e . + - name: Install ChatGPT run: | - cd applications/Chat + cd applications/ColossalChat pip install -v . + export BUILD_EXT=1 pip install -r examples/requirements.txt - name: Install Transformers run: | - pip install transformers==4.30.2 + pip install transformers==4.34.1 - name: Execute Examples run: | - cd applications/Chat + cd applications/ColossalChat rm -rf ~/.cache/colossalai - ./tests/test_inference.sh - ./tests/test_benchmarks.sh + mkdir models + mkdir sft_data + mkdir prompt_data + mkdir preference_data + ./tests/test_data_preparation.sh ./tests/test_train.sh env: NCCL_SHM_DISABLE: 1 MAX_JOBS: 8 - SFT_DATASET: /data/scratch/github_actions/chat/data.json - PROMPT_DATASET: /data/scratch/github_actions/chat/prompts_en.jsonl - PRETRAIN_DATASET: /data/scratch/github_actions/chat/alpaca_data.json + PRETRAINED_MODEL_PATH: ./models + SFT_DATASET: ./sft_data + PROMPT_DATASET: ./prompt_data + PREFERENCE_DATASET: ./preference_data diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml index 7986889e006b..1d8a53e4feed 100644 --- a/.github/workflows/run_chatgpt_unit_tests.yml +++ b/.github/workflows/run_chatgpt_unit_tests.yml @@ -21,7 +21,7 @@ jobs: runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt + options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data timeout-minutes: 30 defaults: run: @@ -32,15 +32,17 @@ jobs: - name: Install ChatGPT run: | - cd applications/Chat + cd applications/ColossalChat pip install -v . - pip install -r requirements-test.txt + pip install pytest - name: Execute Unit Testing run: | - cd applications/Chat + cd applications/ColossalChat rm -rf ~/.cache/colossalai pytest tests/ + cd ./tests + ./test_templating.sh env: NCCL_SHM_DISABLE: 1 MAX_JOBS: 8 diff --git a/.gitignore b/.gitignore index 81113fa99dd5..8bc74b4c8c2c 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,7 @@ coverage.xml # ignore testmon and coverage files .coverage .testmondata* + +# log, test files - ColossalChat +applications/ColossalChat/logs +applications/ColossalChat/tests/logs diff --git a/README.md b/README.md index 7c234b15e75e..26776bdf6d9f 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ ## Latest News +* [2024/03] [314 Billion Parameter Grok-1 Inference Accelerated by 3.8x, Efficient and Easy-to-Use PyTorch+HuggingFace version is Here](https://hpc-ai.com/blog/314-billion-parameter-grok-1-inference-accelerated-by-3.8x-efficient-and-easy-to-use-pytorchhuggingface-version-is-here) * [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0) * [2024/03] [Open-Sora:Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora) * [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer) @@ -72,6 +73,7 @@
  • Inference