diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index d112d61dd91d..291d6adac2b2 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -61,7 +61,7 @@ jobs: run: shell: bash concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-prepare-cache cancel-in-progress: true steps: - name: Copy testmon cache @@ -87,7 +87,7 @@ jobs: anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }} runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change cancel-in-progress: true steps: - uses: actions/checkout@v2 @@ -147,7 +147,7 @@ jobs: run: shell: bash concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test cancel-in-progress: true steps: - name: Checkout TensorNVMe @@ -208,7 +208,7 @@ jobs: - name: Execute Unit Testing run: | - CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/ + CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. 
--durations=10 tests/ env: DATA: /data/scratch/cifar-10 NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 1778d64ee287..2f03c8ced98d 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -44,7 +44,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] + runs-on: [self-hosted, 8-gpu] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} @@ -64,7 +64,7 @@ jobs: - name: Install tensornvme run: | cd TensorNVMe - conda install cmake + apt update && apt install -y cmake pip install -r requirements.txt pip install -v . - uses: actions/checkout@v2 @@ -83,8 +83,7 @@ jobs: fi - name: Install Colossal-AI run: | - pip install -r requirements/requirements.txt - pip install -v --no-cache-dir . + CUDA_EXT=1 pip install -v . 
pip install -r requirements/requirements-test.txt - name: Unit Testing run: | diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 0aa9dffeb632..a621c7e3427d 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -13,7 +13,7 @@ jobs: outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-prepare-matrix cancel-in-progress: true steps: - uses: actions/checkout@v3 @@ -35,7 +35,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] + runs-on: [self-hosted, 8-gpu] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} @@ -44,7 +44,7 @@ jobs: options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 timeout-minutes: 120 concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }} cancel-in-progress: true steps: - name: Install dependencies @@ -58,7 +58,7 @@ jobs: - name: Install tensornvme run: | cd TensorNVMe - conda install cmake + apt update && apt install -y cmake pip install -r requirements.txt pip install -v . - uses: actions/checkout@v2 @@ -78,7 +78,7 @@ jobs: - name: Install Colossal-AI run: | - pip install -v --no-cache-dir . + CUDA_EXT=1 pip install -v . 
pip install -r requirements/requirements-test.txt - name: Unit Testing run: | diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 15ac4f1a92bb..9933224f5675 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -32,7 +32,7 @@ jobs: name: Test for PyTorch Compatibility needs: matrix_preparation if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] + runs-on: [self-hosted, 8-gpu] strategy: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} @@ -54,7 +54,7 @@ jobs: - name: Install tensornvme run: | cd TensorNVMe - conda install cmake + apt update && apt install -y cmake pip install -r requirements.txt pip install -v . - uses: actions/checkout@v2 @@ -75,7 +75,7 @@ jobs: - name: Install Colossal-AI run: | - pip install -v --no-cache-dir . + CUDA_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt - name: Unit Testing diff --git a/.github/workflows/doc_check_on_pr.yml b/.github/workflows/doc_check_on_pr.yml index ae9e311649f7..ee8a82128dd7 100644 --- a/.github/workflows/doc_check_on_pr.yml +++ b/.github/workflows/doc_check_on_pr.yml @@ -17,7 +17,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-i18n cancel-in-progress: true steps: - uses: actions/checkout@v2 @@ -35,7 +35,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-doc cancel-in-progress: true steps: - uses: 
actions/checkout@v2 diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index bf9ed64c8a7e..a3df2c50e6d3 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -20,7 +20,7 @@ jobs: any_changed: ${{ steps.changed-files.outputs.any_changed }} changed_files: ${{ steps.changed-files.outputs.all_changed_files }} concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change cancel-in-progress: true name: Detect changed example files steps: @@ -63,7 +63,7 @@ jobs: run: shell: bash concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-doctest cancel-in-progress: true steps: - name: Checkout ColossalAI-Documentation diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index d990a76ca6db..ec23b9d1c59f 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -21,7 +21,7 @@ jobs: anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }} name: Detect changed example files concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change cancel-in-progress: true steps: - uses: actions/checkout@v3 @@ -81,7 +81,7 @@ jobs: options: --gpus all --rm -v /data/scratch/examples-data:/data/ timeout-minutes: 10 concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }} cancel-in-progress: true steps: - uses: actions/checkout@v3 diff --git 
a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index 650689498fda..a336526897e2 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -28,9 +28,8 @@ jobs: - name: Checkout ColossalAI uses: actions/checkout@v2 - - name: Install ColossalAI and ChatGPT + - name: Install ChatGPT run: | - pip install -e . cd applications/Chat pip install -v . pip install -r examples/requirements.txt diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml index 47c80fc9a9fe..ec5c8ffa319f 100644 --- a/.github/workflows/run_chatgpt_unit_tests.yml +++ b/.github/workflows/run_chatgpt_unit_tests.yml @@ -30,9 +30,8 @@ jobs: - name: Checkout ColossalAI uses: actions/checkout@v2 - - name: Install ColossalAI and ChatGPT + - name: Install ChatGPT run: | - pip install -e . cd applications/Chat pip install -v . pip install -r requirements-test.txt diff --git a/LICENSE b/LICENSE index c7a5bb16880e..06629068faa5 100644 --- a/LICENSE +++ b/LICENSE @@ -396,3 +396,35 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------- LICENSE FOR VLLM TEAM ---------------- + + from VLLM TEAM: + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://github.com/vllm-project/vllm/blob/main/LICENSE + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + ---------------- LICENSE FOR LIGHTLLM TEAM ---------------- + + from LIGHTLLM TEAM: + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://github.com/ModelTC/lightllm/blob/main/LICENSE + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 44e4f97f1f4e..42549ac55873 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ ## Latest News +* [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training) * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth) * [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining) * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) @@ -50,7 +51,7 @@
  • Parallel Training Demo