diff --git a/.github/workflows/build-skip.yml b/.github/workflows/build-skip.yml deleted file mode 100644 index a983d6dda4..0000000000 --- a/.github/workflows/build-skip.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: "build" -on: - pull_request: - paths-ignore: - - "include/**" - - "cmake/**" - - "config/**" - - "python/**" - - "src/**" - - ".github/workflows/helpers/install_dependencies.sh" - - ".github/workflows/build.yml" - workflow_dispatch: -concurrency: - group: build-skip-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - cmake-build: - name: Build FlexFlow with CMake - runs-on: ubuntu-20.04 - strategy: - matrix: - gpu_backend: ["cuda", "hip_rocm"] - fail-fast: false - steps: - - run: 'echo "No build required"' - - makefile-build: - name: Build FlexFlow with the Makefile - runs-on: ubuntu-20.04 - steps: - - run: 'echo "No build required"' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 9e44a59720..0000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,160 +0,0 @@ -name: "build" -on: - pull_request: - paths: - - "include/**" - - "cmake/**" - - "config/**" - - "python/**" - - "src/**" - - ".github/workflows/helpers/install_dependencies.sh" - - ".github/workflows/build.yml" - push: - branches: - - "master" - paths: - - "include/**" - - "cmake/**" - - "config/**" - - "python/**" - - "src/**" - - ".github/workflows/helpers/install_dependencies.sh" - - ".github/workflows/build.yml" - workflow_dispatch: -concurrency: - group: build-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - cmake-build: - name: Build FlexFlow with CMake - runs-on: ubuntu-20.04 - defaults: - run: - shell: bash -l {0} # required to use an activated conda environment - strategy: - matrix: - gpu_backend: ["cuda", "hip_rocm"] - fail-fast: false - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Free additional space on runner - run: .github/workflows/helpers/free_space_on_runner.sh - - - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.8 - id: cuda-toolkit - with: - cuda: "11.1.1" - # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement - use-github-cache: "false" - - - name: Install system dependencies - run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh - - - name: Install conda and FlexFlow dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: flexflow - environment-file: conda/environment.yml - auto-activate-base: false - - - name: Build FlexFlow - run: | - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export FF_GPU_BACKEND=${{ matrix.gpu_backend }} - export FF_CUDA_ARCH=70 - cores_available=$(nproc --all) - n_build_cores=$(( cores_available -1 )) - if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi - mkdir build - cd build - if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_UNIT_TESTS=ON - fi - ../config/config.linux - make -j $n_build_cores - - - name: Install FlexFlow - run: | - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export FF_GPU_BACKEND=${{ matrix.gpu_backend }} - export FF_CUDA_ARCH=70 - cd build - if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_UNIT_TESTS=ON - fi - ../config/config.linux - sudo make install - sudo ldconfig - - - name: Check availability of Python flexflow.core module - if: ${{ matrix.gpu_backend == 'cuda' }} - run: | - python -c "import flexflow.core; exit()" - - - name: Run C++ unit tests - if: ${{ matrix.gpu_backend == 'cuda' }} - run: | - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$CUDA_DIR/lib64/stubs:$LD_LIBRARY_PATH - sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 - cd build - ./tests/unit/unit-test - - makefile-build: - name: Build FlexFlow with the Makefile - runs-on: ubuntu-20.04 - defaults: - run: - shell: bash -l {0} # required to use an activated conda environment - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Free additional space on runner - run: .github/workflows/helpers/free_space_on_runner.sh - - - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.8 - id: cuda-toolkit - with: - cuda: "11.1.1" - use-github-cache: "false" - - - name: Install system dependencies - run: .github/workflows/helpers/install_dependencies.sh - - - name: Install conda and FlexFlow dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: flexflow - environment-file: conda/environment.yml - auto-activate-base: false - - - name: Build FlexFlow - run: | - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - cores_available=$(nproc --all) - n_build_cores=$(( cores_available -1 )) - if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi - - cd python - make -j $n_build_cores - python -c 'import flexflow.core' diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index fb93fd6b5b..672644388c 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -5,20 +5,15 @@ jobs: name: Formatting Check runs-on: ubuntu-latest strategy: - fail-fast: false + fail-fast: true matrix: path: - - check: "lib/compiler" - - check: "lib/ffi" - - check: "lib/kernels" - - check: "lib/op-attrs" - - check: "lib/pcg" - - check: "lib/runtime" - - check: "lib/substitutions" - - check: "lib/utils" + - check: "lib" - check: "tests" - check: "examples" - check: "bindings" + - check: "bin" + exclude: '\.proto$' steps: - uses: actions/checkout@v2 - name: Run clang-format style check for C/C++/Protobuf programs. diff --git a/.github/workflows/docker-build-skip.yml b/.github/workflows/docker-build-skip.yml deleted file mode 100644 index a09979283f..0000000000 --- a/.github/workflows/docker-build-skip.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: "docker-build" -on: - pull_request: - paths-ignore: - - "docker/**" - - "!docker/README.md" - - ".github/workflows/docker-build.yml" - workflow_dispatch: - -# Cancel outdated workflows if they are still running -concurrency: - group: docker-build-skip-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - docker-build: - name: Build and Install FlexFlow in a Docker Container - runs-on: ubuntu-20.04 - strategy: - matrix: - gpu_backend: ["cuda", "hip_rocm"] - fail-fast: false - steps: - - run: 'echo "No docker-build required"' diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml deleted file mode 100644 index d54750c9d4..0000000000 --- a/.github/workflows/docker-build.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: "docker-build" -on: - pull_request: - paths: - - "docker/**" - - "!docker/README.md" - - ".github/workflows/docker-build.yml" - push: - branches: - - "master" - schedule: - # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated - - cron: "0 8 * * 0" - workflow_dispatch: - -# Cancel outdated workflows if they are still running -concurrency: - group: docker-build-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - docker-build: - name: Build and Install FlexFlow in a Docker Container - runs-on: ubuntu-20.04 - strategy: - matrix: - gpu_backend: ["cuda", "hip_rocm"] - fail-fast: false - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Free additional space on runner - run: .github/workflows/helpers/free_space_on_runner.sh - - - name: Build Docker container - env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} - run: | - # On push to master, build for all compatible architectures, so that we can publish - # a pre-built general-purpose image. On all other cases, only build for one architecture - # to save time. - if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then - export FF_CUDA_ARCH=all - else - export FF_CUDA_ARCH=70 - fi - ./docker/build.sh flexflow - - - name: Check availability of Python flexflow.core module - if: ${{ matrix.gpu_backend == 'cuda' }} - run: docker run --entrypoint python flexflow-cuda:latest -c "import flexflow.core; exit()" - - - name: Publish Docker environment image (on push to master) - if: github.repository_owner == 'flexflow' - env: - FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} - run: | - if [[ ( ${{ github.event_name }} == 'push' || ${{ github.event_name }} == 'schedule' ) && ${GITHUB_REF#refs/heads/} == "master" ]]; then - ./docker/publish.sh "flexflow-environment-${FF_GPU_BACKEND}" - ./docker/publish.sh "flexflow-${FF_GPU_BACKEND}" - else - echo "No need to update Docker containers in ghrc.io registry at this time." - fi - - notify-slack: - name: Notify Slack in case of failure - runs-on: ubuntu-20.04 - needs: docker-build - if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }} - steps: - - name: Send Slack message - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - run: | - curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! :x: \"}" $SLACK_WEBHOOK diff --git a/.github/workflows/gpu-ci-daemon.yml b/.github/workflows/gpu-ci-daemon.yml deleted file mode 100644 index 603b44c34e..0000000000 --- a/.github/workflows/gpu-ci-daemon.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: "gpu-ci-daemon" -on: - schedule: - # Run every 5 mins - - cron: "*/5 * * * *" - workflow_dispatch: - -concurrency: - group: gpu-ci-daemon-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - gpu-ci-daemon: - name: GPU CI Daemon - # Prevent Github from running the workflow on forks - if: github.repository_owner == 'flexflow' - runs-on: ubuntu-20.04 - env: - FLEXFLOW_TOKEN: ${{ secrets.FLEXFLOW_TOKEN }} - FLEXFLOW_RUNNER_INSTANCE_ID: ${{ secrets.FLEXFLOW_RUNNER_INSTANCE_ID }} - - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-2 - - - name: Run daemon - run: | - pip3 install pip --upgrade - pip3 install pyopenssl --upgrade - pip3 install pygithub - python3 .github/workflows/helpers/gpu_ci_helper.py --daemon diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml deleted file mode 100644 index 012302a57f..0000000000 --- a/.github/workflows/gpu-ci-skip.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: "gpu-ci" -on: - pull_request: - paths-ignore: - - "cmake/**" - - "config/**" - - "python/**" - - "setup.py" - - "include/**" - - "src/**" - - ".github/workflows/gpu-ci.yml" - - "tests/multi_gpu_tests.sh" - workflow_dispatch: - -concurrency: - group: gpu-ci-skip-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - gpu-ci-concierge: - name: GPU CI Concierge - runs-on: ubuntu-20.04 - steps: - - run: 'echo "No gpu-ci required"' - - python-interface-check: - name: Check Python Interface - runs-on: ubuntu-20.04 - needs: gpu-ci-concierge - steps: - - run: 'echo "No gpu-ci required"' - - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests - runs-on: ubuntu-20.04 - needs: gpu-ci-concierge - steps: - - run: 'echo "No gpu-ci required"' diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml deleted file mode 100644 index d524ee4115..0000000000 --- a/.github/workflows/gpu-ci.yml +++ /dev/null @@ -1,163 +0,0 @@ -name: "gpu-ci" -on: - pull_request: - paths: - - "cmake/**" - - "config/**" - - "python/**" - - "setup.py" - - "include/**" - - "src/**" - - ".github/workflows/gpu-ci.yml" - - "tests/cpp_gpu_tests.sh" - - "tests/multi_gpu_tests.sh" - - "tests/python_interface_test.sh" - push: - branches: - - "master" - paths: - - "cmake/**" - - "config/**" - - "python/**" - - "setup.py" - - "include/**" - - "src/**" - - ".github/workflows/gpu-ci.yml" - - "tests/cpp_gpu_tests.sh" - - "tests/multi_gpu_tests.sh" - - "tests/python_interface_test.sh" - workflow_dispatch: - -concurrency: - group: gpu-ci-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - gpu-ci-concierge: - name: GPU CI Concierge - runs-on: ubuntu-20.04 - env: - FLEXFLOW_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - - - name: Wait for daemon to be done - run: | - pip3 install pip --upgrade - pip3 install pyopenssl --upgrade - pip3 install pygithub - python3 .github/workflows/helpers/gpu_ci_helper.py - - python-interface-check: - name: Check Python Interface - runs-on: self-hosted - defaults: - run: - shell: bash -l {0} # required to use an activated conda environment - env: - CONDA: "3" - needs: gpu-ci-concierge - container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest - options: --gpus all --shm-size=8192m - steps: - - name: Install updated git version - run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Install conda and FlexFlow dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - miniconda-version: "latest" - activate-environment: flexflow - environment-file: conda/flexflow-cpu.yml - auto-activate-base: false - - - name: Install conda and Pytorch dependencies for pytorch alignment test - run: | - conda env create -f conda/pytorch-gpu.yml - - - name: Build FlexFlow - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - mkdir build - cd build - ../config/config.linux - make -j - - - name: Check FlexFlow Python interface (before installation) - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - ./tests/python_interface_test.sh before-installation - - - name: Install FlexFlow - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - cd build - ../config/config.linux - make install - ldconfig - - - name: Check FlexFlow Python interface (after installation) - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - ./tests/python_interface_test.sh after-installation - - - name: Run flexflow alignment with pytorch - run: | - # run alingment tests - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - ./tests/align/test_all_operators.sh - - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests - runs-on: self-hosted - needs: gpu-ci-concierge - container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest - options: --gpus all --shm-size=8192m - steps: - - name: Install updated git version - run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Build and Install FlexFlow - run: | - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export FF_BUILD_ALL_EXAMPLES=ON - pip install . --verbose - - - name: Check FlexFlow Python interface (pip) - run: | - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - ./tests/python_interface_test.sh after-installation - - - name: Run multi-gpu tests - run: | - export PATH=/opt/conda/bin:$PATH - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - # C++ tests - ./tests/cpp_gpu_tests.sh 4 - # Python tests - ./tests/multi_gpu_tests.sh 4 - diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml deleted file mode 100644 index cfe3629211..0000000000 --- a/.github/workflows/multinode-test.yml +++ /dev/null @@ -1,191 +0,0 @@ -name: "multinode-test" -on: - schedule: - # Run every other day (Monday, Wednesday, Friday, and Saturday) at midnight PT (3am ET / 8am UTC) - - cron: "0 8 * * 1,3,5,6" - workflow_dispatch: - -concurrency: - group: multinode-test-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - gpu-ci-concierge: - name: GPU CI Concierge - # Prevent Github from running the workflow on forks - if: github.repository_owner == 'flexflow' - runs-on: ubuntu-20.04 - env: - FLEXFLOW_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - - - name: Wait for daemon to be done - run: | - pip3 install pip --upgrade - pip3 install pyopenssl --upgrade - pip3 install pygithub - python3 .github/workflows/helpers/gpu_ci_helper.py - - multinode-gpu-test-mpi: - name: Multinode GPU Test with MPI - # Prevent Github from running the workflow on forks - if: github.repository_owner == 'flexflow' - runs-on: self-hosted - needs: gpu-ci-concierge - # 10h timeout, instead of default of 360min (6h) - timeout-minutes: 600 - container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest - options: --gpus all --shm-size=8192m - steps: - - name: Install updated git version - run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Install MPI - run: sudo apt-get install -y --no-install-recommends openmpi-bin openmpi-common libopenmpi-dev - - - name: Build and Install FlexFlow - run: | - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export FF_LEGION_NETWORKS=gasnet - export FF_GASNET_CONDUIT=mpi - pip install . --verbose - - - name: Check FlexFlow Python interface (pip) - run: | - export FF_HOME=$(pwd) - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - ./tests/python_interface_test.sh after-installation - - - name: Run multi-gpu tests - run: | - export PATH=/opt/conda/bin:$PATH - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - export OMPI_ALLOW_RUN_AS_ROOT=1 - export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 - export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 - - multinode-gpu-test-ucx: - name: Multinode GPU Test with UCX - # Prevent Github from running the workflow on forks - if: github.repository_owner == 'flexflow' - runs-on: self-hosted - needs: gpu-ci-concierge - container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest - options: --gpus all --shm-size=8192m - # 10h timeout, instead of default of 360min (6h) - timeout-minutes: 600 - steps: - - name: Install updated git version - run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Install MPI - run: sudo apt-get install -y --no-install-recommends openmpi-bin openmpi-common libopenmpi-dev - - - name: Build and Install FlexFlow - run: | - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export FF_LEGION_NETWORKS=gasnet - export FF_GASNET_CONDUIT=ucx - pip install . --verbose - - - name: Check FlexFlow Python interface (pip) - run: | - export FF_HOME=$(pwd) - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - ./tests/python_interface_test.sh after-installation - - - name: Run multi-gpu tests - run: | - export PATH=/opt/conda/bin:$PATH - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - export OMPI_ALLOW_RUN_AS_ROOT=1 - export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 - export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 - - multinode-gpu-test-native-ucx: - name: Multinode GPU Test with native UCX - # Prevent Github from running the workflow on forks - if: github.repository_owner == 'flexflow' - runs-on: self-hosted - needs: gpu-ci-concierge - container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest - options: --gpus all --shm-size=8192m - steps: - - name: Install updated git version - run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Install MPI - run: sudo apt-get install -y --no-install-recommends openmpi-bin openmpi-common libopenmpi-dev - - - name: Build and Install FlexFlow - run: | - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export FF_LEGION_NETWORKS=ucx - pip install . --verbose - - - name: Check FlexFlow Python interface (pip) - run: | - export FF_HOME=$(pwd) - export PATH=/opt/conda/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - ./tests/python_interface_test.sh after-installation - - - name: Run multi-gpu tests - run: | - export PATH=/opt/conda/bin:$PATH - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib - export OMPI_ALLOW_RUN_AS_ROOT=1 - export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 - export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 - - notify-slack: - name: Notify Slack in case of failure - runs-on: ubuntu-20.04 - needs: [multinode-gpu-test-mpi, multinode-gpu-test-ucx, multinode-gpu-test-native-ucx] - if: ${{ failure() && github.event_name == 'schedule' }} - steps: - - name: Send Slack message - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - run: | - curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly multinode GPU test failed! :x: \"}" $SLACK_WEBHOOK diff --git a/.github/workflows/per-lib-check.yml b/.github/workflows/per-lib-check.yml index 839a0f395c..f21621b265 100644 --- a/.github/workflows/per-lib-check.yml +++ b/.github/workflows/per-lib-check.yml @@ -14,8 +14,7 @@ jobs: strategy: max-parallel: 1 matrix: - gpu_backend: ["cuda","hip_rocm"] - library: ["runtime", "ffi","compiler","kernels","op-attrs","pcg","substitutions","utils"] + gpu_backend: ["cuda"] fail-fast: false steps: - name: Checkout Git Repository @@ -33,26 +32,29 @@ jobs: cuda: "12.1.0" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" + linux-local-args: '["--toolkit"]' + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 - name: Install system dependencies run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh - - name: Install conda and FlexFlow dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: flexflow - environment-file: packaging/conda/environment.yml - auto-activate-base: false + # - name: Install conda and FlexFlow dependencies + # uses: conda-incubator/setup-miniconda@v2 + # with: + # activate-environment: flexflow + # environment-file: packaging/conda/environment.yml + # auto-activate-base: false - - name: Build lib ${{ matrix.library }} + - name: Run cmake run: | export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) export FF_GPU_BACKEND=${{ matrix.gpu_backend }} export FF_CUDA_ARCH=70 - cores_available=$(nproc --all) - n_build_cores=$(( cores_available -1 )) + n_build_cores=$(( $(nproc) cores_available -1 )) if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi mkdir build cd build @@ -60,8 +62,24 @@ jobs: # export FF_BUILD_ALL_EXAMPLES=ON # export FF_BUILD_UNIT_TESTS=ON #fi - ../config/config.linux - make -j $n_build_cores ${{ matrix.library }} + ../config/config.linux -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache + - name: Build utils + run: | + cd build + make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) utils + - name: Build op-attrs + run: | + cd build + make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) op-attrs + - name: Build pcg + run: | + cd build + make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) pcg + + - name: Build kernels + run: | + cd build + make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) kernels diff --git a/.github/workflows/pip-install-skip.yml b/.github/workflows/pip-install-skip.yml deleted file mode 100644 index 68b1afb9ff..0000000000 --- a/.github/workflows/pip-install-skip.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: "pip-install" -on: - pull_request: - paths-ignore: - - "cmake/**" - - "config/**" - - "python/**" - - "setup.py" - - ".github/workflows/helpers/install_dependencies.sh" - - ".github/workflows/pip-install.yml" - workflow_dispatch: -concurrency: - group: pip-install-skip-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - pip-install-flexflow: - name: Install FlexFlow with pip - runs-on: ubuntu-20.04 - steps: - - run: 'echo "No pip-install required"' diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml deleted file mode 100644 index 48e0798008..0000000000 --- a/.github/workflows/pip-install.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: "pip-install" -on: - pull_request: - paths: - - "cmake/**" - - "config/**" - - "python/**" - - "setup.py" - - ".github/workflows/helpers/install_dependencies.sh" - - ".github/workflows/pip-install.yml" - push: - branches: - - "master" - paths: - - "cmake/**" - - "config/**" - - "python/**" - - "setup.py" - - ".github/workflows/helpers/install_dependencies.sh" - - ".github/workflows/pip-install.yml" - workflow_dispatch: -concurrency: - group: pip-install-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - pip-install-flexflow: - name: Install FlexFlow with pip - runs-on: ubuntu-20.04 - defaults: - run: - shell: bash -l {0} # required to use an activated conda environment - steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Free additional space on runner - run: .github/workflows/helpers/free_space_on_runner.sh - - - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.8 - id: cuda-toolkit - with: - cuda: "11.1.1" - # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement - use-github-cache: "false" - - - name: Install system dependencies - run: .github/workflows/helpers/install_dependencies.sh - - - name: Install conda and FlexFlow dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: flexflow - environment-file: conda/environment.yml - auto-activate-base: false - - - name: Build and Install FlexFlow - run: | - export FF_HOME=$(pwd) - export FF_CUDA_ARCH=70 - pip install . --verbose - - - name: Check availability of Python flexflow.core module - run: | - python -c "import flexflow.core; exit()" diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bbdf13b22..418a2a7538 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,18 @@ project(FlexFlow) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) +# Detect OS type and Linux version (if it applies) +set(LINUX_VERSION "") +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + find_program(LSB_RELEASE_EXEC lsb_release) + if(LSB_RELEASE_EXEC) + execute_process(COMMAND ${LSB_RELEASE_EXEC} -r --short + OUTPUT_VARIABLE LINUX_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + message(STATUS "Linux Version: ${LINUX_VERSION}") + endif() +endif() + set(FF_MAX_DIM "5" CACHE STRING "Maximum tensor order") set(FF_MAX_OPNAME "128" CACHE STRING "Maximum op name length") set(FF_MAX_NUM_OUTPUTS "256" CACHE STRING "Maximum number of outputs (per operator)") diff --git a/config/config.linux b/config/config.linux index a0771e1271..2b87ec0eb5 100755 --- a/config/config.linux +++ b/config/config.linux @@ -79,14 +79,5 @@ function get_build_configs() { BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" } -if [ -n "$1" ]; then - if [ "$1" != "get-docker-configs" ]; then - . $(dirname $0)/config.inc - # You can pass the name of the variable you want to print out as $1. This - # is used in the python setup script to get the cmake config - echo "${!1}" - fi -else - . $(dirname $0)/config.inc - run_cmake $* -fi +. $(dirname $0)/config.inc +run_cmake $* diff --git a/deps/fmt b/deps/fmt index a33701196a..f5e54359df 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 +Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 diff --git a/packaging/docker/build.sh b/packaging/docker/build.sh deleted file mode 100755 index 7e8587bfdf..0000000000 --- a/packaging/docker/build.sh +++ /dev/null @@ -1,88 +0,0 @@ -#! /usr/bin/env bash -set -euo pipefail - -# Usage: ./build.sh - -# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -# Cd into $FF_HOME. Assumes this script is in $FF_HOME/docker -cd "$SCRIPT_DIR/.." - -# Get name of desired Docker image as input -image="${1:-flexflow}" -if [[ "$image" != @(flexflow-environment|flexflow) ]]; then - echo "Error, image name ${image} is invalid. Choose between 'flexflow-environment' and 'flexflow'." - exit 1 -fi - -# Set up GPU backend -FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then - echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'." - exit 1 -elif [[ "${FF_GPU_BACKEND}" != "cuda" ]]; then - echo "Configuring FlexFlow to build for gpu backend: ${FF_GPU_BACKEND}" -else - echo "Letting FlexFlow build for a default GPU backend: cuda" -fi - -# Build the FlexFlow Enviroment docker image -docker build --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" -t "flexflow-environment-${FF_GPU_BACKEND}" -f docker/flexflow-environment/Dockerfile . - -# If the user only wants to build the environment image, we are done -if [[ "$image" == "flexflow-environment" ]]; then - exit 0 -fi - -# Gather arguments needed to build the FlexFlow image -# Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking -cores_available=$(nproc --all) -n_build_cores=$(( cores_available -1 )) - -# If FF_CUDA_ARCH is set to autodetect, we need to perform the autodetection here because the Docker -# image will not have access to GPUs during the build phase (due to a Docker restriction). In all other -# cases, we pass the value of FF_CUDA_ARCH directly to Cmake. -if [[ "${FF_CUDA_ARCH:-autodetect}" == "autodetect" ]]; then - # Get CUDA architecture(s), if GPUs are available - cat << EOF > ./get_gpu_arch.cu -#include -int main() { - int count = 0; - if (cudaSuccess != cudaGetDeviceCount(&count)) return -1; - if (count == 0) return -1; - for (int device = 0; device < count; ++device) { - cudaDeviceProp prop; - if (cudaSuccess == cudaGetDeviceProperties(&prop, device)) - printf("%d ", prop.major*10+prop.minor); - } - return 0; -} -EOF - gpu_arch_codes="" - if command -v nvcc &> /dev/null - then - nvcc ./get_gpu_arch.cu -o ./get_gpu_arch - gpu_arch_codes="$(./get_gpu_arch)" - fi - gpu_arch_codes="$(echo "$gpu_arch_codes" | xargs -n1 | sort -u | xargs)" - gpu_arch_codes="${gpu_arch_codes// /,}" - rm -f ./get_gpu_arch.cu ./get_gpu_arch - - if [[ -n "$gpu_arch_codes" ]]; then - echo "Host machine has GPUs with architecture codes: $gpu_arch_codes" - echo "Configuring FlexFlow to build for the $gpu_arch_codes code(s)." - FF_CUDA_ARCH="${gpu_arch_codes}" - export FF_CUDA_ARCH - else - echo "FF_CUDA_ARCH is set to 'autodetect', but the host machine does not have any compatible GPUs." - exit 1 - fi -fi - -# Build FlexFlow Docker image -# shellcheck source=/dev/null -. config/config.linux get-docker-configs -# Set value of BUILD_CONFIGS -get_build_configs - -docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" -t "flexflow-${FF_GPU_BACKEND}" -f docker/flexflow/Dockerfile . diff --git a/packaging/docker/flexflow-environment/Dockerfile b/packaging/docker/flexflow-environment/Dockerfile deleted file mode 100644 index 061b63352b..0000000000 --- a/packaging/docker/flexflow-environment/Dockerfile +++ /dev/null @@ -1,57 +0,0 @@ -FROM nvidia/cuda:11.7.0-cudnn8-devel-ubuntu20.04 - -LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow -LABEL org.opencontainers.image.description="FlexFlow environment container" - -# Install basic dependencies -RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano libhdf5-dev && \ - rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \ - ca-certificates libssl-dev curl unzip htop && DEBIAN_FRONTEND=noninteractive \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:ubuntu-toolchain-r/test && \ - apt-get update -y && \ - apt-get upgrade -y libstdc++6 - -# Install Python3 with Miniconda -RUN wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - mv Miniconda3-latest-Linux-x86_64.sh ~/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/Miniconda3-latest-Linux-x86_64.sh && \ - bash ~/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ - rm ~/Miniconda3-latest-Linux-x86_64.sh && \ - /opt/conda/bin/conda upgrade --all && \ - /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya - -# Optionally install HIP dependencies -# Note that amd's docs say to also install the `hip-runtime-nvidia` package. This -# package attempts to re-install cuda even though cuda is already installed -# in the container. It also attempts to install packages for a graphical install. -# For our container, we don't need `hip-runtime-nvidia` -ARG FF_GPU_BACKEND "cuda" -RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \ - wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/bionic/amdgpu-install_22.20.50205-1_all.deb; \ - apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb; \ - rm ./amdgpu-install_22.20.50205-1_all.deb; \ - amdgpu-install -y --usecase=hip,rocm --no-dkms; \ - apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk; \ - else \ - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"; \ - fi -RUN rm -rf /var/lib/apt/lists/* - -# Set env vars -ENV PATH /opt/conda/bin:$PATH -ENV CUDNN_DIR /usr/local/cuda -ENV CUDA_DIR /usr/local/cuda - -# Install python packages and other dependencies -RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing -# Install CPU-only Pytorch and related dependencies -RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch -RUN conda install -c conda-forge onnx transformers sentencepiece -RUN pip3 install tensorflow - -ENTRYPOINT ["/bin/bash"] diff --git a/packaging/docker/flexflow/Dockerfile b/packaging/docker/flexflow/Dockerfile deleted file mode 100644 index 06e69ba4d3..0000000000 --- a/packaging/docker/flexflow/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -ARG FF_GPU_BACKEND "cuda" -FROM flexflow-environment-$FF_GPU_BACKEND:latest - -LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow -LABEL org.opencontainers.image.description="FlexFlow container" - -# Copy FlexFlow repository -RUN mkdir FlexFlow -ENV FF_HOME /usr/FlexFlow -WORKDIR ${FF_HOME} -COPY . . - -# Args to build FlexFlow -ARG BUILD_CONFIGS -ARG N_BUILD_CORES - -# Build and install C++ and Python versions of FlexFlow -RUN mkdir -p build && cd build && \ - eval "$BUILD_CONFIGS" ../config/config.linux && \ - make -j $N_BUILD_CORES && \ - eval "$BUILD_CONFIGS" ../config/config.linux && \ - make install && \ - ldconfig - -ENTRYPOINT ["/bin/bash"] diff --git a/run.sh b/run.sh deleted file mode 100644 index 108a859a7f..0000000000 --- a/run.sh +++ /dev/null @@ -1,3 +0,0 @@ - - -cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DFF_CUDA_ARCH=75 -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_CXX_COMPILER="clang++" -DCMAKE_C_COMPILER="clang" -DCMAKE_CUDA_COMPILER="clang++" -DCMAKE_CUDA_HOST_COMPILER="clang++" .. \ No newline at end of file