From a61b08f0fe4704fb4f6b434617581632ff35fab9 Mon Sep 17 00:00:00 2001
From: FrankLeeeee
Date: Sun, 5 Feb 2023 17:31:56 +0800
Subject: [PATCH 1/3] [workflow] added notification if scheduled build fails

---
 .github/workflows/build_on_schedule.yml       | 36 ++++++++++++++++++++++++++++++++++--
 .../workflows/scripts/send_message_to_lark.py | 20 ++++++++++++++++++++
 test.sh                                       |  6 ++++++
 3 files changed, 60 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/scripts/send_message_to_lark.py
 create mode 100644 test.sh

diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index ea1f4879ce51..c75ff16a930c 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -21,25 +21,57 @@ jobs:
           repository: hpcaitech/TensorNVMe
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
           path: TensorNVMe
+
       - name: Install tensornvme
         run: |
           cd TensorNVMe
           conda install cmake
           pip install -r requirements.txt
           pip install -v .
+
       - uses: actions/checkout@v2
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+
+      - name: Check GPU Availability # ensure all GPUs have enough memory
+        id: check-avai
+        run: |
+          avai=true
+          for i in $(seq 0 7);
+          do
+            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
+            [ "$gpu_used" -gt "10000" ] && avai=false
+          done
+
+          echo "GPU is available: $avai"
+          echo "avai=$avai" >> $GITHUB_OUTPUT
+
       - name: Install Colossal-AI
+        if: steps.check-avai.outputs.avai == "true"
         run: |
           [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
           CUDA_EXT=1 pip install -v -e .
           cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
           pip install -r requirements/requirements-test.txt
+
       - name: Unit Testing
+        if: steps.check-avai.outputs.avai == "true"
         run: |
-          gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
-          [ "$gpu_used" -le "10000" ] && PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest tests
         env:
           DATA: /data/scratch/cifar-10
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+      - name: Notify Lark
+        id: message-preparation
+        if: ${{ failure() }}
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{ github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
diff --git a/.github/workflows/scripts/send_message_to_lark.py b/.github/workflows/scripts/send_message_to_lark.py
new file mode 100644
index 000000000000..a113327a786e
--- /dev/null
+++ b/.github/workflows/scripts/send_message_to_lark.py
@@ -0,0 +1,20 @@
+import argparse
+
+import requests
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m', '--message', type=str)
+    parser.add_argument('-u', '--url', type=str)
+    return parser.parse_args()
+
+
+def send_message_to_lark(message, webhook_url):
+    data = {"msg_type": "text", "content": {"text": message}}
+    requests.post(webhook_url, json=data)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    send_message_to_lark(args.message, args.url)
diff --git a/test.sh b/test.sh
new file mode 100644
index 000000000000..8dcecc6ddc55
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,6 @@
+avai=true
+for i in $(seq 0 7);
+do
+    gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
+    [ "$gpu_used" -gt "10000" ] && avai=false
+done

From 6bf6512b5c87e220c83a6974c8aa9738f4241e33 Mon Sep 17 00:00:00 2001
From: FrankLeeeee
Date: Mon, 6 Feb 2023 14:00:34 +0800
Subject: [PATCH 2/3] polish code

---
 .github/workflows/build_on_schedule.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index c75ff16a930c..3ec3cf1c95cf 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -47,7 +47,7 @@ jobs:
           echo "avai=$avai" >> $GITHUB_OUTPUT
 
       - name: Install Colossal-AI
-        if: steps.check-avai.outputs.avai == "true"
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
           [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
           CUDA_EXT=1 pip install -v -e .
@@ -55,7 +55,7 @@ jobs:
           pip install -r requirements/requirements-test.txt
 
       - name: Unit Testing
-        if: steps.check-avai.outputs.avai == "true"
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
           PYTHONPATH=$PWD pytest tests
         env:

From 216cd12f6840f2d010976c529eb7f38acd3c34d3 Mon Sep 17 00:00:00 2001
From: FrankLeeeee
Date: Mon, 6 Feb 2023 14:02:08 +0800
Subject: [PATCH 3/3] polish code

---
 .github/workflows/build_on_schedule.yml | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 3ec3cf1c95cf..32b518ac5394 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -16,13 +16,28 @@ jobs:
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 40
     steps:
+      - name: Check GPU Availability # ensure all GPUs have enough memory
+        id: check-avai
+        run: |
+          avai=true
+          for i in $(seq 0 7);
+          do
+            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
+            [ "$gpu_used" -gt "10000" ] && avai=false
+          done
+
+          echo "GPU is available: $avai"
+          echo "avai=$avai" >> $GITHUB_OUTPUT
+
       - uses: actions/checkout@v2
+        if: steps.check-avai.outputs.avai == 'true'
         with:
           repository: hpcaitech/TensorNVMe
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
           path: TensorNVMe
 
       - name: Install tensornvme
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
           cd TensorNVMe
           conda install cmake
@@ -29,23 +44,11 @@ jobs:
           pip install -r requirements.txt
           pip install -v .
 
       - uses: actions/checkout@v2
+        if: steps.check-avai.outputs.avai == 'true'
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
 
-      - name: Check GPU Availability # ensure all GPUs have enough memory
-        id: check-avai
-        run: |
-          avai=true
-          for i in $(seq 0 7);
-          do
-            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
-            [ "$gpu_used" -gt "10000" ] && avai=false
-          done
-
-          echo "GPU is available: $avai"
-          echo "avai=$avai" >> $GITHUB_OUTPUT
-
       - name: Install Colossal-AI
         if: steps.check-avai.outputs.avai == 'true'
         run: |