Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions .github/workflows/build_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,65 @@ jobs:
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 40
steps:
- name: Check GPU Availability # ensure all GPUs have enough memory
id: check-avai
run: |
avai=true
for i in $(seq 0 7);
do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -le "10000" ] && avai=false
done

echo "GPU is available: $avai"
echo "avai=$avai" >> $GITHUB_OUTPUT

- uses: actions/checkout@v2
if: steps.check-avai.outputs.avai == 'true'
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe

- name: Install tensornvme
if: steps.check-avai.outputs.avai == 'true'
run: |
cd TensorNVMe
conda install cmake
pip install -r requirements.txt
pip install -v .

- uses: actions/checkout@v2
if: steps.check-avai.outputs.avai == 'true'
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

- name: Install Colossal-AI
if: steps.check-avai.outputs.avai == 'true'
run: |
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
CUDA_EXT=1 pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt

- name: Unit Testing
if: steps.check-avai.outputs.avai == 'true'
run: |
gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -le "10000" ] && PYTHONPATH=$PWD pytest tests
PYTHONPATH=$PWD pytest tests
env:
DATA: /data/scratch/cifar-10
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64

- name: Notify Lark
id: message-preparation
if: ${{ failure() }}
run: |
url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
echo $msg
python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
env:
SERVER_URL: ${{github.server_url }}
REPO: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
20 changes: 20 additions & 0 deletions .github/workflows/scripts/send_message_to_lark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import argparse

import requests


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--message', type=str)
parser.add_argument('-u', '--url', type=str)
return parser.parse_args()


def send_message_to_lark(message, webhook_url):
data = {"msg_type": "text", "content": {"text": message}}
requests.post(webhook_url, json=data)


if __name__ == '__main__':
args = parse_args()
send_message_to_lark(args.message, args.url)
6 changes: 6 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
avai=true
for i in $(seq 0 7);
do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -le "10000" ] && avai=false
done