From 3327dd98b52ce7163cc540bec8fa0fad83d87055 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 11:00:45 -0600 Subject: [PATCH 01/30] adding pr label auto validation --- .github/workflows/label-validation.yml | 120 +++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 .github/workflows/label-validation.yml diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml new file mode 100644 index 000000000..f9c9348fe --- /dev/null +++ b/.github/workflows/label-validation.yml @@ -0,0 +1,120 @@ +name: PR Label Validation +run-name: Validate ${{ github.event.label.name }} PR \#${{ github.event.pull_request.number }} + +on: + pull_request: + types: [labeled] + branches: + - main + +jobs: + parse-label: + runs-on: ubuntu-latest + outputs: + runner-type: ${{ steps.parse-label.outputs.runner-type }} + model-prefix: ${{ steps.parse-label.outputs.model-prefix }} + steps: + - name: Parse label + shell: python + run: | + import yaml + import re + with open(runners_config, 'r') as f: + runners = yaml.safe_load(f) + + # Matches strings like h200-trt_gptoss + runner_model_pattern = r'^([^_]+)_([^_]+)$' + match = re.match(runner_model_pattern, '${{ github.event.label.name }}') + if match: + runner_type = match.group(1) + model_prefix = match.group(2) + + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f'runner-type={runner_type}\n') + f.write(f'model-prefix={model_prefix}\n') + + + + get-jobs: + needs: parse-label + if: ${{ needs.parse-label.outputs.runner-type != '' && needs.parse-label.outputs.model-prefix != ''}} + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + pip install pydantic + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py \ + full-sweep \ + --runner-type ${{ needs.parse-label.outputs.runner-type }} \ + --model-prefix ${{ needs.parse-label.outputs.model-prefix }} \ + --seq-lens 1k1k \ + --test-mode \ + --config-files \ + .github/configs/nvidia-master.yaml \ + .github/configs/amd-master.yaml \ + --runner-config .github/configs/runners.yaml) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + validate: + needs: get-jobs + # Prolly unnecessary + if: ${{ needs.get-jobs.outputs.search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: Validate ${{ github.event.label.name }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: validate + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json From f523541087af04b1e49e1f17ac3e777f2a7d460b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:07:05 -0600 Subject: [PATCH 02/30] cant escalpe # in github actions i guess --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index f9c9348fe..f344acb0a 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -1,5 +1,5 @@ name: PR Label Validation -run-name: Validate ${{ github.event.label.name }} PR \#${{ github.event.pull_request.number }} +run-name: Validate ${{ github.event.label.name }} PR ${{ github.event.pull_request.number }} on: pull_request: From 3e8a6b9f1c14dae6f051e2f8727d000557c59f00 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:07:42 -0600 Subject: [PATCH 03/30] yes u can --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index f344acb0a..b8c82ad09 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -1,5 +1,5 @@ name: PR Label Validation -run-name: Validate ${{ github.event.label.name }} PR ${{ github.event.pull_request.number }} +run-name: "Validate ${{ github.event.label.name }} PR #${{ github.event.pull_request.number }}" on: pull_request: From 16957813252ff02ff150ff53521648ed8e131a8d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:08:51 -0600 Subject: [PATCH 04/30] updating runners yaml file path --- .github/workflows/label-validation.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index b8c82ad09..2ed624947 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -19,7 +19,7 @@ jobs: run: | import yaml import re - with open(runners_config, 'r') as f: + with open('.github/configs/runners.yaml', 'r') as f: runners = yaml.safe_load(f) # Matches strings like h200-trt_gptoss @@ -55,9 +55,9 @@ jobs: --seq-lens 1k1k \ --test-mode \ --config-files \ - .github/configs/nvidia-master.yaml \ - .github/configs/amd-master.yaml \ - --runner-config .github/configs/runners.yaml) + ${GITHUB_WORKSPACE}..github/configs/nvidia-master.yaml \ + ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT validate: From 9e9bab6570f68e549313a300ec9afebad292fe1d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:10:38 -0600 Subject: [PATCH 05/30] run on syncornize now too --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 2ed624947..8b70e6fec 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -3,7 +3,7 @@ run-name: "Validate ${{ github.event.label.name }} PR #${{ github.event.pull_req on: pull_request: - types: [labeled] + types: [labeled, synchronize] branches: - main From ddef572d1afd6f4dbaef658bdab62827561ec511 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:11:30 -0600 Subject: [PATCH 06/30] append github workspace --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 8b70e6fec..f457a5113 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -19,7 +19,7 @@ jobs: run: | import yaml import re - with open('.github/configs/runners.yaml', 'r') as f: + with open('${{ github.workspace }}.github/configs/runners.yaml', 'r') as f: runners = yaml.safe_load(f) # Matches strings like h200-trt_gptoss From 30b48d968deb0c9acb16b611c77eab1add718c2b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:12:02 -0600 Subject: [PATCH 07/30] append github workspace pt 2 --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index f457a5113..ba52a9fcc 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -19,7 +19,7 @@ jobs: run: | import yaml import re - with open('${{ github.workspace }}.github/configs/runners.yaml', 'r') as f: + with open('${{ github.workspace }}/.github/configs/runners.yaml', 'r') as f: runners = yaml.safe_load(f) # Matches strings like h200-trt_gptoss From c7a077dff3527ff710f7eaa0a650e3205e383104 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:15:12 -0600 Subject: [PATCH 08/30] actually we dont need runners config at a ll --- .github/workflows/label-validation.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index ba52a9fcc..27a4b48af 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -17,10 +17,7 @@ jobs: - name: Parse label shell: python run: | - import yaml import re - with open('${{ github.workspace }}/.github/configs/runners.yaml', 'r') as f: - runners = yaml.safe_load(f) # Matches strings like h200-trt_gptoss runner_model_pattern = r'^([^_]+)_([^_]+)$' From ff039ca3c0696a15496d41e215a4592d02753bd6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:16:22 -0600 Subject: [PATCH 09/30] debug --- .github/workflows/label-validation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 27a4b48af..b28f184eb 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -19,6 +19,8 @@ jobs: run: | import re + print("${{ github.event.label.name }}") + # Matches strings like h200-trt_gptoss runner_model_pattern = r'^([^_]+)_([^_]+)$' match = re.match(runner_model_pattern, '${{ github.event.label.name }}') From 11980b06c888f5a0e33488ac079bcffe59588519 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:21:32 -0600 Subject: [PATCH 10/30] debug 2 --- .github/workflows/label-validation.yml | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index b28f184eb..2026052fb 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -1,5 +1,5 @@ name: PR Label Validation -run-name: "Validate ${{ github.event.label.name }} PR #${{ github.event.pull_request.number }}" +run-name: "Validate PR #${{ github.event.pull_request.number }}" on: pull_request: @@ -15,22 +15,23 @@ jobs: model-prefix: ${{ steps.parse-label.outputs.model-prefix }} steps: - name: Parse label + id: parse-label shell: python run: | + import json import re + import os - print("${{ github.event.label.name }}") + # Print all PR labels + labels_json = '${{ toJson(github.event.pull_request.labels) }}' + print(f"All PR labels JSON: {labels_json}") - # Matches strings like h200-trt_gptoss - runner_model_pattern = r'^([^_]+)_([^_]+)$' - match = re.match(runner_model_pattern, '${{ github.event.label.name }}') - if match: - runner_type = match.group(1) - model_prefix = match.group(2) + labels = '${{ github.event.pull_request.labels }}' + print(f"All PR labels: {labels}") - with open(os.environ['GITHUB_OUTPUT'], 'a') as f: - f.write(f'runner-type={runner_type}\n') - f.write(f'model-prefix={model_prefix}\n') + labels = json.loads(labels_json) + label_names = [label['name'] for label in labels] + print(f"Label names: {label_names}") @@ -54,17 +55,16 @@ jobs: --seq-lens 1k1k \ --test-mode \ --config-files \ - ${GITHUB_WORKSPACE}..github/configs/nvidia-master.yaml \ + ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml \ ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT validate: needs: get-jobs - # Prolly unnecessary + # Prolly unnecessary if: ${{ needs.get-jobs.outputs.search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: Validate ${{ github.event.label.name }} strategy: fail-fast: false matrix: From dc61215fe1c6fc54084152de49ce4958d19ed199 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:23:47 -0600 Subject: [PATCH 11/30] debug 3 --- .github/workflows/label-validation.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 2026052fb..ce3476cd8 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -23,13 +23,13 @@ jobs: import os # Print all PR labels - labels_json = '${{ toJson(github.event.pull_request.labels) }}' - print(f"All PR labels JSON: {labels_json}") + # labels_json = '${{ toJson(github.event.pull_request.labels) }}' + print(f"All PR labels JSON: ${{ toJson(github.event.pull_request.labels) }}") - labels = '${{ github.event.pull_request.labels }}' - print(f"All PR labels: {labels}") + # labels = '${{ github.event.pull_request.labels }}' + print(f"All PR labels: ${{ github.event.pull_request.labels }}") - labels = json.loads(labels_json) + labels = json.loads('${{ toJson(github.event.pull_request.labels) }}') label_names = [label['name'] for label in labels] print(f"Label names: {label_names}") From a99a6c7f7055c7ee3fc0e918a6c11f559a5aff61 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:24:43 -0600 Subject: [PATCH 12/30] debug 4 --- .github/workflows/label-validation.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index ce3476cd8..2b8af89b2 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -23,10 +23,8 @@ jobs: import os # Print all PR labels - # labels_json = '${{ toJson(github.event.pull_request.labels) }}' print(f"All PR labels JSON: ${{ toJson(github.event.pull_request.labels) }}") - # labels = '${{ github.event.pull_request.labels }}' print(f"All PR labels: ${{ github.event.pull_request.labels }}") labels = json.loads('${{ toJson(github.event.pull_request.labels) }}') From 2590b94dfa95ea667b7853644cce9a04b7b4fad4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:25:39 -0600 Subject: [PATCH 13/30] debug 5 --- .github/workflows/label-validation.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 2b8af89b2..a56e5589e 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -23,11 +23,11 @@ jobs: import os # Print all PR labels - print(f"All PR labels JSON: ${{ toJson(github.event.pull_request.labels) }}") + labels_json = r'''${{ toJson(github.event.pull_request.labels) }}''' + print("All PR labels JSON:") + print(labels_json) - print(f"All PR labels: ${{ github.event.pull_request.labels }}") - - labels = json.loads('${{ toJson(github.event.pull_request.labels) }}') + labels = json.loads(labels_json) label_names = [label['name'] for label in labels] print(f"Label names: {label_names}") From 657a4daf73baf2d966cb3822f4e1763907855de4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:27:01 -0600 Subject: [PATCH 14/30] debug 6 --- .github/workflows/label-validation.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index a56e5589e..2fb697ebe 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -22,12 +22,7 @@ jobs: import re import os - # Print all PR labels - labels_json = r'''${{ toJson(github.event.pull_request.labels) }}''' - print("All PR labels JSON:") - print(labels_json) - - labels = json.loads(labels_json) + labels = json.loads(r'''${{ toJson(github.event.pull_request.labels) }}''') label_names = [label['name'] for label in labels] print(f"Label names: {label_names}") From 0c762c780b44555c1f00b81d3944546a9c33908d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:35:04 -0600 Subject: [PATCH 15/30] support multiple labels --- .github/workflows/label-validation.yml | 72 +++++++++++++++----------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 2fb697ebe..ca6e54555 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -8,50 +8,62 @@ on: - main jobs: - parse-label: + get-jobs: runs-on: ubuntu-latest outputs: - runner-type: ${{ steps.parse-label.outputs.runner-type }} - model-prefix: ${{ steps.parse-label.outputs.model-prefix }} + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} steps: - - name: Parse label - id: parse-label + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs shell: python run: | import json + import subprocess import re import os + # Get matching labels labels = json.loads(r'''${{ toJson(github.event.pull_request.labels) }}''') - label_names = [label['name'] for label in labels] - print(f"Label names: {label_names}") + pattern = r'^([^_]+)_([^_]+)$' + matching = [] + for label in labels: + match = re.match(pattern, label['name']) + if match: + matching.append({'runner-type': match.group(1), 'model-prefix': match.group(2)}) + print(f"Matched label: {label['name']}") + if not matching: + print("No matching labels found") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('search-space-config=[]\n') + exit(0) - get-jobs: - needs: parse-label - if: ${{ needs.parse-label.outputs.runner-type != '' && needs.parse-label.outputs.model-prefix != ''}} - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 + # Generate configs for all matching labels + subprocess.run(['pip', 'install', 'pydantic'], check=True) - - id: get-jobs - run: | - pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py \ - full-sweep \ - --runner-type ${{ needs.parse-label.outputs.runner-type }} \ - --model-prefix ${{ needs.parse-label.outputs.model-prefix }} \ - --seq-lens 1k1k \ - --test-mode \ - --config-files \ - ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml \ - ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + all_configs = [] + for label in matching: + result = subprocess.run([ + 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py", + 'full-sweep', + '--runner-type', label['runner-type'], + '--model-prefix', label['model-prefix'], + '--seq-lens', '1k1k', + '--test-mode', + '--config-files', + f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", + f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", + '--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml" + ], capture_output=True, text=True, check=True) + + all_configs.extend(json.loads(result.stdout)) + + print(f"Total configs: {len(all_configs)}") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f'search-space-config={json.dumps(all_configs)}\n') validate: needs: get-jobs From d098ffcc7af652fecb3b37fc41b478b4323f46cc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 12:36:02 -0600 Subject: [PATCH 16/30] debug 7 --- .github/workflows/label-validation.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index ca6e54555..96888d1e9 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -57,7 +57,13 @@ jobs: f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", '--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml" - ], capture_output=True, text=True, check=True) + ], capture_output=True, text=True) + + if result.returncode != 0: + print(f"Error generating configs:") + print(f"STDOUT: {result.stdout}") + print(f"STDERR: {result.stderr}") + exit(1) all_configs.extend(json.loads(result.stdout)) From 9d264aa582055dc54e1dfa74cc3bb1150e3e9e73 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 13:51:29 -0600 Subject: [PATCH 17/30] debug 8 --- .github/workflows/label-validation.yml | 250 +++++++++++++------------ 1 file changed, 127 insertions(+), 123 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 96888d1e9..b24d64c9d 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -1,130 +1,134 @@ name: PR Label Validation run-name: "Validate PR #${{ github.event.pull_request.number }}" +concurrency: + group: "PR#${{ github.event.pull_request.number }}" + cancel-in-progress: true + on: - pull_request: - types: [labeled, synchronize] - branches: - - main + pull_request: + types: [labeled, synchronize] + branches: + - main jobs: - get-jobs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-jobs - shell: python - run: | - import json - import subprocess - import re - import os - - # Get matching labels - labels = json.loads(r'''${{ toJson(github.event.pull_request.labels) }}''') - pattern = r'^([^_]+)_([^_]+)$' - - matching = [] - for label in labels: - match = re.match(pattern, label['name']) - if match: - matching.append({'runner-type': match.group(1), 'model-prefix': match.group(2)}) - print(f"Matched label: {label['name']}") - - if not matching: - print("No matching labels found") - with open(os.environ['GITHUB_OUTPUT'], 'a') as f: - f.write('search-space-config=[]\n') - exit(0) - - # Generate configs for all matching labels - subprocess.run(['pip', 'install', 'pydantic'], check=True) - - all_configs = [] - for label in matching: - result = subprocess.run([ - 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py", - 'full-sweep', - '--runner-type', label['runner-type'], - '--model-prefix', label['model-prefix'], - '--seq-lens', '1k1k', - '--test-mode', - '--config-files', - f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", - f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", - '--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml" - ], capture_output=True, text=True) - - if result.returncode != 0: - print(f"Error generating configs:") - print(f"STDOUT: {result.stdout}") - print(f"STDERR: {result.stderr}") - exit(1) - - all_configs.extend(json.loads(result.stdout)) - - print(f"Total configs: {len(all_configs)}") - with open(os.environ['GITHUB_OUTPUT'], 'a') as f: - f.write(f'search-space-config={json.dumps(all_configs)}\n') - - validate: - needs: get-jobs - # Prolly unnecessary - if: ${{ needs.get-jobs.outputs.search-space-config != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - secrets: inherit + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + shell: python + run: | + import json + import subprocess + import re + import os + + # Get matching labels + labels = json.loads(r'''${{ toJson(github.event.pull_request.labels) }}''') + pattern = r'^([^_]+)_([^_]+)$' + + matching = [] + for label in labels: + match = re.match(pattern, label['name']) + if match: + matching.append({'runner-type': match.group(1), 'model-prefix': match.group(2)}) + print(f"Matched label: {label['name']}") + + if not matching: + print("No matching labels found") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('search-space-config=[]\n') + exit(0) + + # Generate configs for all matching labels + subprocess.run(['pip', 'install', 'pydantic'], check=True) + + all_configs = [] + for label in matching: + result = subprocess.run([ + 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py", + 'full-sweep', + '--runner-type', label['runner-type'], + '--model-prefix', label['model-prefix'], + '--seq-lens', '1k1k', + '--test-mode', + '--config-files', + f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", + f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", + '--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml" + ], capture_output=True, text=True) + + if result.returncode != 0: + print(f"Error generating configs:") + print(f"STDOUT: {result.stdout}") + print(f"STDERR: {result.stderr}") + exit(1) + + all_configs.extend(json.loads(result.stdout)) + + print(f"Total configs: {len(all_configs)}") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f'search-space-config={json.dumps(all_configs)}\n') + + validate: + needs: get-jobs + # Prolly unnecessary + if: ${{ needs.get-jobs.outputs.search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: validate + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - calc-success-rate: - needs: validate - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json From 67efd3cb7d914e954e3c2d582ba5791c8b3a2018 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 14:23:01 -0600 Subject: [PATCH 18/30] debug 10 --- .github/workflows/label-validation.yml | 109 ++++++++++++++++++------- 1 file changed, 78 insertions(+), 31 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index b24d64c9d..023fe6345 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -16,6 +16,7 @@ jobs: runs-on: ubuntu-latest outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + gb200-config: ${{ steps.get-jobs.outputs.gb200-config }} steps: - name: Checkout code uses: actions/checkout@v4 @@ -33,51 +34,74 @@ jobs: pattern = r'^([^_]+)_([^_]+)$' matching = [] + gb200_labels = [] for label in labels: match = re.match(pattern, label['name']) if match: - matching.append({'runner-type': match.group(1), 'model-prefix': match.group(2)}) - print(f"Matched label: {label['name']}") + runner_type = match.group(1) + model_prefix = match.group(2) - if not matching: + if runner_type == 'gb200': + gb200_labels.append({'runner-type': runner_type, 'model-prefix': model_prefix}) + print(f"Matched GB200 label: {label['name']}") + else: + matching.append({'runner-type': runner_type, 'model-prefix': model_prefix}) + print(f"Matched label: {label['name']}") + + if not matching and not gb200_labels: print("No matching labels found") with open(os.environ['GITHUB_OUTPUT'], 'a') as f: f.write('search-space-config=[]\n') + f.write('gb200-config=[]\n') exit(0) - # Generate configs for all matching labels - subprocess.run(['pip', 'install', 'pydantic'], check=True) - + # Generate configs for standard labels all_configs = [] - for label in matching: - result = subprocess.run([ - 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py", - 'full-sweep', - '--runner-type', label['runner-type'], - '--model-prefix', label['model-prefix'], - '--seq-lens', '1k1k', - '--test-mode', - '--config-files', - f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", - f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", - '--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml" - ], capture_output=True, text=True) - - if result.returncode != 0: - print(f"Error generating configs:") - print(f"STDOUT: {result.stdout}") - print(f"STDERR: {result.stderr}") - exit(1) - - all_configs.extend(json.loads(result.stdout)) - - print(f"Total configs: {len(all_configs)}") + if matching: + subprocess.run(['pip', 'install', 'pydantic'], check=True) + + for label in matching: + result = subprocess.run([ + 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py", + 'full-sweep', + '--runner-type', label['runner-type'], + '--model-prefix', label['model-prefix'], + '--seq-lens', '1k1k', + '--test-mode', + '--config-files', + f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", + f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", + '--runner-config', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/runners.yaml" + ], capture_output=True, text=True) + + if result.returncode != 0: + print(f"Error generating configs:") + print(f"STDOUT: {result.stdout}") + print(f"STDERR: {result.stderr}") + exit(1) + + all_configs.extend(json.loads(result.stdout)) + + # Handle GB200 configs (use static config like in full-sweep-test.yml) + gb200_configs = [] + if gb200_labels: + # Static GB200 config from full-sweep-test.yml + gb200_configs = [ + {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "on"}, + {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "off"}, + {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "on"}, + {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "off"} + ] + + print(f"Total standard configs: {len(all_configs)}") + print(f"Total GB200 configs: {len(gb200_configs)}") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: f.write(f'search-space-config={json.dumps(all_configs)}\n') + f.write(f'gb200-config={json.dumps(gb200_configs)}\n') validate: needs: get-jobs - # Prolly unnecessary if: ${{ needs.get-jobs.outputs.search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml strategy: @@ -85,6 +109,7 @@ jobs: matrix: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit + name: validate ${{ fromJson(needs.get-jobs.outputs.search-space-config).runner }} ${{ fromJson(needs.get-jobs.outputs.search-space-config).image }} with: exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} @@ -100,8 +125,30 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + validate-gb200: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.gb200-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 validation + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.gb200-config) }} + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: dsr1_1k1k + isl: "1024" + osl: "1024" + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + calc-success-rate: - needs: validate + needs: [validate, validate-gb200] if: ${{ always() }} runs-on: ubuntu-latest From 94dca3fbbfdbde8650449e2509b1c3b6af7e5cbc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 14:26:34 -0600 Subject: [PATCH 19/30] debug 11 --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 023fe6345..f69f69525 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -109,7 +109,7 @@ jobs: matrix: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit - name: validate ${{ fromJson(needs.get-jobs.outputs.search-space-config).runner }} ${{ fromJson(needs.get-jobs.outputs.search-space-config).image }} + name: validate ${{ fromJson(needs.get-jobs.outputs.search-space-config)['runner'] }} ${{ fromJson(needs.get-jobs.outputs.search-space-config)['image'] }} with: exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} From 1eff9c6c38851abc941368c628abb368e1c8130c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 14:56:46 -0600 Subject: [PATCH 20/30] debug 12 --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index f69f69525..7051839a4 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -109,7 +109,7 @@ jobs: matrix: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit - name: validate ${{ fromJson(needs.get-jobs.outputs.search-space-config)['runner'] }} ${{ fromJson(needs.get-jobs.outputs.search-space-config)['image'] }} + name: validate ${{ matrix.config.runner }} ${{ matrix.config.name }} with: exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} From c3ce2b634a3e5a34c33de00cbb4b8b19b7b115fd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 14:58:26 -0600 Subject: [PATCH 21/30] debug 13 --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 7051839a4..e07cbb526 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -109,7 +109,7 @@ jobs: matrix: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit - name: validate ${{ matrix.config.runner }} ${{ matrix.config.name }} + name: validate ${{ matrix.config.runner }} ${{ matrix.config.model }} with: exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} From 06dfb773fd7e353da8e45c4ed8c725d9f63d0913 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 15:01:19 -0600 Subject: [PATCH 22/30] adding forward slash for prettier grouping matrix --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index e07cbb526..26ea2fa2b 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -109,7 +109,7 @@ jobs: matrix: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit - name: validate ${{ matrix.config.runner }} ${{ matrix.config.model }} + name: validate ${{ matrix.config.runner }} ${{ matrix.config.model }} / with: exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} From 936fd2d05ccfa9081069738c29e3fbe06a3cc0f0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 15:06:11 -0600 Subject: [PATCH 23/30] debug 14 --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 26ea2fa2b..4ad631623 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -109,7 +109,7 @@ jobs: matrix: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit - name: validate ${{ matrix.config.runner }} ${{ matrix.config.model }} / + name: validate ${{ matrix.config.runner }} / with: exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} From a370ae66d283a04c364bb3ad399c4e58dc43d5f2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 15:08:01 -0600 Subject: [PATCH 24/30] debug 15 --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 4ad631623..451c77751 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -109,7 +109,7 @@ jobs: matrix: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit - name: validate ${{ matrix.config.runner }} / + name: validate ${{ matrix.config.runner }} with: exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} From 56cfa91f454e0d873a2a8ecf444a9fb019b63f2d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 15:38:46 -0600 Subject: [PATCH 25/30] add validate gb200 logic --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 451c77751..329434142 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -129,7 +129,7 @@ jobs: needs: get-jobs if: ${{ needs.get-jobs.outputs.gb200-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 validation + name: validate gb200 / strategy: fail-fast: false matrix: From af1d1bd3ae6cb80271a3be8c928b786a1aa13383 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 15:42:58 -0600 Subject: [PATCH 26/30] add validate gb200 logic pt 2 --- .github/workflows/label-validation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 329434142..cd3c128c6 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -83,6 +83,7 @@ jobs: all_configs.extend(json.loads(result.stdout)) # Handle GB200 configs (use static config like in full-sweep-test.yml) + # FIXME: https://github.com/InferenceMAX/InferenceMAX/issues/171 gb200_configs = [] if gb200_labels: # Static GB200 config from full-sweep-test.yml @@ -125,6 +126,7 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # FIXME: https://github.com/InferenceMAX/InferenceMAX/issues/171 validate-gb200: needs: get-jobs if: ${{ needs.get-jobs.outputs.gb200-config != '[]' }} From c9d4f735a312c596ed03819df18ebfab5b7891e3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 5 Nov 2025 17:11:55 -0600 Subject: [PATCH 27/30] add validate gb200 logic pt 2 --- .github/workflows/label-validation.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index cd3c128c6..ca7e03ae7 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -8,8 +8,6 @@ concurrency: on: pull_request: types: [labeled, synchronize] - branches: - - main jobs: get-jobs: From d16a63e6fb0866216cae225072c4ae6ef93e61f0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 10:40:58 -0600 Subject: [PATCH 28/30] add unlabeled event trigger --- .github/workflows/label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index ca7e03ae7..3b05b2fdc 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -7,7 +7,7 @@ concurrency: on: pull_request: - types: [labeled, synchronize] + types: [labeled, unlabeled, synchronize] jobs: get-jobs: From 1a2dded80393b5ec4fbc8062102237b03fd5310f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 11:45:58 -0600 Subject: [PATCH 29/30] remove gb200 --- .github/workflows/label-validation.yml | 52 ++------------------------ 1 file changed, 4 insertions(+), 48 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 3b05b2fdc..dbf145eda 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -14,7 +14,6 @@ jobs: runs-on: ubuntu-latest outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} - gb200-config: ${{ steps.get-jobs.outputs.gb200-config }} steps: - name: Checkout code uses: actions/checkout@v4 @@ -32,25 +31,19 @@ jobs: pattern = r'^([^_]+)_([^_]+)$' matching = [] - gb200_labels = [] for label in labels: match = re.match(pattern, label['name']) if match: runner_type = match.group(1) model_prefix = match.group(2) - if runner_type == 'gb200': - gb200_labels.append({'runner-type': runner_type, 'model-prefix': model_prefix}) - print(f"Matched GB200 label: {label['name']}") - else: - matching.append({'runner-type': runner_type, 'model-prefix': model_prefix}) - print(f"Matched label: {label['name']}") + matching.append({'runner-type': runner_type, 'model-prefix': model_prefix}) + print(f"Matched label: {label['name']}") - if not matching and not gb200_labels: + if not matching: print("No matching labels found") with open(os.environ['GITHUB_OUTPUT'], 'a') as f: f.write('search-space-config=[]\n') - f.write('gb200-config=[]\n') exit(0) # Generate configs for standard labels @@ -80,24 +73,10 @@ jobs: all_configs.extend(json.loads(result.stdout)) - # Handle GB200 configs (use static config like in full-sweep-test.yml) - # FIXME: https://github.com/InferenceMAX/InferenceMAX/issues/171 - gb200_configs = [] - if gb200_labels: - # Static GB200 config from full-sweep-test.yml - gb200_configs = [ - {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "on"}, - {"image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", "model": "deepseek-ai/DeepSeek-R1-0528", "framework": "dynamo-sglang", "precision": "fp4", "mtp": "off"}, - {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "on"}, - {"image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "framework": "dynamo-trtllm", "precision": "fp4", "mtp": "off"} - ] - print(f"Total standard configs: {len(all_configs)}") - print(f"Total GB200 configs: {len(gb200_configs)}") with open(os.environ['GITHUB_OUTPUT'], 'a') as f: f.write(f'search-space-config={json.dumps(all_configs)}\n') - f.write(f'gb200-config={json.dumps(gb200_configs)}\n') validate: needs: get-jobs @@ -124,31 +103,8 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - # FIXME: https://github.com/InferenceMAX/InferenceMAX/issues/171 - validate-gb200: - needs: get-jobs - if: ${{ needs.get-jobs.outputs.gb200-config != '[]' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: validate gb200 / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.gb200-config) }} - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: dsr1_1k1k - isl: "1024" - osl: "1024" - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} - calc-success-rate: - needs: [validate, validate-gb200] + needs: validate if: ${{ always() }} runs-on: ubuntu-latest From cea369244f74e43c1dfc1f661dd193dbe7bad70a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 6 Nov 2025 12:47:57 -0600 Subject: [PATCH 30/30] add collect results --- .github/workflows/label-validation.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index dbf145eda..d4f3c7194 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -103,9 +103,15 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - calc-success-rate: + collect-results: needs: validate if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + + calc-success-rate: + needs: collect-results + if: ${{ always() }} runs-on: ubuntu-latest env: