Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/full-sweep-1k1k-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ jobs:
env:
RESULTS_DIR: "results/"
STATS_FILENAME: "run_stats"
GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

steps:
- uses: actions/checkout@v3
Expand All @@ -46,8 +47,11 @@ jobs:
path: ${{ env.RESULTS_DIR }}
pattern: results_*

- name: Install python dependencies
run: pip install PyGithub

- name: Calculate success rate
run: python3 utils/calc_success_rate.py $RESULTS_DIR $STATS_FILENAME
run: python3 utils/calc_success_rate.py $STATS_FILENAME

- uses: actions/upload-artifact@v4
with:
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/full-sweep-1k8k-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ jobs:
env:
RESULTS_DIR: "results/"
STATS_FILENAME: "run_stats"
GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

steps:
- uses: actions/checkout@v3
Expand All @@ -46,8 +47,11 @@ jobs:
path: ${{ env.RESULTS_DIR }}
pattern: results_*

- name: Install python dependencies
run: pip install PyGithub

- name: Calculate success rate
run: python3 utils/calc_success_rate.py $RESULTS_DIR $STATS_FILENAME
run: python3 utils/calc_success_rate.py $STATS_FILENAME

- uses: actions/upload-artifact@v4
with:
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/full-sweep-8k1k-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ jobs:
env:
RESULTS_DIR: "results/"
STATS_FILENAME: "run_stats"
GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

steps:
- uses: actions/checkout@v3
Expand All @@ -46,8 +47,11 @@ jobs:
path: ${{ env.RESULTS_DIR }}
pattern: results_*

- name: Install python dependencies
run: pip install PyGithub

- name: Calculate success rate
run: python3 utils/calc_success_rate.py $RESULTS_DIR $STATS_FILENAME
run: python3 utils/calc_success_rate.py $STATS_FILENAME

- uses: actions/upload-artifact@v4
with:
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/full-sweep-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ jobs:
env:
RESULTS_DIR: "results/"
STATS_FILENAME: "run_stats"
GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

steps:
- uses: actions/checkout@v3
Expand All @@ -76,8 +77,11 @@ jobs:
path: ${{ env.RESULTS_DIR }}
pattern: results_*

- name: Install python dependencies
run: pip install PyGithub

- name: Calculate success rate
run: python3 utils/calc_success_rate.py $RESULTS_DIR $STATS_FILENAME
run: python3 utils/calc_success_rate.py $STATS_FILENAME

- uses: actions/upload-artifact@v4
with:
Expand Down
146 changes: 105 additions & 41 deletions utils/calc_success_rate.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,106 @@
import sys
import json
from pathlib import Path

# Legacy success-rate calculation: the expected number of runs per hardware SKU
# is hard-coded below and compared against the number of result entries found
# in JSON files under the directory given as the first command-line argument.
n_iosl = 3 # [1k1k, 8k1k, 1k8k]
n_concs = 5 # [4, 8, 16, 32, 64]

# H200: (70b-tp: [1, 2, 4, 8], dsr1-tp: [8], gptoss-tp: [1, 2, 4, 8]) x [vllm/sglang, trt] + 70b-tp x trt extra conc: [128]
h200_runs = (4 + 1 + 4) * 2 * n_iosl * n_concs + 4 * n_iosl * 1

# B200:
# 70b = [tp1, tp8] x [fp4, fp8] x n_concs
# 70b-trt = [tp1, tp8] x [fp4, fp8] x conc:[4, 8, 16, 32, 64, 128, 256]
# dsr1 = [tp8] x (fp8: n_concs + fp4: [4, 8, 16, 32, 64, 128, 256])
# dsr1-trt = fp8: ([tp8] x n_concs) + fp4: ([tp4, tp8] x conc:[4, 8, 16, 32, 64, 128, 256])
b200_runs = (2 * 2 * n_concs + 2 * 2 * 7 + 1 * (n_concs + 7) + (1 * n_concs + 2 * 7)) * n_iosl

# Expected run count per hardware SKU.
# NOTE(review): the 'mi355x' comment lists four gptoss TP values but the
# formula adds 2 — confirm which of the two is correct.
total_runs = {
    'h100': (3 + 4) * n_iosl * n_concs, # 70b-tp: [2, 4, 8], gptoss-tp: [1, 2, 4, 8]
    'h200': h200_runs,
    'b200': b200_runs,
    'mi300x': (4 + 1 + 4) * n_iosl * n_concs, # 70b-tp: [1, 2, 4, 8], dsr1-tp: [8], gptoss-tp: [1, 2, 4, 8]
    'mi325x': (4 + 1 + 4) * n_iosl * n_concs, # 70b-tp: [1, 2, 4, 8], dsr1-tp: [8], gptoss-tp: [1, 2, 4, 8]
    'mi355x': ((4 + 1) * 2 + 2) * n_iosl * n_concs, # (70b-tp: [1, 2, 4, 8], dsr1-tp: [8]) x [fp4, fp8], gptoss-tp: [1, 2, 4, 8]
    'gb200': 65, # 45 runs (trtllm) + 20 runs (sglang)
}
# Per-SKU counter of result entries actually found on disk.
success_runs = {'h100': 0, 'h200': 0, 'b200': 0, 'mi300x': 0, 'mi325x': 0, 'mi355x': 0, 'gb200': 0}


# Recursively visit every *.json file under sys.argv[1]. Each file is assumed
# to hold an iterable of entries carrying an 'hw' key — verify against the
# producer of these files.
for results_filepath in Path(sys.argv[1]).rglob('*.json'):
    with open(results_filepath) as f:
        results = json.load(f)

    for result in results:
        # Fold '-trt' variants into their base SKU. An 'hw' value that is not
        # a key of success_runs raises KeyError on the increment below.
        hw_type = result['hw'].replace('-trt', '')
        success_runs[hw_type] += 1

# Pair each observed count with its expected total and write the summary to
# '<sys.argv[2]>.json'.
run_stats = {}
for hw, n_success in success_runs.items():
    run_stats[hw] = {'n_success': n_success, 'total': total_runs[hw]}
with open(f'{sys.argv[2]}.json', 'w') as f:
    json.dump(run_stats, f, indent=2)
import os
import re
import sys
from enum import Enum

from github import Auth, Github

# GPU SKU identifiers searched for in workflow job names. "gb200" and "b200"
# overlap textually; extract_gpu_from_name uses a word-boundary match so that
# a "gb200" job is not attributed to "b200".
GPU_SKUS = ["h100", "h200", "gb200", "mi300x", "mi325x", "mi355x", "b200"]
# Token used to authenticate against the GitHub API; None when unset.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
# Id of the workflow run to inspect and the repository name to look it up in.
# NOTE(review): presumably the default variables exported by GitHub Actions —
# confirm. Both are None when the script runs outside that environment.
RUN_ID = os.environ.get("GITHUB_RUN_ID")
REPO_NAME = os.environ.get("GITHUB_REPOSITORY")

class JobStates(Enum):
    """Job conclusion strings that the success-rate tally distinguishes."""

    # Counted toward both the per-GPU total and the success count.
    SUCCESS = "success"
    # Counted toward the per-GPU total only.
    FAILURE = "failure"
    CANCELLED = "cancelled"
    # Excluded from the tally altogether.
    SKIPPED = "skipped"


def extract_gpu_from_name(job_name, gpu_skus=None):
    """Return the first GPU SKU that appears as a whole word in ``job_name``.

    The comparison is case-insensitive. A SKU matches only when it starts at a
    word boundary and is followed by a hyphen or another word boundary, so
    'b200', 'b200-trt' and 'b200-fp8' resolve to 'b200' while 'gb200' does not.

    Args:
        job_name: Name of the workflow job to inspect.
        gpu_skus: Optional iterable of lowercase SKU names to look for, tried
            in order. Defaults to the module-level ``GPU_SKUS``.

    Returns:
        The matching SKU string, or ``None`` when no SKU is found.
    """
    if gpu_skus is None:
        gpu_skus = GPU_SKUS

    job_lower = job_name.lower()

    for gpu in gpu_skus:
        # re.escape keeps a SKU containing regex metacharacters from being
        # interpreted as a pattern.
        if re.search(rf"\b{re.escape(gpu)}(?:-|\b)", job_lower):
            return gpu

    return None


def calculate_gpu_success_rates():
    """Tally job outcomes per GPU SKU for the workflow run named by RUN_ID.

    Authenticates with GITHUB_TOKEN, looks up REPO_NAME and the run, then
    walks the run's jobs. Jobs whose name contains no known GPU SKU, and jobs
    whose conclusion is "skipped", are ignored. Every other job counts toward
    its SKU's total; only "success" conclusions count as successes.

    Returns:
        A dict mapping each SKU in GPU_SKUS to
        ``{"n_success": int, "total": int}``, or ``None`` when the
        authentication probe fails.

    Raises:
        Exception: whatever the repository or run lookup raises, re-raised
            after being printed.
    """
    client = Github(auth=Auth.Token(GITHUB_TOKEN))

    # Probe the token first; a failure here is reported and yields None
    # rather than propagating.
    try:
        user = client.get_user().login
        print(f"Authenticated as user: {user}")
    except Exception as e:
        print(f"Authentication failed: {e}")
        return None

    # Lookup failures are logged and then propagated to the caller.
    try:
        repo = client.get_repo(REPO_NAME)
        print(f"Found repo: {repo.full_name}")

        run = repo.get_workflow_run(int(RUN_ID))
        print(f"Found run: {run.id} - {run.name}")

    except Exception as e:
        print(f"Error: {e}")
        raise

    passed = dict.fromkeys(GPU_SKUS, 0)
    attempted = dict.fromkeys(GPU_SKUS, 0)

    for job in run.jobs():
        name, outcome = job.name, job.conclusion
        sku = extract_gpu_from_name(name)

        # Ignore jobs that are not tied to a GPU SKU or that never ran.
        if not sku or outcome == JobStates.SKIPPED.value:
            continue

        attempted[sku] += 1
        if outcome == JobStates.SUCCESS.value:
            passed[sku] += 1

    return {
        sku: {"n_success": passed[sku], "total": attempted[sku]}
        for sku in passed
    }


def print_success_rates(success_rates):
    """Pretty print the success rates as a table on stdout.

    Args:
        success_rates: Mapping of GPU name to a dict with ``"n_success"`` and
            ``"total"`` counts, or ``None`` when no data is available.

    Rows are printed in sorted GPU-name order. GPUs whose total is zero are
    omitted because no rate can be computed for them.
    """
    if success_rates is None:
        print("No data to display")
        return

    print("\n" + "=" * 60)
    print("GPU Success Rates")
    print("=" * 60)
    print(f"{'GPU':<10} {'Success':<10} {'Total':<10} {'Rate':<10}")
    print("-" * 60)

    for gpu, stats in sorted(success_rates.items()):
        if stats["total"] > 0:
            rate = (stats["n_success"] / stats["total"]) * 100
            # Rate is the last column, so it needs no padding; padding the
            # number before appending "%" would detach the sign from it.
            print(
                f"{gpu:<10} {stats['n_success']:<10} {stats['total']:<10} {rate:.2f}%"
            )
    print("=" * 60)


if __name__ == "__main__":
    # Gather the per-GPU tallies and echo them to the job log.
    run_stats = calculate_gpu_success_rates()
    print_success_rates(run_stats)

    # Persist the tallies next to the other artifacts; the first command-line
    # argument is the output file name without its ".json" extension.
    stats_path = f"{sys.argv[1]}.json"
    with open(stats_path, "w") as stats_file:
        json.dump(run_stats, stats_file, indent=2)