From 3afc60d8ba9337794b9eee2e9ee7999fb518bf87 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 11 Nov 2025 17:04:48 -0600 Subject: [PATCH 1/8] add new mi325x scripts and configs --- .github/configs/runners.yaml | 3 ++ runners/launch_mi325x-amd.sh | 72 ++++++++++-------------------------- 2 files changed, 23 insertions(+), 52 deletions(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 11608d32b..b37c37b81 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -34,6 +34,9 @@ mi300x: - 'mi300x-oci_0' mi325x: - 'mi325x-amd_0' +- 'mi325x-amd_1' +- 'mi325x-amd_2' +- 'mi325x-amd_3' - 'mi325x-tw_0' - 'mi325x-tw_1' - 'mi325x-tw_2' diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 91b9bfad3..13875c645 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -1,56 +1,24 @@ -#!/usr/bin/bash +#!/usr/bin/env bash -sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' +export HF_HUB_CACHE_MOUNT="/nfsdata/sa/hf_hub_cache-${USER: -1}/" +export PORT_OFFSET=${USER: -1} -HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" -PORT=8888 - -network_name="bmk-net" -server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name +PARTITION="compute" +SQUASH_FILE="/nfsdata/sa/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ ---privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ ---cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ ---entrypoint=/bin/bash \ -$IMAGE \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done +salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell +JOB_ID=$(squeue -u $USER -h -o %A | head -n1) + +srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" +srun --jobid=$JOB_ID \ +--container-image=$SQUASH_FILE \ +--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mount-home \ +--container-writable \ +--container-remap-root \ +--container-workdir=/workspace/ \ +--no-container-entrypoint --export=ALL \ +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh + +scancel $JOB_ID From 0b0466acf7c5b89cfe15efd44584bb0cb6612ee6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 11 Nov 2025 17:18:10 -0600 Subject: [PATCH 2/8] adding ability to filter runner node on runner-model-sweep in god file --- utils/matrix-logic/generate_sweep_configs.py | 12 +++ .../test_generate_sweep_configs.py | 83 +++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 0c3ccac51..217184a4b 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -471,6 +471,13 @@ def generate_runner_model_sweep_config(args, all_config_data): raise ValueError( f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + # Filter runner nodes if filter is specified + if hasattr(args, 'runner_node_filter') and args.runner_node_filter: + runner_nodes = [node for node in runner_nodes if args.runner_node_filter in node] + if not runner_nodes: + raise ValueError( + f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.") + matrix_values = [] for key, val in all_config_data.items(): # Only consider configs with specified runner @@ -831,6 +838,11 @@ def main(): required=True, help='Configuration file holding runner information' ) + test_config_parser.add_argument( + '--runner-node-filter', + required=False, + help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)' + ) test_config_parser.add_argument( '-h', '--help', action='help', diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index 15c5f25a3..c184ecbab 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -1208,6 +1208,7 @@ def test_generate_runner_model_sweep_config(sample_master_config, temp_config_fi class Args: runner_type = "h200" runner_config = runner_file + runner_node_filter = None result = generate_runner_model_sweep_config(Args(), sample_master_config) assert len(result) > 0 @@ -1224,11 +1225,72 @@ def test_generate_runner_model_sweep_config_invalid_runner(sample_master_config, class Args: runner_type = "invalid-runner" runner_config = runner_file + runner_node_filter = None with pytest.raises(ValueError, match="does not exist in runner config"): generate_runner_model_sweep_config(Args(), sample_master_config) +def test_generate_runner_model_sweep_config_with_node_filter(sample_master_config, temp_config_files): + """Test runner-model sweep with runner node filter.""" + _, runner_file = temp_config_files + + class Args: + runner_type = "h200" + runner_config = runner_file + runner_node_filter = "nv_1" + + result = generate_runner_model_sweep_config(Args(), sample_master_config) + # Should only have entries for h200-nv_1 + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' not in runners + + +def test_generate_runner_model_sweep_config_with_node_filter_multiple_matches(sample_master_config, temp_config_files): + """Test runner-model sweep with runner node filter matching multiple nodes.""" + _, runner_file = temp_config_files + + class Args: + runner_type = "h200" + runner_config = runner_file + runner_node_filter = "nv" # Should match both nv_1 and nv_2 + + result = generate_runner_model_sweep_config(Args(), sample_master_config) + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' in runners + + +def test_generate_runner_model_sweep_config_with_node_filter_no_matches(sample_master_config, temp_config_files): + """Test runner-model sweep with runner node filter that matches no nodes.""" + _, runner_file = temp_config_files + + class Args: + runner_type = "h200" + runner_config = runner_file + runner_node_filter = "nonexistent" + + with pytest.raises(ValueError, match="No runner nodes found matching filter"): + generate_runner_model_sweep_config(Args(), sample_master_config) + + +def test_generate_runner_model_sweep_config_without_node_filter(sample_master_config, temp_config_files): + """Test runner-model sweep without runner node filter (default behavior).""" + _, runner_file = temp_config_files + + class Args: + runner_type = "h200" + runner_config = runner_file + runner_node_filter = None + + result = generate_runner_model_sweep_config(Args(), sample_master_config) + # Should have entries for all h200 nodes + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' in runners + + # Tests for generate_runner_sweep_config def test_generate_runner_sweep_config(sample_master_config, temp_config_files): """Test runner sweep config generation.""" @@ -1387,6 +1449,27 @@ def test_main_runner_model_sweep(temp_config_files): assert len(result) > 0 +def test_main_runner_model_sweep_with_node_filter(temp_config_files): + """Test main function with runner-model-sweep command with node filter.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "runner-model-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--runner-type", "h200", + "--runner-node-filter", "nv_1" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' not in runners + + def test_main_runner_sweep(temp_config_files): """Test main function with runner-sweep command.""" master_file, runner_file = temp_config_files From f3894b5e76ace439e3d37eb0de554646b0e9abc5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:01:52 -0600 Subject: [PATCH 3/8] use hf instead of huggingface-cli; add debug info --- benchmarks/dsr1_fp8_h200_slurm.sh | 2 +- benchmarks/dsr1_fp8_mi300x_slurm.sh | 2 +- benchmarks/dsr1_fp8_mi325x_slurm.sh | 8 +++++++- benchmarks/gptoss_fp4_mi300x_slurm.sh | 2 +- benchmarks/gptoss_fp4_mi325x_slurm.sh | 2 +- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 74a005a78..86ea0024f 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -17,7 +17,7 @@ echo "JOB \$SLURM_JOB_ID running on \$SLURMD_NODENAME" pip3 install --user sentencepiece -huggingface-cli download $MODEL +hf download $MODEL PORT=$(( 8888 + $PORT_OFFSET )) SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index c1c4276c2..90babeaee 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -15,7 +15,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -huggingface-cli download $MODEL +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index d502093d8..e2f17ccfa 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -2,9 +2,15 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +echo "=== CPU Information Inside Container ===" +nproc +cat /proc/cpuinfo | grep processor | wc -l +cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null || echo "cpuset info not available" +echo "========================================" + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 -huggingface-cli download $MODEL +hf download $MODEL # Reference # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 8b657a085..0ab5a250f 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -15,7 +15,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -huggingface-cli download $MODEL +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index d89ed501c..cab549cbc 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -16,7 +16,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -huggingface-cli download $MODEL +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 From dcfb618031058b9b4edc454922c2a3ec3ad39e7b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:24:38 -0600 Subject: [PATCH 4/8] no cpus per task --- runners/launch_mi325x-amd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 13875c645..a3187ab64 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -7,7 +7,7 @@ PARTITION="compute" SQUASH_FILE="/nfsdata/sa/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" set -x -salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell +salloc --partition=$PARTITION --gres=gpu:$TP --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" From ae685088b6814d247213248a9b7b6f77b2dc5803 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:28:08 -0600 Subject: [PATCH 5/8] 256 cpus per task --- runners/launch_mi325x-amd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index a3187ab64..1065167d7 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -7,7 +7,7 @@ PARTITION="compute" SQUASH_FILE="/nfsdata/sa/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" set -x -salloc --partition=$PARTITION --gres=gpu:$TP --time=180 --no-shell +salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" From 216fa057b14750dd3e751584dece876d77bc1142 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 16:24:47 -0600 Subject: [PATCH 6/8] revert erroneous change --- runners/launch_mi325x-amd.sh | 72 ++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 1065167d7..91b9bfad3 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -1,24 +1,56 @@ -#!/usr/bin/env bash +#!/usr/bin/bash -export HF_HUB_CACHE_MOUNT="/nfsdata/sa/hf_hub_cache-${USER: -1}/" -export PORT_OFFSET=${USER: -1} +sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' -PARTITION="compute" -SQUASH_FILE="/nfsdata/sa/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" +PORT=8888 + +network_name="bmk-net" +server_name="bmk-server" +client_name="bmk-client" + +docker network create $network_name set -x -salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A | head -n1) - -srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" -srun --jobid=$JOB_ID \ ---container-image=$SQUASH_FILE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ ---container-writable \ ---container-remap-root \ ---container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL \ -bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh - -scancel $JOB_ID +docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +--privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ +--cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ +-e ISL -e OSL \ +--entrypoint=/bin/bash \ +$IMAGE \ +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ Application\ startup\ complete ]]; then + break + fi +done < <(docker logs -f --tail=0 $server_name 2>&1) + +git clone https://github.com/kimbochen/bench_serving.git + +set -x +docker run --rm --network=$network_name --name=$client_name \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +-e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +--entrypoint=python3 \ +$IMAGE \ +bench_serving/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json + +while [ -n "$(docker ps -aq)" ]; do + docker stop $server_name + docker network rm $network_name + sleep 5 +done From c33727ce3574df6a32a745fa9c6bffb11c3646bd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 16:25:21 -0600 Subject: [PATCH 7/8] get rid of debugging --- benchmarks/dsr1_fp8_mi325x_slurm.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index e2f17ccfa..c96fd4d51 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -2,11 +2,6 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "=== CPU Information Inside Container ===" -nproc -cat /proc/cpuinfo | grep processor | wc -l -cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null || echo "cpuset info not available" -echo "========================================" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 From 1df6b969cf69bbc10599fa1b3a9c813076f26a55 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 16:25:56 -0600 Subject: [PATCH 8/8] whitespace --- benchmarks/dsr1_fp8_mi325x_slurm.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index c96fd4d51..09dae4dbb 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -2,7 +2,6 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 hf download $MODEL