From b9e21d1d2675026de4ce1a4ee434c3888f04284f Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 28 Sep 2025 17:00:04 +0000 Subject: [PATCH 1/2] fix: loosen sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh step time check Signed-off-by: Terry Kong --- tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh index 05305ab3f7..1ede241679 100755 --- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh @@ -37,5 +37,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma 'data["train/loss"]["1"] < 0.82' \ 'data["train/loss"]["250"] < 0.5' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 25' \ - 'mean(data["timing/train/total_step_time"], -6, -1) < 0.6' + 'mean(data["timing/train/total_step_time"], -6, -1) < 0.7' + # timing/train/total_step_time observed 0.6-0.64 fi From 32e0890f21a0551a35f8a6dc13f80fbe2be1efac Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 29 Sep 2025 03:30:51 +0000 Subject: [PATCH 2/2] fix the train loss after num_workers=1 change Signed-off-by: Terry Kong --- tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh index 1ede241679..b5edc8043e 100755 --- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.sh @@ -35,8 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 0.82' \ - 'data["train/loss"]["250"] < 0.5' \ + 'mean(data["train/loss"],-10,-1) < 0.58' \ 'max(data["ray/node.0.gpu.0.mem_gb"]) < 25' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 0.7' + # mean(data["train/loss"],-10,-1) observed to be 0.5557474825117323 # timing/train/total_step_time observed 0.6-0.64 -fi +fi \ No newline at end of file