From 27d92cc7c915e714730b2cb92a8f6a859f210994 Mon Sep 17 00:00:00 2001
From: adil-a
Date: Tue, 21 Apr 2026 05:08:12 +0000
Subject: [PATCH 1/2] fix: gemma_3_270m_squad_peft HF KL regression in ckpt
 robustness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bump ci.checkpoint_robustness.hf_kl_threshold from 8e-3 to 3.5e-2 to
restore the gemma_3_270m_squad_peft checkpoint-robustness CI job that
started failing after the transformers v5.5 upgrade (#1734). Mirrors the
sibling non-PEFT fix (#1932) and the earlier qwen3_moe/gpt_oss fix
(#1867).

Phase 3 (automodel-from-consolidated) KL is still 0: this is a
forward-pass numerical drift in v5.5's Gemma3 text-only stack, not a
save/reload correctness bug.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: adil-a
---
 .../llm_finetune/gemma/gemma_3_270m_squad_peft.yaml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml b/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
index f61d09bcc0..35ce8831ec 100644
--- a/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
+++ b/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
@@ -103,7 +103,17 @@ ci:
   recipe_owner: HuiyingLi
   time: "00:20:00"
   checkpoint_robustness:
-    hf_kl_threshold: 8e-3
+    # Bumped from 8e-3 to 3.5e-2 after the transformers v5.5 upgrade (#1734).
+    # Same root cause as the non-PEFT sibling (#1932): with v5.5's Gemma3 text-only
+    # stack, the training-time forward (FSDP2 + kernel patches) and the vanilla HF
+    # eager forward diverge numerically even when saved weights match bit-for-bit
+    # (Phase 3 automodel-from-consolidated KL is still 0). For the PEFT variant,
+    # Phase 4 additionally composes a LoRA adapter on top of a freshly loaded HF
+    # base, which amplifies the drift further when the YAML runs with its
+    # example-level max_steps=100 (observed ~2.8e-2 on cw-dfw). Under the CI
+    # launcher's robustness override (max_steps=5) the observed Phase 4 KL is
+    # 8.44e-3; keep enough headroom to cover both use cases.
+    hf_kl_threshold: 3.5e-2
   tokenizer_name: google/gemma-3-270m
   dataset.limit_dataset_samples: 500
   validation_dataset.limit_dataset_samples: 500

From e11dc2f860abde504a8ecbd32bc0d48b3cb23b0a Mon Sep 17 00:00:00 2001
From: Adil <47084919+adil-a@users.noreply.github.com>
Date: Tue, 21 Apr 2026 01:12:48 -0400
Subject: [PATCH 2/2] Update gemma_3_270m_squad_peft.yaml

---
 .../llm_finetune/gemma/gemma_3_270m_squad_peft.yaml | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml b/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
index 35ce8831ec..5868c3d56c 100644
--- a/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
+++ b/examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
@@ -103,16 +103,6 @@ ci:
   recipe_owner: HuiyingLi
   time: "00:20:00"
   checkpoint_robustness:
-    # Bumped from 8e-3 to 3.5e-2 after the transformers v5.5 upgrade (#1734).
-    # Same root cause as the non-PEFT sibling (#1932): with v5.5's Gemma3 text-only
-    # stack, the training-time forward (FSDP2 + kernel patches) and the vanilla HF
-    # eager forward diverge numerically even when saved weights match bit-for-bit
-    # (Phase 3 automodel-from-consolidated KL is still 0). For the PEFT variant,
-    # Phase 4 additionally composes a LoRA adapter on top of a freshly loaded HF
-    # base, which amplifies the drift further when the YAML runs with its
-    # example-level max_steps=100 (observed ~2.8e-2 on cw-dfw). Under the CI
-    # launcher's robustness override (max_steps=5) the observed Phase 4 KL is
-    # 8.44e-3; keep enough headroom to cover both use cases.
     hf_kl_threshold: 3.5e-2
   tokenizer_name: google/gemma-3-270m
   dataset.limit_dataset_samples: 500
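
---

Note on what hf_kl_threshold gates: Phase 4 compares per-token output
distributions from two forward passes over the same batch and fails the job
when the aggregated KL exceeds the threshold. The sketch below is a
hypothetical illustration of that kind of check, not the actual CI harness
code: the function name, tensor shapes, and stand-in logits are invented,
and only the 3.5e-2 value comes from this patch.

# Hypothetical sketch of a Phase-4-style KL gate; everything except the
# threshold value is illustrative.
import torch
import torch.nn.functional as F

HF_KL_THRESHOLD = 3.5e-2  # value set by this patch

def mean_token_kl(ref_logits: torch.Tensor, test_logits: torch.Tensor) -> float:
    # Mean KL(ref || test) over every (batch, position) token distribution,
    # computed in fp32 so the check does not add numerical drift of its own.
    ref_logp = F.log_softmax(ref_logits.float(), dim=-1)
    test_logp = F.log_softmax(test_logits.float(), dim=-1)
    kl = (ref_logp.exp() * (ref_logp - test_logp)).sum(dim=-1)
    return kl.mean().item()

# Stand-in logits; the real job would compare the training-time forward
# (FSDP2 + kernel patches) against a vanilla HF eager forward on one batch.
ref = torch.randn(2, 16, 128)              # (batch, seq, toy vocab size)
test = ref + 1e-2 * torch.randn_like(ref)  # small numerical drift
assert mean_token_kl(ref, test) <= HF_KL_THRESHOLD, "HF KL regression"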