From 6bafe14deb6475248785f46993b0fd8081a0b297 Mon Sep 17 00:00:00 2001
From: adil-a
Date: Tue, 21 Apr 2026 15:49:27 +0000
Subject: [PATCH 1/3] fix: llama3_3_nemotron_super_49B_squad checkpoint robustness thresholds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CI job for sft_ckpt_robustness failed at Phase 2 with
`ValueError: inputs_embeds must be provided for pipeline stages without
embed_tokens` in
`nemo_automodel/components/distributed/pipelining/hf_utils.py`. The
underlying test-harness bug (raw `model_parts[0].forward` can't be called
on non-first PP stages) was already fixed on main by PR #1923 / 83dfbc7c
("fix: make _get_logits pp aware in ckpt robustness") — the next CI
container rebuild will pick it up. Once Phase 2 is unblocked, the test
will proceed to Phase 4 (vanilla-HF load of the consolidated safetensors)
and Phase 6 (training resumption).

This YAML widens the two post-v5.5 thresholds that sibling SFT robustness
jobs have already needed to adjust, one bumped and one newly set (a
sketch of the check the KL margin gates follows the list):

- `hf_kl_threshold` 5e-3 -> 2.5e-2: matches the post-transformers-v5.5
  forward-pass-drift margin established by #1932 (gemma_3_270m_squad),
  #1937 (qwen2_5_7b_squad), and #1942 (qwen3_moe_30b_hellaswag).
- `resume_loss_threshold: 5e-2` (newly set): matches #1937's TP>=2 SFT
  resume bump (the default 5e-3 is too tight for TP=8 non-determinism).
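
For context, `hf_kl_threshold` bounds how far the reloaded consolidated
checkpoint's forward pass may drift from the original HF model's on a
shared batch. A minimal sketch of that kind of check, assuming mean
per-token KL as the drift metric (names are illustrative; the real logic
lives in the checkpoint-robustness harness):

    import torch
    import torch.nn.functional as F

    def mean_token_kl(ref_logits: torch.Tensor, new_logits: torch.Tensor) -> float:
        """Mean per-token KL(ref || new) over all (batch, position) slots."""
        ref_logp = F.log_softmax(ref_logits.float(), dim=-1)
        new_logp = F.log_softmax(new_logits.float(), dim=-1)
        return (ref_logp.exp() * (ref_logp - new_logp)).sum(-1).mean().item()

    # Stand-in tensors; in the harness these come from the pre-save HF
    # model and from the reloaded consolidated checkpoint on the same batch.
    ref = torch.randn(2, 8, 32)
    new = ref + 1e-3 * torch.randn_like(ref)
    assert mean_token_kl(ref, new) <= 2.5e-2  # hf_kl_threshold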

Note: the STATUS.md 2026-04-02 "combined QKV Phase 4 failure" for
Super-49B is stale — the model runs via DeciLM remote_code, which keeps
q/k/v and gate/up as separate Linears at runtime (see CI trace lines
818-827), and the current plan selector routes to
`get_decilm_nemotron_tp_plan` (separate projections) rather than the
fused Llama-Nemotron-Super plan. Consolidated safetensors therefore ship
with HF-compatible per-projection keys.

Signed-off-by: Adil Asif
Signed-off-by: adil-a
---
 .../nemotron/llama3_3_nemotron_super_49B_squad.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml
index 0593583c96..42bf48b640 100644
--- a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml
+++ b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml
@@ -121,7 +121,8 @@ ci:
   time: "00:45:00"
   vllm_deploy: true
   checkpoint_robustness:
-    hf_kl_threshold: 5e-3
+    hf_kl_threshold: 2.5e-2
+    resume_loss_threshold: 5e-2
   distributed.tp_size: 8
   tokenizer_name: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
   hf_device_map_auto: true

From a7ebe213a6a15ceb962c9386507a09daff86502f Mon Sep 17 00:00:00 2001
From: adil-a
Date: Tue, 21 Apr 2026 18:33:12 +0000
Subject: [PATCH 2/3] fix: ensure consolidated config.json preserves model_type/auto_map for trust_remote_code models

Problem: The checkpoint_robustness test for llama3_3_nemotron_super_49B
(DeciLM, ``model_type=nemotron-nas``, trust_remote_code) fails at
Phase 3/4 with: ``Unrecognized model in .../consolidated. Should have a
'model_type' key in its config.json``. The consolidated directory
produced by ``ConsolidatedHFAddon.pre_save`` ships a ``config.json``
that is missing ``model_type`` (and, depending on the transformers
version, may also be missing ``auto_map``), which prevents
``AutoConfig.from_pretrained`` from loading it even with
``trust_remote_code=True``.

Root cause: HF's ``PreTrainedConfig.to_json_string`` defaults to
``use_diff=True``, which calls ``to_diff_dict`` and emits only keys
whose values differ from those of ``self.__class__()``. For custom
configs registered via ``register_for_auto_class`` (DeciLM /
Llama-Nemotron-Super), the class-level ``model_type`` attribute can
compare equal between the live instance and a fresh class-default
instance, causing it to be dropped from the serialized diff. The same
path can drop ``auto_map`` under similar conditions.
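
A condensed repro of the diff-serialization behavior (a sketch only; the
exact class-name spelling and which keys get dropped vary by transformers
release):

    from transformers import PretrainedConfig

    class DeciLMLikeConfig(PretrainedConfig):
        # Class-level attribute: identical on the live instance and on a
        # fresh self.__class__() default, so the use_diff comparison can
        # treat it as "not different" and omit it from the output.
        model_type = "nemotron-nas"

    cfg = DeciLMLikeConfig()
    print("model_type" in cfg.to_diff_dict())  # False on affected releases
    print("model_type" in cfg.to_dict())       # True: the full dict keeps it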

Fix: After writing ``config.json`` via ``to_json_string()``, re-parse it
and re-inject ``model_type`` (from the config's class/instance attribute)
and ``auto_map`` (from the instance attribute, falling back to the
original pretrained ``config.json`` on disk) when missing. The change is
narrow, defensive, and idempotent: it is a no-op when the serialized JSON
already contains both keys, and it does not change behavior for the
overwhelming majority of HF-native configs.

Signed-off-by: Adil Asif
Signed-off-by: adil-a
---
 .../components/checkpoint/addons.py | 57 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/nemo_automodel/components/checkpoint/addons.py b/nemo_automodel/components/checkpoint/addons.py
index ac8788f598..b2bb7efa5c 100644
--- a/nemo_automodel/components/checkpoint/addons.py
+++ b/nemo_automodel/components/checkpoint/addons.py
@@ -75,12 +75,21 @@ def pre_save(self, **kwargs) -> None:
         config_name = "config.v5.json"
 
         _maybe_strip_quantization_config(model_part)
-        with open(os.path.join(hf_metadata_dir, config_name), "w") as f:
+        config_path = os.path.join(hf_metadata_dir, config_name)
+        with open(config_path, "w") as f:
             if hasattr(model_part.config, "to_json_string"):
                 f.write(model_part.config.to_json_string())
             else:
                 # Diffusers models use FrozenDict for config instead of PretrainedConfig
                 json.dump(dict(model_part.config), f, indent=2, default=str)
+        if hasattr(model_part.config, "to_json_string"):
+            # Guarantee ``model_type`` and ``auto_map`` land in the serialized JSON.
+            # HF's ``PreTrainedConfig.to_json_string`` defaults to ``use_diff=True``,
+            # which can drop these keys for ``trust_remote_code`` configs (e.g.
+            # DeciLM / Llama-Nemotron-Super) — causing ``AutoConfig.from_pretrained``
+            # on the consolidated dir to raise ``Unrecognized model ... Should have
+            # a 'model_type' key``, breaking checkpoint-robustness reload (Phase 3/4).
+            _ensure_model_type_and_auto_map(config_path, model_part.config, original_model_path)
 
         # save the generation_config.json file
         if getattr(model_part, "generation_config", None) is not None:
@@ -361,6 +370,52 @@ def _extract_target_modules(model: nn.Module, v4_compatible: bool = False) -> li
     return sorted(final_target_modules)
 
 
+def _ensure_model_type_and_auto_map(config_path: str, config_obj, original_model_path: str | None) -> None:
+    """Ensure the saved ``config.json`` has ``model_type`` and (when applicable) ``auto_map``.
+
+    Context: HF ``PreTrainedConfig.to_json_string`` defaults to ``use_diff=True`` and
+    may omit ``model_type`` or ``auto_map`` for ``trust_remote_code`` configs
+    (e.g. DeciLM / Llama-Nemotron-Super ``model_type='nemotron-nas'``). Without
+    ``model_type`` in ``config.json``, ``AutoConfig.from_pretrained`` on the
+    consolidated directory raises ``Unrecognized model ... Should have a
+    'model_type' key``, breaking checkpoint-robustness reload (Phase 3/4). Without
+    ``auto_map``, HF cannot locate the custom config class even when
+    ``trust_remote_code=True`` is passed.
+    """
+    try:
+        with open(config_path) as f:
+            config_dict = json.load(f)
+    except (OSError, ValueError):
+        return
+
+    changed = False
+
+    if not config_dict.get("model_type"):
+        model_type = getattr(type(config_obj), "model_type", None) or getattr(config_obj, "model_type", None)
+        if model_type:
+            config_dict["model_type"] = model_type
+            changed = True
+
+    if not config_dict.get("auto_map"):
+        auto_map = getattr(config_obj, "auto_map", None)
+        if not auto_map and original_model_path and os.path.isdir(original_model_path):
+            src = os.path.join(original_model_path, "config.json")
+            if os.path.isfile(src):
+                try:
+                    with open(src) as f:
+                        original = json.load(f)
+                except (OSError, ValueError):
+                    original = {}
+                auto_map = original.get("auto_map")
+        if auto_map:
+            config_dict["auto_map"] = auto_map
+            changed = True
+
+    if changed:
+        with open(config_path, "w") as f:
+            json.dump(config_dict, f, indent=2, sort_keys=True)
+
+
 def _maybe_strip_quantization_config(model_part: nn.Module) -> None:
     """Remove ``quantization_config`` from the HF config when no parameters are quantized.
 

From ef751cb972ad104d6b20124912649c908edae883 Mon Sep 17 00:00:00 2001
From: adil-a
Date: Tue, 21 Apr 2026 18:53:00 +0000
Subject: [PATCH 3/3] fix: also preserve architectures in consolidated config.json for trust_remote_code models

Extends the previous fix to also preserve ``architectures`` from the
original pretrained ``config.json`` when it is missing from the
serialized output. Some transformers versions drop ``architectures``
when serializing configs registered via ``register_for_auto_class``,
which can confuse downstream ``AutoModelForCausalLM.from_pretrained``
dispatch even when ``model_type`` and ``auto_map`` are present.

Refactors ``_ensure_model_type_and_auto_map`` to read the original
``config.json`` once and use it as the single fallback source for all
three keys, simplifying the logic. Behavior is otherwise unchanged for
configs that already have all three keys in the ``to_json_string()``
output.
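
For reviewers, a self-contained sketch of the intended post-refactor
behavior; the temp-directory layout, the ``auto_map`` value, and the bare
stand-in config object are invented for illustration:

    import json
    import os
    import tempfile

    from nemo_automodel.components.checkpoint.addons import _ensure_model_type_and_auto_map

    with tempfile.TemporaryDirectory() as tmp:
        # Original pretrained dir carries all three keys.
        orig_dir = os.path.join(tmp, "pretrained")
        os.makedirs(orig_dir)
        with open(os.path.join(orig_dir, "config.json"), "w") as f:
            json.dump({"model_type": "nemotron-nas",
                       "auto_map": {"AutoConfig": "configuration_decilm.DeciLMConfig"},
                       "architectures": ["DeciLMForCausalLM"]}, f)

        # Diff-serialized consolidated output missing all three keys.
        saved = os.path.join(tmp, "config.json")
        with open(saved, "w") as f:
            json.dump({"hidden_size": 8192}, f)

        class BareConfig:  # stand-in whose attributes yield nothing usable
            model_type = None
            auto_map = None

        _ensure_model_type_and_auto_map(saved, BareConfig(), orig_dir)
        with open(saved) as f:
            result = json.load(f)
        assert result["model_type"] == "nemotron-nas"
        assert result["auto_map"]["AutoConfig"] == "configuration_decilm.DeciLMConfig"
        assert result["architectures"] == ["DeciLMForCausalLM"]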

Signed-off-by: Adil Asif
Signed-off-by: adil-a
---
 .../components/checkpoint/addons.py | 38 +++++++++++++------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/nemo_automodel/components/checkpoint/addons.py b/nemo_automodel/components/checkpoint/addons.py
index b2bb7efa5c..7ad8b90a14 100644
--- a/nemo_automodel/components/checkpoint/addons.py
+++ b/nemo_automodel/components/checkpoint/addons.py
@@ -386,31 +386,45 @@ def _ensure_model_type_and_auto_map(config_path: str, config_obj, original_model
         with open(config_path) as f:
             config_dict = json.load(f)
     except (OSError, ValueError):
-        return
+        config_dict = {}
+
+    # Load the original pretrained config.json once (source of truth for trust_remote_code
+    # models whose ``config_obj`` may have lost ``auto_map`` on the NeMo code path).
+    original: dict = {}
+    if original_model_path and os.path.isdir(original_model_path):
+        src = os.path.join(original_model_path, "config.json")
+        if os.path.isfile(src):
+            try:
+                with open(src) as f:
+                    original = json.load(f)
+            except (OSError, ValueError):
+                original = {}
 
     changed = False
 
     if not config_dict.get("model_type"):
-        model_type = getattr(type(config_obj), "model_type", None) or getattr(config_obj, "model_type", None)
+        model_type = (
+            getattr(type(config_obj), "model_type", None)
+            or getattr(config_obj, "model_type", None)
+            or original.get("model_type")
+        )
         if model_type:
             config_dict["model_type"] = model_type
             changed = True
 
     if not config_dict.get("auto_map"):
-        auto_map = getattr(config_obj, "auto_map", None)
-        if not auto_map and original_model_path and os.path.isdir(original_model_path):
-            src = os.path.join(original_model_path, "config.json")
-            if os.path.isfile(src):
-                try:
-                    with open(src) as f:
-                        original = json.load(f)
-                except (OSError, ValueError):
-                    original = {}
-                auto_map = original.get("auto_map")
+        auto_map = getattr(config_obj, "auto_map", None) or original.get("auto_map")
         if auto_map:
             config_dict["auto_map"] = auto_map
             changed = True
 
+    # Also preserve ``architectures`` from the original if missing; some transformers
+    # versions drop it when serializing custom configs registered via
+    # ``register_for_auto_class``.
+    if not config_dict.get("architectures") and original.get("architectures"):
+        config_dict["architectures"] = original["architectures"]
+        changed = True
+
     if changed:
         with open(config_path, "w") as f:
             json.dump(config_dict, f, indent=2, sort_keys=True)
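
With the series applied, the reload that Phases 3/4 exercise should
reduce to the standard path (directory path illustrative; assumes the
consolidation addon also places the remote-code ``*.py`` modules
alongside the weights):

    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("ckpt/consolidated", trust_remote_code=True)
    assert cfg.model_type == "nemotron-nas"  # restored in config.json by this series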