2 changes: 2 additions & 0 deletions docker/common/uv-pytorch.lock
@@ -3302,6 +3302,7 @@ dependencies = [
{ name = "opencv-python-headless" },
{ name = "pybind11" },
{ name = "pyyaml" },
{ name = "tiktoken" },
{ name = "torch", marker = "sys_platform == 'never'" },
{ name = "torchao", marker = "sys_platform == 'never'" },
{ name = "torchdata" },
@@ -3494,6 +3495,7 @@ requires-dist = [
{ name = "qwen-omni-utils", marker = "extra == 'vlm'" },
{ name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
{ name = "sentencepiece", marker = "extra == 'extra'" },
{ name = "tiktoken" },
{ name = "timm", marker = "extra == 'vlm'", specifier = "<=1.0.22" },
{ name = "torch", marker = "sys_platform != 'darwin' and sys_platform != 'linux'", specifier = ">=2.6.0,<=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
{ name = "torch", marker = "sys_platform == 'darwin'", specifier = ">=2.6.0,<=2.10.0", index = "https://pypi.org/simple" },
2 changes: 1 addition & 1 deletion examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
@@ -94,7 +94,7 @@ ci:
recipe_owner: hemildesai
time: "00:15:00"
checkpoint_robustness:
hf_kl_threshold: 1e-3
hf_kl_threshold: 1e-2
tokenizer_name: Qwen/Qwen3-30B-A3B
no_check_resume: true
dataset.num_samples_limit: 500
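The loosened threshold above presumably gates a KL-divergence comparison between the fine-tuned checkpoint's outputs and the Hugging Face reference. As an illustration only — the real CI check lives elsewhere in the repository and may be implemented differently — a gate of this shape could look like the following Python sketch (all names are hypothetical):

import torch
import torch.nn.functional as F

def kl_below_threshold(ref_logits: torch.Tensor, test_logits: torch.Tensor, threshold: float = 1e-2) -> bool:
    """Hypothetical gate: True if the batch-mean KL(ref || test) stays below the threshold."""
    ref_log_probs = F.log_softmax(ref_logits, dim=-1)
    test_log_probs = F.log_softmax(test_logits, dim=-1)
    # With log_target=True, input is the approximation's log-probs and target is the
    # reference's, so this computes KL(ref || test).
    kl = F.kl_div(test_log_probs, ref_log_probs, log_target=True, reduction="batchmean")
    return kl.item() < threshold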
2 changes: 1 addition & 1 deletion examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml
@@ -65,7 +65,7 @@ distributed:
pp_size: 1
# Set ep_size to the total number of GPUs across all nodes.
# UCCL-EP supports ep_size: 2, 4, 8, 16, 32, 64, 128.
ep_size: 16
ep_size: 8

sequence_parallel: false
activation_checkpointing: true
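The comment above states that ep_size must equal the total GPU count across all nodes and lists the sizes UCCL-EP supports. A minimal Python sketch of that rule, purely for illustration (num_nodes and gpus_per_node are hypothetical inputs, not recipe keys):

# Sizes listed in the recipe comment as supported by UCCL-EP.
SUPPORTED_EP_SIZES = {2, 4, 8, 16, 32, 64, 128}

def resolve_ep_size(num_nodes: int, gpus_per_node: int) -> int:
    """Derive ep_size from the job's total GPU count and validate it."""
    ep_size = num_nodes * gpus_per_node
    if ep_size not in SUPPORTED_EP_SIZES:
        raise ValueError(f"ep_size={ep_size} is not supported; choose one of {sorted(SUPPORTED_EP_SIZES)}")
    return ep_size

# A single node with 8 GPUs yields ep_size: 8, matching the value set above.
assert resolve_ep_size(1, 8) == 8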
53 changes: 51 additions & 2 deletions nemo_automodel/_transformers/model_init.py
@@ -206,7 +206,8 @@ def get_hf_config(pretrained_model_name_or_path, attn_implementation, **kwargs):
attn_implementation=attn_implementation,
)
except ValueError as e:
if "does not recognize this architecture" in str(e):
err = str(e)
if "does not recognize this architecture" in err:
raise ValueError(
f"{e}\n\n"
f"The checkpoint '{pretrained_model_name_or_path}' has a model type not "
@@ -217,10 +218,58 @@ def get_hf_config(pretrained_model_name_or_path, attn_implementation, **kwargs):
f"or install from source:\n"
f" pip install git+https://github.com/NVIDIA-NeMo/Automodel.git"
) from e
raise
# Some upstream configs (e.g. stepfun-ai/Step-3.5-Flash) ship
# layer_types longer than num_hidden_layers, which newer transformers
# versions reject during config instantiation. Fix the raw dict and retry.
if "num_hidden_layers" in err and ("layer_types" in err or "layer types" in err):
hf_config = _load_config_with_layer_types_fix(
pretrained_model_name_or_path,
attn_implementation,
trust_remote_code=trust_remote_code,
**kwargs,
)
else:
raise
return hf_config


def _load_config_with_layer_types_fix(pretrained_model_name_or_path, attn_implementation, trust_remote_code, **kwargs):
"""Load an HF config after truncating ``layer_types`` to ``num_hidden_layers``.

Works around buggy upstream configs whose ``layer_types`` list is longer than
``num_hidden_layers`` (e.g. stepfun-ai/Step-3.5-Flash).
"""
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
n = config_dict.get("num_hidden_layers")
lt = config_dict.get("layer_types")
if isinstance(n, int) and isinstance(lt, list) and len(lt) > n:
logger.warning(
"Truncating layer_types (len=%d) to num_hidden_layers=%d for %s",
len(lt),
n,
pretrained_model_name_or_path,
)
config_dict["layer_types"] = lt[:n]

config_cls = None
auto_map = config_dict.get("auto_map") or {}
if trust_remote_code and "AutoConfig" in auto_map:
from transformers.dynamic_module_utils import get_class_from_dynamic_module

config_cls = get_class_from_dynamic_module(auto_map["AutoConfig"], pretrained_model_name_or_path)
if config_cls is None:
model_type = config_dict.get("model_type")
config_cls = CONFIG_MAPPING.get(model_type)
if config_cls is None:
raise ValueError(
f"Could not resolve config class for {pretrained_model_name_or_path} "
f"(model_type={config_dict.get('model_type')!r})"
)
return config_cls.from_dict(config_dict, attn_implementation=attn_implementation)


def get_is_hf_model(config, force_hf):
"""Determine whether the model should use the HF (not custom) implementation."""
if force_hf:
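The core of the new _load_config_with_layer_types_fix helper is a dict-level repair applied before the config class is instantiated. A minimal standalone Python sketch of that step, using a hand-written config dict instead of a downloaded checkpoint (values mirror the unit tests below):

# Config dict as it might arrive from PretrainedConfig.get_config_dict for an affected model.
config_dict = {
    "model_type": "step3p5",
    "num_hidden_layers": 45,
    "layer_types": ["full_attention"] + ["sliding_attention"] * 47,  # 48 entries, more than num_hidden_layers
}

n = config_dict.get("num_hidden_layers")
layer_types = config_dict.get("layer_types")
if isinstance(n, int) and isinstance(layer_types, list) and len(layer_types) > n:
    # Keep only the first num_hidden_layers entries, as the helper does.
    config_dict["layer_types"] = layer_types[:n]

assert len(config_dict["layer_types"]) == 45
# The helper then resolves the config class — the checkpoint's remote AutoConfig when
# trust_remote_code is set, otherwise transformers' CONFIG_MAPPING — and rebuilds the
# config via from_dict(..., attn_implementation=...).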
1 change: 1 addition & 0 deletions pyproject.toml
@@ -80,6 +80,7 @@ dependencies = [
"opencv-python-headless==4.10.0.84",
"pybind11",
"pyyaml",
"tiktoken",
"torch>=2.6.0,<=2.10.0",
"torchdata",
"transformers==5.5.0",
129 changes: 129 additions & 0 deletions tests/unit_tests/_transformers/test_model_init.py
@@ -25,6 +25,7 @@
_consume_config_overrides,
_has_safetensors,
_init_model,
_load_config_with_layer_types_fix,
_resolve_model_dir,
_setup_bnb_loading_kwargs,
_stream_load_bnb_weights,
@@ -413,3 +414,131 @@ def test_model_without_hf_conversion_mapping_is_supported(self):
cls, config = self._make_cls(nn.Linear)
cls._model_mapping = {cfg_type: nn.Linear}
assert _streaming_bnb_supported(cls, cfg_type()) is True


class TestLayerTypesFix:
"""_load_config_with_layer_types_fix must truncate layer_types and resolve the right config class."""

def _config_dict(self, n_layers=45, n_layer_types=48):
return {
"model_type": "step3p5",
"num_hidden_layers": n_layers,
"layer_types": ["full_attention"] + ["sliding_attention"] * (n_layer_types - 1),
"hidden_size": 4096,
"auto_map": {"AutoConfig": "configuration_step3p5.Step3p5Config"},
}

@patch("transformers.dynamic_module_utils.get_class_from_dynamic_module")
@patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
def test_truncates_layer_types_via_dynamic_module(self, mock_get_dict, mock_get_cls):
mock_get_dict.return_value = (self._config_dict(), {})
built = MagicMock()
fake_cls = MagicMock()
fake_cls.from_dict.return_value = built
mock_get_cls.return_value = fake_cls

result = _load_config_with_layer_types_fix(
"stepfun-ai/Step-3.5-Flash", "sdpa", trust_remote_code=True
)

assert result is built
passed_dict = fake_cls.from_dict.call_args[0][0]
assert len(passed_dict["layer_types"]) == 45
assert passed_dict["layer_types"][0] == "full_attention"
assert fake_cls.from_dict.call_args[1]["attn_implementation"] == "sdpa"
mock_get_cls.assert_called_once_with(
"configuration_step3p5.Step3p5Config", "stepfun-ai/Step-3.5-Flash"
)

@patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
@patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
def test_resolves_via_config_mapping_when_not_trust_remote_code(self, mock_get_dict, mock_mapping):
cfg_dict = self._config_dict()
cfg_dict.pop("auto_map")
mock_get_dict.return_value = (cfg_dict, {})

fake_cls = MagicMock()
fake_cls.from_dict.return_value = "built"
mock_mapping.get.side_effect = lambda k: fake_cls if k == "step3p5" else None

result = _load_config_with_layer_types_fix(
"some/model", "flash_attention_2", trust_remote_code=False
)

assert result == "built"
passed_dict = fake_cls.from_dict.call_args[0][0]
assert len(passed_dict["layer_types"]) == 45
mock_mapping.get.assert_called_once_with("step3p5")

@patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
@patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
def test_matching_lengths_leaves_layer_types_untouched(self, mock_get_dict, mock_mapping):
cfg_dict = self._config_dict(n_layers=45, n_layer_types=45)
original = list(cfg_dict["layer_types"])
mock_get_dict.return_value = (cfg_dict, {})

fake_cls = MagicMock()
fake_cls.from_dict.return_value = MagicMock()
mock_mapping.get.return_value = fake_cls

_load_config_with_layer_types_fix("some/model", "sdpa", trust_remote_code=False)

passed_dict = fake_cls.from_dict.call_args[0][0]
assert passed_dict["layer_types"] == original

@patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
@patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
def test_raises_when_config_class_cannot_be_resolved(self, mock_get_dict, mock_mapping):
cfg_dict = self._config_dict()
cfg_dict.pop("auto_map")
cfg_dict["model_type"] = "definitely_not_a_real_model_type_xyz"
mock_get_dict.return_value = (cfg_dict, {})
mock_mapping.get.return_value = None

with pytest.raises(ValueError, match="Could not resolve config class"):
_load_config_with_layer_types_fix("some/model", "sdpa", trust_remote_code=False)


class TestGetHfConfigLayerTypesRetry:
"""get_hf_config should retry via the layer_types fix helper when AutoConfig raises."""

@patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
@patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=True)
@patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
def test_retry_on_layer_types_mismatch(self, mock_from_pretrained, _mock_trust, mock_fix):
mock_from_pretrained.side_effect = ValueError(
"`num_hidden_layers` (45) must be equal to the number of layer types (48)."
)
fixed_cfg = MagicMock()
mock_fix.return_value = fixed_cfg

result = get_hf_config("stepfun-ai/Step-3.5-Flash", "sdpa")

assert result is fixed_cfg
mock_fix.assert_called_once()
call_kwargs = mock_fix.call_args[1]
assert call_kwargs["trust_remote_code"] is True

@patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
@patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=False)
@patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
def test_unrelated_value_error_is_reraised(self, mock_from_pretrained, _mock_trust, mock_fix):
mock_from_pretrained.side_effect = ValueError("some totally unrelated failure")

with pytest.raises(ValueError, match="totally unrelated failure"):
get_hf_config("fake/model", "sdpa")
mock_fix.assert_not_called()

@patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
@patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=False)
@patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
def test_unrecognized_architecture_still_raises_helpful_error(
self, mock_from_pretrained, _mock_trust, mock_fix
):
mock_from_pretrained.side_effect = ValueError(
"Unknown model (fake/model) does not recognize this architecture"
)

with pytest.raises(ValueError, match="pip install --upgrade nemo_automodel"):
get_hf_config("fake/model", "sdpa")
mock_fix.assert_not_called()
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default.