diff --git a/docker/common/uv-pytorch.lock b/docker/common/uv-pytorch.lock
index baa5f8df2d..388197c937 100644
--- a/docker/common/uv-pytorch.lock
+++ b/docker/common/uv-pytorch.lock
@@ -3302,6 +3302,7 @@ dependencies = [
     { name = "opencv-python-headless" },
     { name = "pybind11" },
     { name = "pyyaml" },
+    { name = "tiktoken" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "torchao", marker = "sys_platform == 'never'" },
     { name = "torchdata" },
@@ -3494,6 +3495,7 @@ requires-dist = [
     { name = "qwen-omni-utils", marker = "extra == 'vlm'" },
     { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
     { name = "sentencepiece", marker = "extra == 'extra'" },
+    { name = "tiktoken" },
     { name = "timm", marker = "extra == 'vlm'", specifier = "<=1.0.22" },
     { name = "torch", marker = "sys_platform != 'darwin' and sys_platform != 'linux'", specifier = ">=2.6.0,<=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
     { name = "torch", marker = "sys_platform == 'darwin'", specifier = ">=2.6.0,<=2.10.0", index = "https://pypi.org/simple" },
diff --git a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
index d24fb31495..2d015d53c5 100644
--- a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
+++ b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
@@ -94,7 +94,7 @@ ci:
   recipe_owner: hemildesai
   time: "00:15:00"
   checkpoint_robustness:
-    hf_kl_threshold: 1e-3
+    hf_kl_threshold: 1e-2
   tokenizer_name: Qwen/Qwen3-30B-A3B
   no_check_resume: true
   dataset.num_samples_limit: 500
diff --git a/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml b/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml
index 075f923e06..03dd53e02d 100644
--- a/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml
+++ b/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml
@@ -65,7 +65,7 @@ distributed:
   pp_size: 1
   # Set ep_size to the total number of GPUs across all nodes.
   # UCCL-EP supports ep_size: 2, 4, 8, 16, 32, 64, 128.
-  ep_size: 16
+  ep_size: 8
   sequence_parallel: false
   activation_checkpointing: true
 
diff --git a/nemo_automodel/_transformers/model_init.py b/nemo_automodel/_transformers/model_init.py
index 5208fe0959..799d8ff936 100644
--- a/nemo_automodel/_transformers/model_init.py
+++ b/nemo_automodel/_transformers/model_init.py
@@ -206,7 +206,8 @@ def get_hf_config(pretrained_model_name_or_path, attn_implementation, **kwargs):
             attn_implementation=attn_implementation,
         )
     except ValueError as e:
-        if "does not recognize this architecture" in str(e):
+        err = str(e)
+        if "does not recognize this architecture" in err:
             raise ValueError(
                 f"{e}\n\n"
                 f"The checkpoint '{pretrained_model_name_or_path}' has a model type not "
@@ -217,10 +218,58 @@
                 f"or install from source:\n"
                 f" pip install git+https://github.com/NVIDIA-NeMo/Automodel.git"
             ) from e
-        raise
+        # Some upstream configs (e.g. stepfun-ai/Step-3.5-Flash) ship
+        # layer_types longer than num_hidden_layers, which newer transformers
+        # versions reject during config instantiation. Fix the raw dict and retry.
+        if "num_hidden_layers" in err and ("layer_types" in err or "layer types" in err):
+            hf_config = _load_config_with_layer_types_fix(
+                pretrained_model_name_or_path,
+                attn_implementation,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        else:
+            raise
     return hf_config
 
 
+def _load_config_with_layer_types_fix(pretrained_model_name_or_path, attn_implementation, trust_remote_code, **kwargs):
+    """Load an HF config after truncating ``layer_types`` to ``num_hidden_layers``.
+
+    Works around buggy upstream configs whose ``layer_types`` list is longer than
+    ``num_hidden_layers`` (e.g. stepfun-ai/Step-3.5-Flash).
+    """
+    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+    config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
+    n = config_dict.get("num_hidden_layers")
+    lt = config_dict.get("layer_types")
+    if isinstance(n, int) and isinstance(lt, list) and len(lt) > n:
+        logger.warning(
+            "Truncating layer_types (len=%d) to num_hidden_layers=%d for %s",
+            len(lt),
+            n,
+            pretrained_model_name_or_path,
+        )
+        config_dict["layer_types"] = lt[:n]
+
+    config_cls = None
+    auto_map = config_dict.get("auto_map") or {}
+    if trust_remote_code and "AutoConfig" in auto_map:
+        from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+        config_cls = get_class_from_dynamic_module(auto_map["AutoConfig"], pretrained_model_name_or_path)
+    if config_cls is None:
+        model_type = config_dict.get("model_type")
+        config_cls = CONFIG_MAPPING.get(model_type)
+    if config_cls is None:
+        raise ValueError(
+            f"Could not resolve config class for {pretrained_model_name_or_path} "
+            f"(model_type={config_dict.get('model_type')!r})"
+        )
+    return config_cls.from_dict(config_dict, attn_implementation=attn_implementation)
+
+
 def get_is_hf_model(config, force_hf):
     """Determine whether the model should use the HF (not custom) implementation."""
     if force_hf:
diff --git a/pyproject.toml b/pyproject.toml
index 9e84b6e01a..3dd22827fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,6 +80,7 @@ dependencies = [
     "opencv-python-headless==4.10.0.84",
     "pybind11",
     "pyyaml",
+    "tiktoken",
     "torch>=2.6.0,<=2.10.0",
     "torchdata",
     "transformers==5.5.0",
diff --git a/tests/unit_tests/_transformers/test_model_init.py b/tests/unit_tests/_transformers/test_model_init.py
index 0d1c679fc1..a72656f0b0 100644
--- a/tests/unit_tests/_transformers/test_model_init.py
+++ b/tests/unit_tests/_transformers/test_model_init.py
@@ -25,6 +25,7 @@
     _consume_config_overrides,
     _has_safetensors,
     _init_model,
+    _load_config_with_layer_types_fix,
     _resolve_model_dir,
     _setup_bnb_loading_kwargs,
     _stream_load_bnb_weights,
@@ -413,3 +414,131 @@ def test_model_without_hf_conversion_mapping_is_supported(self):
         cls, config = self._make_cls(nn.Linear)
         cls._model_mapping = {cfg_type: nn.Linear}
         assert _streaming_bnb_supported(cls, cfg_type()) is True
+
+
+class TestLayerTypesFix:
+    """_load_config_with_layer_types_fix must truncate layer_types and resolve the right config class."""
+
+    def _config_dict(self, n_layers=45, n_layer_types=48):
+        return {
+            "model_type": "step3p5",
+            "num_hidden_layers": n_layers,
+            "layer_types": ["full_attention"] + ["sliding_attention"] * (n_layer_types - 1),
+            "hidden_size": 4096,
+            "auto_map": {"AutoConfig": "configuration_step3p5.Step3p5Config"},
+        }
+
+    @patch("transformers.dynamic_module_utils.get_class_from_dynamic_module")
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_truncates_layer_types_via_dynamic_module(self, mock_get_dict, mock_get_cls):
+        mock_get_dict.return_value = (self._config_dict(), {})
+        built = MagicMock()
+        fake_cls = MagicMock()
+        fake_cls.from_dict.return_value = built
+        mock_get_cls.return_value = fake_cls
+
+        result = _load_config_with_layer_types_fix(
+            "stepfun-ai/Step-3.5-Flash", "sdpa", trust_remote_code=True
+        )
+
+        assert result is built
+        passed_dict = fake_cls.from_dict.call_args[0][0]
+        assert len(passed_dict["layer_types"]) == 45
+        assert passed_dict["layer_types"][0] == "full_attention"
+        assert fake_cls.from_dict.call_args[1]["attn_implementation"] == "sdpa"
+        mock_get_cls.assert_called_once_with(
+            "configuration_step3p5.Step3p5Config", "stepfun-ai/Step-3.5-Flash"
+        )
+
+    @patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_resolves_via_config_mapping_when_not_trust_remote_code(self, mock_get_dict, mock_mapping):
+        cfg_dict = self._config_dict()
+        cfg_dict.pop("auto_map")
+        mock_get_dict.return_value = (cfg_dict, {})
+
+        fake_cls = MagicMock()
+        fake_cls.from_dict.return_value = "built"
+        mock_mapping.get.side_effect = lambda k: fake_cls if k == "step3p5" else None
+
+        result = _load_config_with_layer_types_fix(
+            "some/model", "flash_attention_2", trust_remote_code=False
+        )
+
+        assert result == "built"
+        passed_dict = fake_cls.from_dict.call_args[0][0]
+        assert len(passed_dict["layer_types"]) == 45
+        mock_mapping.get.assert_called_once_with("step3p5")
+
+    @patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_matching_lengths_leaves_layer_types_untouched(self, mock_get_dict, mock_mapping):
+        cfg_dict = self._config_dict(n_layers=45, n_layer_types=45)
+        original = list(cfg_dict["layer_types"])
+        mock_get_dict.return_value = (cfg_dict, {})
+
+        fake_cls = MagicMock()
+        fake_cls.from_dict.return_value = MagicMock()
+        mock_mapping.get.return_value = fake_cls
+
+        _load_config_with_layer_types_fix("some/model", "sdpa", trust_remote_code=False)
+
+        passed_dict = fake_cls.from_dict.call_args[0][0]
+        assert passed_dict["layer_types"] == original
+
+    @patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_raises_when_config_class_cannot_be_resolved(self, mock_get_dict, mock_mapping):
+        cfg_dict = self._config_dict()
+        cfg_dict.pop("auto_map")
+        cfg_dict["model_type"] = "definitely_not_a_real_model_type_xyz"
+        mock_get_dict.return_value = (cfg_dict, {})
+        mock_mapping.get.return_value = None
+
+        with pytest.raises(ValueError, match="Could not resolve config class"):
+            _load_config_with_layer_types_fix("some/model", "sdpa", trust_remote_code=False)
+
+
+class TestGetHfConfigLayerTypesRetry:
+    """get_hf_config should retry via the layer_types fix helper when AutoConfig raises."""
+
+    @patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
+    @patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=True)
+    @patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
+    def test_retry_on_layer_types_mismatch(self, mock_from_pretrained, _mock_trust, mock_fix):
+        mock_from_pretrained.side_effect = ValueError(
+            "`num_hidden_layers` (45) must be equal to the number of layer types (48)."
+        )
+        fixed_cfg = MagicMock()
+        mock_fix.return_value = fixed_cfg
+
+        result = get_hf_config("stepfun-ai/Step-3.5-Flash", "sdpa")
+
+        assert result is fixed_cfg
+        mock_fix.assert_called_once()
+        call_kwargs = mock_fix.call_args[1]
+        assert call_kwargs["trust_remote_code"] is True
+
+    @patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
+    @patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=False)
+    @patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
+    def test_unrelated_value_error_is_reraised(self, mock_from_pretrained, _mock_trust, mock_fix):
+        mock_from_pretrained.side_effect = ValueError("some totally unrelated failure")
+
+        with pytest.raises(ValueError, match="totally unrelated failure"):
+            get_hf_config("fake/model", "sdpa")
+        mock_fix.assert_not_called()
+
+    @patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
+    @patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=False)
+    @patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
+    def test_unrecognized_architecture_still_raises_helpful_error(
+        self, mock_from_pretrained, _mock_trust, mock_fix
+    ):
+        mock_from_pretrained.side_effect = ValueError(
+            "Unknown model (fake/model) does not recognize this architecture"
+        )
+
+        with pytest.raises(ValueError, match="pip install --upgrade nemo_automodel"):
+            get_hf_config("fake/model", "sdpa")
+        mock_fix.assert_not_called()
diff --git a/uv.lock b/uv.lock
index 8758f16917..9dac4d80fa 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3307,6 +3307,7 @@ dependencies = [
     { name = "opencv-python-headless" },
     { name = "pybind11" },
     { name = "pyyaml" },
+    { name = "tiktoken" },
     { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
     { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
@@ -3512,6 +3513,7 @@ requires-dist = [
     { name = "qwen-omni-utils", marker = "extra == 'vlm'" },
    { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
     { name = "sentencepiece", marker = "extra == 'extra'" },
+    { name = "tiktoken" },
     { name = "timm", marker = "extra == 'vlm'", specifier = "<=1.0.22" },
     { name = "torch", marker = "sys_platform != 'darwin' and sys_platform != 'linux'", specifier = ">=2.6.0,<=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
     { name = "torch", marker = "sys_platform == 'darwin'", specifier = ">=2.6.0,<=2.10.0", index = "https://pypi.org/simple" },