diff --git a/docker/common/uv-pytorch.lock b/docker/common/uv-pytorch.lock
index baa5f8df2d..388197c937 100644
--- a/docker/common/uv-pytorch.lock
+++ b/docker/common/uv-pytorch.lock
@@ -3302,6 +3302,7 @@ dependencies = [
     { name = "opencv-python-headless" },
     { name = "pybind11" },
     { name = "pyyaml" },
+    { name = "tiktoken" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "torchao", marker = "sys_platform == 'never'" },
     { name = "torchdata" },
@@ -3494,6 +3495,7 @@ requires-dist = [
     { name = "qwen-omni-utils", marker = "extra == 'vlm'" },
     { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
     { name = "sentencepiece", marker = "extra == 'extra'" },
+    { name = "tiktoken" },
     { name = "timm", marker = "extra == 'vlm'", specifier = "<=1.0.22" },
     { name = "torch", marker = "sys_platform != 'darwin' and sys_platform != 'linux'", specifier = ">=2.6.0,<=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
     { name = "torch", marker = "sys_platform == 'darwin'", specifier = ">=2.6.0,<=2.10.0", index = "https://pypi.org/simple" },
diff --git a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
index d24fb31495..2d015d53c5 100644
--- a/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
+++ b/examples/llm_finetune/qwen/qwen3_moe_30b_hellaswag.yaml
@@ -94,7 +94,7 @@ ci:
   recipe_owner: hemildesai
   time: "00:15:00"
   checkpoint_robustness:
-    hf_kl_threshold: 1e-3
+    hf_kl_threshold: 1e-2
   tokenizer_name: Qwen/Qwen3-30B-A3B
   no_check_resume: true
   dataset.num_samples_limit: 500
diff --git a/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml b/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml
index 075f923e06..03dd53e02d 100644
--- a/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml
+++ b/examples/llm_finetune/qwen/qwen3_moe_30b_uccl_ep.yaml
@@ -65,7 +65,7 @@ distributed:
   pp_size: 1
   # Set ep_size to the total number of GPUs across all nodes.
   # UCCL-EP supports ep_size: 2, 4, 8, 16, 32, 64, 128.
-  ep_size: 16
+  ep_size: 8
   sequence_parallel: false
   activation_checkpointing: true
 
diff --git a/nemo_automodel/_transformers/model_init.py b/nemo_automodel/_transformers/model_init.py
index 5208fe0959..799d8ff936 100644
--- a/nemo_automodel/_transformers/model_init.py
+++ b/nemo_automodel/_transformers/model_init.py
@@ -206,7 +206,8 @@ def get_hf_config(pretrained_model_name_or_path, attn_implementation, **kwargs):
             attn_implementation=attn_implementation,
         )
     except ValueError as e:
-        if "does not recognize this architecture" in str(e):
+        err = str(e)
+        if "does not recognize this architecture" in err:
             raise ValueError(
                 f"{e}\n\n"
                 f"The checkpoint '{pretrained_model_name_or_path}' has a model type not "
@@ -217,10 +218,58 @@
                 f"or install from source:\n"
                 f" pip install git+https://github.com/NVIDIA-NeMo/Automodel.git"
             ) from e
-        raise
+        # Some upstream configs (e.g. stepfun-ai/Step-3.5-Flash) ship
+        # layer_types longer than num_hidden_layers, which newer transformers
+        # versions reject during config instantiation. Fix the raw dict and retry.
+        if "num_hidden_layers" in err and ("layer_types" in err or "layer types" in err):
+            hf_config = _load_config_with_layer_types_fix(
+                pretrained_model_name_or_path,
+                attn_implementation,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        else:
+            raise
     return hf_config
 
 
+def _load_config_with_layer_types_fix(pretrained_model_name_or_path, attn_implementation, trust_remote_code, **kwargs):
+    """Load an HF config after truncating ``layer_types`` to ``num_hidden_layers``.
+
+    Works around buggy upstream configs whose ``layer_types`` list is longer than
+    ``num_hidden_layers`` (e.g. stepfun-ai/Step-3.5-Flash).
+    """
+    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+    config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
+    n = config_dict.get("num_hidden_layers")
+    lt = config_dict.get("layer_types")
+    if isinstance(n, int) and isinstance(lt, list) and len(lt) > n:
+        logger.warning(
+            "Truncating layer_types (len=%d) to num_hidden_layers=%d for %s",
+            len(lt),
+            n,
+            pretrained_model_name_or_path,
+        )
+        config_dict["layer_types"] = lt[:n]
+
+    config_cls = None
+    auto_map = config_dict.get("auto_map") or {}
+    if trust_remote_code and "AutoConfig" in auto_map:
+        from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+        config_cls = get_class_from_dynamic_module(auto_map["AutoConfig"], pretrained_model_name_or_path)
+    if config_cls is None:
+        model_type = config_dict.get("model_type")
+        config_cls = CONFIG_MAPPING.get(model_type)
+    if config_cls is None:
+        raise ValueError(
+            f"Could not resolve config class for {pretrained_model_name_or_path} "
+            f"(model_type={config_dict.get('model_type')!r})"
+        )
+    return config_cls.from_dict(config_dict, attn_implementation=attn_implementation)
+
+
 def get_is_hf_model(config, force_hf):
     """Determine whether the model should use the HF (not custom) implementation."""
     if force_hf:
diff --git a/pyproject.toml b/pyproject.toml
index 9e84b6e01a..3dd22827fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,6 +80,7 @@ dependencies = [
     "opencv-python-headless==4.10.0.84",
     "pybind11",
     "pyyaml",
+    "tiktoken",
     "torch>=2.6.0,<=2.10.0",
     "torchdata",
     "transformers==5.5.0",
diff --git a/tests/unit_tests/_transformers/test_model_init.py b/tests/unit_tests/_transformers/test_model_init.py
index 0d1c679fc1..a72656f0b0 100644
--- a/tests/unit_tests/_transformers/test_model_init.py
+++ b/tests/unit_tests/_transformers/test_model_init.py
@@ -25,6 +25,7 @@
     _consume_config_overrides,
     _has_safetensors,
     _init_model,
+    _load_config_with_layer_types_fix,
     _resolve_model_dir,
     _setup_bnb_loading_kwargs,
     _stream_load_bnb_weights,
@@ -413,3 +414,131 @@ def test_model_without_hf_conversion_mapping_is_supported(self):
         cls, config = self._make_cls(nn.Linear)
         cls._model_mapping = {cfg_type: nn.Linear}
         assert _streaming_bnb_supported(cls, cfg_type()) is True
+
+
+class TestLayerTypesFix:
+    """_load_config_with_layer_types_fix must truncate layer_types and resolve the right config class."""
+
+    def _config_dict(self, n_layers=45, n_layer_types=48):
+        return {
+            "model_type": "step3p5",
+            "num_hidden_layers": n_layers,
+            "layer_types": ["full_attention"] + ["sliding_attention"] * (n_layer_types - 1),
+            "hidden_size": 4096,
+            "auto_map": {"AutoConfig": "configuration_step3p5.Step3p5Config"},
+        }
+
+    @patch("transformers.dynamic_module_utils.get_class_from_dynamic_module")
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_truncates_layer_types_via_dynamic_module(self, mock_get_dict, mock_get_cls):
+        mock_get_dict.return_value = (self._config_dict(), {})
+        built = MagicMock()
+        fake_cls = MagicMock()
+        fake_cls.from_dict.return_value = built
+        mock_get_cls.return_value = fake_cls
+
+        result = _load_config_with_layer_types_fix(
+            "stepfun-ai/Step-3.5-Flash", "sdpa", trust_remote_code=True
+        )
+
+        assert result is built
+        passed_dict = fake_cls.from_dict.call_args[0][0]
+        assert len(passed_dict["layer_types"]) == 45
+        assert passed_dict["layer_types"][0] == "full_attention"
+        assert fake_cls.from_dict.call_args[1]["attn_implementation"] == "sdpa"
+        mock_get_cls.assert_called_once_with(
+            "configuration_step3p5.Step3p5Config", "stepfun-ai/Step-3.5-Flash"
+        )
+
+    @patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_resolves_via_config_mapping_when_not_trust_remote_code(self, mock_get_dict, mock_mapping):
+        cfg_dict = self._config_dict()
+        cfg_dict.pop("auto_map")
+        mock_get_dict.return_value = (cfg_dict, {})
+
+        fake_cls = MagicMock()
+        fake_cls.from_dict.return_value = "built"
+        mock_mapping.get.side_effect = lambda k: fake_cls if k == "step3p5" else None
+
+        result = _load_config_with_layer_types_fix(
+            "some/model", "flash_attention_2", trust_remote_code=False
+        )
+
+        assert result == "built"
+        passed_dict = fake_cls.from_dict.call_args[0][0]
+        assert len(passed_dict["layer_types"]) == 45
+        mock_mapping.get.assert_called_once_with("step3p5")
+
+    @patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_matching_lengths_leaves_layer_types_untouched(self, mock_get_dict, mock_mapping):
+        cfg_dict = self._config_dict(n_layers=45, n_layer_types=45)
+        original = list(cfg_dict["layer_types"])
+        mock_get_dict.return_value = (cfg_dict, {})
+
+        fake_cls = MagicMock()
+        fake_cls.from_dict.return_value = MagicMock()
+        mock_mapping.get.return_value = fake_cls
+
+        _load_config_with_layer_types_fix("some/model", "sdpa", trust_remote_code=False)
+
+        passed_dict = fake_cls.from_dict.call_args[0][0]
+        assert passed_dict["layer_types"] == original
+
+    @patch("transformers.models.auto.configuration_auto.CONFIG_MAPPING", new_callable=MagicMock)
+    @patch("nemo_automodel._transformers.model_init.PretrainedConfig.get_config_dict")
+    def test_raises_when_config_class_cannot_be_resolved(self, mock_get_dict, mock_mapping):
+        cfg_dict = self._config_dict()
+        cfg_dict.pop("auto_map")
+        cfg_dict["model_type"] = "definitely_not_a_real_model_type_xyz"
+        mock_get_dict.return_value = (cfg_dict, {})
+        mock_mapping.get.return_value = None
+
+        with pytest.raises(ValueError, match="Could not resolve config class"):
+            _load_config_with_layer_types_fix("some/model", "sdpa", trust_remote_code=False)
+
+
+class TestGetHfConfigLayerTypesRetry:
+    """get_hf_config should retry via the layer_types fix helper when AutoConfig raises."""
+
+    @patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
+    @patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=True)
+    @patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
+    def test_retry_on_layer_types_mismatch(self, mock_from_pretrained, _mock_trust, mock_fix):
+        mock_from_pretrained.side_effect = ValueError(
+            "`num_hidden_layers` (45) must be equal to the number of layer types (48)."
+        )
+        fixed_cfg = MagicMock()
+        mock_fix.return_value = fixed_cfg
+
+        result = get_hf_config("stepfun-ai/Step-3.5-Flash", "sdpa")
+
+        assert result is fixed_cfg
+        mock_fix.assert_called_once()
+        call_kwargs = mock_fix.call_args[1]
+        assert call_kwargs["trust_remote_code"] is True
+
+    @patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
+    @patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=False)
+    @patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
+    def test_unrelated_value_error_is_reraised(self, mock_from_pretrained, _mock_trust, mock_fix):
+        mock_from_pretrained.side_effect = ValueError("some totally unrelated failure")
+
+        with pytest.raises(ValueError, match="totally unrelated failure"):
+            get_hf_config("fake/model", "sdpa")
+        mock_fix.assert_not_called()
+
+    @patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
+    @patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=False)
+    @patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
+    def test_unrecognized_architecture_still_raises_helpful_error(
+        self, mock_from_pretrained, _mock_trust, mock_fix
+    ):
+        mock_from_pretrained.side_effect = ValueError(
+            "Unknown model (fake/model) does not recognize this architecture"
+        )
+
+        with pytest.raises(ValueError, match="pip install --upgrade nemo_automodel"):
+            get_hf_config("fake/model", "sdpa")
+        mock_fix.assert_not_called()
diff --git a/uv.lock b/uv.lock
index 8758f16917..9dac4d80fa 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3307,6 +3307,7 @@ dependencies = [
     { name = "opencv-python-headless" },
     { name = "pybind11" },
     { name = "pyyaml" },
+    { name = "tiktoken" },
     { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
     { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
@@ -3512,6 +3513,7 @@ requires-dist = [
     { name = "qwen-omni-utils", marker = "extra == 'vlm'" },
    { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
     { name = "sentencepiece", marker = "extra == 'extra'" },
+    { name = "tiktoken" },
     { name = "timm", marker = "extra == 'vlm'", specifier = "<=1.0.22" },
     { name = "torch", marker = "sys_platform != 'darwin' and sys_platform != 'linux'", specifier = ">=2.6.0,<=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
     { name = "torch", marker = "sys_platform == 'darwin'", specifier = ">=2.6.0,<=2.10.0", index = "https://pypi.org/simple" },