From e5b31ca668f2aba57b16dad477e576287682a455 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 24 Jul 2025 22:20:39 +0000 Subject: [PATCH 01/22] test: filter out tests that needs_hf_token by default Signed-off-by: Terry Kong --- .github/workflows/cicd-main.yml | 4 ++-- pyproject.toml | 1 + tests/unit/models/generation/test_vllm_large_model.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 3b10108e6a..2c299a8998 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -221,8 +221,8 @@ jobs: UNIT_TEST_SCRIPT: | cd /opt/nemo-rl if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then - uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore\" - uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m mcore + uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore and needs_hf_token\" + uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and needs_hf_token\" else echo Skipping unit tests for docs-only level fi diff --git a/pyproject.toml b/pyproject.toml index cddda79abe..c985cd1351 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -174,6 +174,7 @@ testpaths = ["tests"] python_files = "test_*.py" markers = [ "mcore: marks tests that require the mcore extra", + "needs_hf_token: marks tests that require HuggingFace token access for gated models", ] [tool.coverage.run] diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py index d24a0c0f31..03c6822ecb 100644 --- a/tests/unit/models/generation/test_vllm_large_model.py +++ b/tests/unit/models/generation/test_vllm_large_model.py @@ -125,6 +125,7 @@ def test_input_data(tokenizer): # skip this test for now @pytest.mark.skip(reason="Skipping large model test until we have resources in CI.") +@pytest.mark.needs_hf_token @pytest.mark.asyncio @pytest.mark.parametrize("tensor_parallel_size", [4, 8]) @pytest.mark.parametrize("pipeline_parallel_size", [2]) From 13b71aa92ae26e9632863ace99ad0b2bba998b9b Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 01:30:04 +0000 Subject: [PATCH 02/22] wip Signed-off-by: Terry Kong --- tests/unit/conftest.py | 65 +++++---- .../unit/models/policy/test_dtensor_worker.py | 125 +++++++++++------- .../models/policy/test_megatron_worker.py | 80 ++++++----- 3 files changed, 157 insertions(+), 113 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3197fa2d57..f17be7611d 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -30,6 +30,12 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json") UNIT_RESULTS_FILE_DATED = os.path.join( @@ -37,25 +43,6 @@ ) -# Mapping between asset and absolute path (each are populated from a session level fixture) -class TEST_ASSETS: - TINY_LLAMA_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer" - ) - TINY_LLAMA_TIED_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer" - ) - TINY_QWEN2_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, 
"tiny_qwen2_with_qwen2_tokenizer" - ) - TINY_QWEN3_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer" - ) - TINY_GEMMA3_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer" - ) - - class UnitTestData(TypedDict): exit_status: int | str git_commit: str @@ -394,14 +381,20 @@ def mock_2gpu_distributed_env(): ####################### -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_llama_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - model_path = TEST_ASSETS.TINY_LLAMA_MODEL_PATH + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" + ) + + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer) config = LlamaConfig( @@ -422,14 +415,22 @@ def tiny_llama_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_llama_tied_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - model_path = TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" + ) + + model_path = os.path.join( + TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer" + ) # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer) config = LlamaConfig( @@ -450,14 +451,14 @@ def tiny_llama_tied_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_qwen2_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, Qwen2Config, Qwen2ForCausalLM - model_path = TEST_ASSETS.TINY_QWEN2_MODEL_PATH + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) config = Qwen2Config( @@ -478,14 +479,14 @@ def tiny_qwen2_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_qwen3_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM - model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) config = Qwen3Config( @@ -506,14 +507,20 @@ def tiny_qwen3_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_gemma3_model_path(): """Fixture that 
returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, Gemma3ForCausalLM, Gemma3TextConfig - model_path = TEST_ASSETS.TINY_GEMMA3_MODEL_PATH + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: google/gemma-3-1b-it" + ) + + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=262144 so we can re-use gemma-3-1b tokenizer config = Gemma3TextConfig( diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index c176082698..a8e5bc59e4 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -31,12 +31,11 @@ from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.lm_policy import Policy -from tests.unit.conftest import TEST_ASSETS from tests.unit.test_utils import SimpleLoss def create_test_config( - model_name: str = TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + model_name: str, tp: int = 1, cp: int = 1, sequence_parallel: bool = False, @@ -145,9 +144,9 @@ def gc_collect(): @pytest.fixture -def policy_setup(two_gpu_virtual_cluster): +def policy_setup(two_gpu_virtual_cluster, tiny_llama_model_path): """Setup and teardown for policy tests - creates a virtual cluster and policy.""" - config = create_test_config() + config = create_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) config["generation"] = configure_generation_config(config["generation"], tokenizer) @@ -240,9 +239,17 @@ def test_lm_policy_init(policy_setup): @pytest.fixture def training_setup(request, two_gpu_virtual_cluster): """Setup and teardown specifically for training tests.""" - model_name, tp, cp, sequence_parallel, cpu_offload, activation_checkpointing = ( - request.param - ) + ( + model_fixture_name, + tp, + cp, + sequence_parallel, + cpu_offload, + activation_checkpointing, + ) = request.param + + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) policy = None data = None loss_fn = None @@ -299,37 +306,38 @@ def training_setup(request, two_gpu_virtual_cluster): policy.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(60) @pytest.mark.parametrize( "training_setup", [ - # model_name tp cp sp cpu act - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, True, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, True, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, False, True), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 
2, False, False, False), + # model_fixture_name tp cp sp cpu act + ("tiny_llama_model_path", 1, 1, False, False, False), + ("tiny_llama_model_path", 1, 1, True, False, False), + ("tiny_llama_model_path", 1, 1, False, True, False), + ("tiny_llama_model_path", 1, 1, False, False, True), + ("tiny_llama_model_path", 1, 2, False, False, False), + ("tiny_qwen2_model_path", 1, 1, True, True, False), + ("tiny_qwen2_model_path", 1, 1, True, False, True), + ("tiny_qwen2_model_path", 1, 1, False, True, True), + ("tiny_qwen2_model_path", 1, 1, True, True, True), + ("tiny_qwen2_model_path", 1, 2, False, False, False), + ("tiny_qwen3_model_path", 1, 1, True, True, False), + ("tiny_qwen3_model_path", 1, 1, True, False, True), + ("tiny_qwen3_model_path", 1, 1, False, True, True), + ("tiny_qwen3_model_path", 1, 1, True, True, True), + ("tiny_qwen3_model_path", 1, 2, False, False, False), ( - TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, + "tiny_gemma3_model_path", 1, 1, True, True, False, ), # gemma3 doesn't support spda - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, True, True, True), + ("tiny_gemma3_model_path", 1, 1, True, False, True), + ("tiny_gemma3_model_path", 1, 1, False, True, True), + ("tiny_gemma3_model_path", 1, 1, True, True, True), # CP doesn't support gemma3 due to spda input has attent_mask != None. ], indirect=True, @@ -372,9 +380,17 @@ def verify_loss_tensor(loss_tensor): @pytest.fixture def logprob_setup(request, two_gpu_virtual_cluster): """Setup and teardown specifically for training tests.""" - model_name, tp, cp, sequence_parallel, cpu_offload, activation_checkpointing = ( - request.param - ) + ( + model_fixture_name, + tp, + cp, + sequence_parallel, + cpu_offload, + activation_checkpointing, + ) = request.param + + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) policy = None data = None @@ -449,28 +465,29 @@ def logprob_setup(request, two_gpu_virtual_cluster): policy.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(360) @pytest.mark.parametrize( "logprob_setup", [ # TP=2, CP=1 - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 2, 1, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, 1, False, False, False), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 2, 1, False, False, False), + ("tiny_qwen2_model_path", 2, 1, False, True, False), + ("tiny_qwen2_model_path", 2, 1, False, False, False), + ("tiny_llama_model_path", 2, 1, False, False, False), + ("tiny_llama_model_path", 2, 1, False, True, False), + ("tiny_llama_model_path", 2, 1, False, True, True), + ("tiny_qwen3_model_path", 2, 1, False, True, False), + ("tiny_qwen3_model_path", 2, 1, False, False, False), + ("tiny_gemma3_model_path", 2, 1, False, True, False), + ("tiny_gemma3_model_path", 2, 1, False, False, False), # TP=1, CP=2 - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, True, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 
1, 2, False, True, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 2, False, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 2, False, False, False), + ("tiny_qwen2_model_path", 1, 2, False, True, False), + ("tiny_qwen2_model_path", 1, 2, False, False, False), + ("tiny_llama_model_path", 1, 2, False, False, False), + ("tiny_llama_model_path", 1, 2, False, True, False), + ("tiny_llama_model_path", 1, 2, False, True, True), + ("tiny_qwen3_model_path", 1, 2, False, True, False), + ("tiny_qwen3_model_path", 1, 2, False, False, False), ], indirect=True, ) @@ -491,7 +508,10 @@ def test_dtensor_worker_logprob_tp2_or_cp2_matches_unsharded(logprob_setup): ) -def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_cluster): +@pytest.mark.needs_hf_token +def test_dtensor_tp_and_tied_model_with_custom_parallel_plan( + two_gpu_virtual_cluster, tiny_llama_tied_model_path +): """Test that DTensor with a tp > 1 and a tied model with a custom parallel plan works.""" from torch.distributed.tensor.parallel import ColwiseParallel from torch.distributed.tensor.placement_types import Replicate @@ -501,7 +521,7 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_clu "model.embed_tokens": ColwiseParallel(output_layouts=Replicate()), } config = create_test_config( - model_name=TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH, + model_name=tiny_llama_tied_model_path, tp=2, cp=1, sequence_parallel=False, @@ -534,8 +554,11 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_clu policy.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(180) -def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cluster): +def test_dtensor_loss_independent_of_microbatch_size_two_gpus( + two_gpu_virtual_cluster, tiny_llama_model_path +): """Tests that changing microbatch size while keeping global batch size constant does not affect loss values in DTensor.""" # Create test batch with global batch size of 8 global_batch_size = 8 @@ -569,7 +592,7 @@ def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cl ) # Test with mbs=1, 2 microbatches per GPU - config = create_test_config() + config = create_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) print("Creating training Policy with mbs=1...") @@ -605,7 +628,7 @@ def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cl policy_mbs1.worker_group.shutdown() # Test with mbs=2, 1 microbatch per GPU - config = create_test_config() + config = create_test_config(tiny_llama_model_path) config["train_micro_batch_size"] = 2 config["generation"] = configure_generation_config(config["generation"], tokenizer) diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index a399bca0d5..76753e60d0 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -28,12 +28,11 @@ from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.lm_policy import Policy -from tests.unit.conftest import TEST_ASSETS from tests.unit.test_utils import SimpleLoss def create_megatron_test_config( - model_name: str = TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + model_name: str, tp: int = 1, pp: int = 1, precision: str = "float32", @@ -199,18 +198,21 @@ def policy_setup(request): @pytest.fixture 
def training_setup(request): """Setup and teardown specifically for training tests.""" - # Parse parameters: (num_gpus, tp, pp, model_name, config_updates) + # Parse parameters: (num_gpus, tp, pp, model_fixture_name, config_updates) if hasattr(request, "param") and request.param is not None: - num_gpus, tp, pp, model_name, config_updates = request.param + num_gpus, tp, pp, model_fixture_name, config_updates = request.param else: - num_gpus, tp, pp, model_name, config_updates = ( + num_gpus, tp, pp, model_fixture_name, config_updates = ( 2, 1, 1, - TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + "tiny_llama_model_path", {}, ) + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) + policy = None cluster = None data = None @@ -317,24 +319,25 @@ def training_setup(request): cluster.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(300) @pytest.mark.parametrize( "training_setup", [ - # (num_gpus, tp, pp, model_name, config_updates) - (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {}), - (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {}), - (2, 1, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH, {}), - (2, 2, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH, {}), - (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {"precision": "bfloat16"}), + # (num_gpus, tp, pp, model_fixture_name, config_updates) + (2, 1, 1, "tiny_llama_model_path", {}), + (2, 2, 1, "tiny_llama_model_path", {}), + (2, 1, 1, "tiny_qwen2_model_path", {}), + (2, 2, 1, "tiny_qwen2_model_path", {}), + (2, 1, 1, "tiny_llama_model_path", {"precision": "bfloat16"}), ( 2, 1, 1, - TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + "tiny_llama_model_path", {"activation_checkpointing": True}, ), - (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {"sequence_parallel": True}), + (2, 2, 1, "tiny_llama_model_path", {"sequence_parallel": True}), ], indirect=True, ids=[ @@ -536,11 +539,14 @@ def test_megatron_policy_generation(generation_setup): @pytest.fixture def logprob_setup(request): """Setup and teardown specifically for logprob tests.""" - # Parse parameters: (num_gpus, tp, pp, model_name) + # Parse parameters: (num_gpus, tp, pp, model_fixture_name) if hasattr(request, "param") and request.param is not None: - num_gpus, tp, pp, model_name = request.param + num_gpus, tp, pp, model_fixture_name = request.param else: - num_gpus, tp, pp, model_name = 2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH + num_gpus, tp, pp, model_fixture_name = 2, 1, 1, "tiny_llama_model_path" + + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) policy = None cluster = None @@ -616,14 +622,15 @@ def logprob_setup(request): @pytest.mark.timeout(180) +@pytest.mark.needs_hf_token @pytest.mark.parametrize( "logprob_setup", [ - # (num_gpus, tp, pp, model_name) - (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH), - (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH), - (2, 1, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH), - (2, 2, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH), + # (num_gpus, tp, pp, model_fixture_name) + (2, 1, 1, "tiny_llama_model_path"), + (2, 2, 1, "tiny_llama_model_path"), + (2, 1, 1, "tiny_qwen2_model_path"), + (2, 2, 1, "tiny_qwen2_model_path"), ], indirect=True, ids=["2gpu_dp2_llama", "2gpu_tp2_llama", "2gpu_dp2_qwen2", "2gpu_tp2_qwen2"], @@ -656,7 +663,8 @@ def test_megatron_policy_logprobs(logprob_setup): @pytest.mark.timeout(240) -def test_megatron_loss_independent_of_microbatch_size(): +@pytest.mark.needs_hf_token +def test_megatron_loss_independent_of_microbatch_size(tiny_llama_model_path): """Test 
that changing microbatch size while keeping global batch size constant does not affect loss values.""" num_gpus = 2 global_batch_size = 8 @@ -697,7 +705,7 @@ def test_megatron_loss_independent_of_microbatch_size(): max_colocated_worker_groups=1, ) - config1 = create_megatron_test_config() + config1 = create_megatron_test_config(tiny_llama_model_path) config1["train_micro_batch_size"] = 1 tokenizer = get_tokenizer(config1["tokenizer"]) config1["generation"] = configure_generation_config( @@ -745,7 +753,7 @@ def test_megatron_loss_independent_of_microbatch_size(): max_colocated_worker_groups=1, ) - config2 = create_megatron_test_config() + config2 = create_megatron_test_config(tiny_llama_model_path) config2["train_micro_batch_size"] = 2 config2["generation"] = configure_generation_config( config2["generation"], tokenizer @@ -774,7 +782,8 @@ def test_megatron_loss_independent_of_microbatch_size(): @pytest.mark.timeout(300) -def test_megatron_reference_policy_functionality(): +@pytest.mark.needs_hf_token +def test_megatron_reference_policy_functionality(tiny_llama_model_path): """Test Megatron reference policy functionality.""" num_gpus = 2 @@ -786,7 +795,7 @@ def test_megatron_reference_policy_functionality(): max_colocated_worker_groups=1, ) - config = create_megatron_test_config() + config = create_megatron_test_config(tiny_llama_model_path) config["megatron_cfg"]["optimizer"]["lr"] = 1e-2 # Increase from 5e-6 to 1e-2 config["megatron_cfg"]["optimizer"]["min_lr"] = 1e-3 # Increase min_lr as well @@ -894,6 +903,7 @@ def test_megatron_reference_policy_functionality(): @pytest.mark.timeout(400) +@pytest.mark.needs_hf_token @pytest.mark.parametrize( "num_gpus,tp,pp", [ @@ -903,12 +913,14 @@ def test_megatron_reference_policy_functionality(): ], ids=["2gpu_dp2_save_restore", "2gpu_pp2_save_restore", "2gpu_tp2_save_restore"], ) -def test_megatron_checkpoint_save_kill_and_restore(num_gpus, tp, pp): +def test_megatron_checkpoint_save_kill_and_restore( + num_gpus, tp, pp, tiny_llama_model_path +): """Test full checkpoint save/restore cycle: save -> kill worker -> restart -> verify restore.""" from copy import deepcopy # Use tiny model for faster testing - model_name = TEST_ASSETS.TINY_LLAMA_MODEL_PATH + model_name = tiny_llama_model_path tokenizer = get_tokenizer({"name": model_name}) with tempfile.TemporaryDirectory(prefix="megatron_save_restore_") as temp_dir: @@ -1146,7 +1158,8 @@ def test_megatron_checkpoint_save_kill_and_restore(num_gpus, tp, pp): @pytest.mark.timeout(300) -def test_megatron_dpo_training(): +@pytest.mark.needs_hf_token +def test_megatron_dpo_training(tiny_llama_model_path): """Test DPO training with Megatron backend.""" num_gpus = 2 batch_size = 8 @@ -1184,7 +1197,7 @@ def test_megatron_dpo_training(): max_colocated_worker_groups=1, ) - config = create_megatron_test_config() + config = create_megatron_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) policy = Policy( @@ -1242,7 +1255,8 @@ def test_megatron_dpo_training(): @pytest.mark.timeout(300) -def test_megatron_sft_training(): +@pytest.mark.needs_hf_token +def test_megatron_sft_training(tiny_llama_model_path): """Test SFT training with Megatron backend.""" num_gpus = 2 batch_size = 8 @@ -1277,7 +1291,7 @@ def test_megatron_sft_training(): max_colocated_worker_groups=1, ) - config = create_megatron_test_config() + config = create_megatron_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) policy = Policy( From e2f06bc17de7e63bb6bc489910c267f3bcb60743 Mon Sep 17 
00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 01:45:37 +0000 Subject: [PATCH 03/22] fix Signed-off-by: Terry Kong --- tests/unit/algorithms/test_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py index 82338de026..8bd45c2349 100755 --- a/tests/unit/algorithms/test_utils.py +++ b/tests/unit/algorithms/test_utils.py @@ -77,6 +77,7 @@ def get_format_with_simple_role_header(messages): return message +@pytest.mark.needs_hf_token def test_get_tokenizer_no_chat_template(conversation_messages): """Test get_tokenizer when no chat template is specified in config""" config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} @@ -89,6 +90,7 @@ def test_get_tokenizer_no_chat_template(conversation_messages): assert formatted == expected +@pytest.mark.needs_hf_token def test_get_tokenizer_default_chat_template(conversation_messages): """Test get_tokenizer when chat_template is 'default' in config""" config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"} @@ -100,6 +102,7 @@ def test_get_tokenizer_default_chat_template(conversation_messages): assert formatted == expected +@pytest.mark.needs_hf_token def test_get_tokenizer_null_chat_template(conversation_messages): """Test get_tokenizer when chat_template is None in config""" config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None} @@ -113,6 +116,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages): assert formatted == expected +@pytest.mark.needs_hf_token def test_get_tokenizer_custom_jinja_template(conversation_messages): """Test get_tokenizer when a custom jinja template is specified""" custom_template = COMMON_CHAT_TEMPLATES.simple_role_header From fa548a890b29f783417a8b56cdae2198d877c2d1 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:19:46 +0000 Subject: [PATCH 04/22] wip Signed-off-by: Terry Kong --- tests/unit/algorithms/test_utils.py | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py index 8bd45c2349..d1940e8374 100755 --- a/tests/unit/algorithms/test_utils.py +++ b/tests/unit/algorithms/test_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os from datetime import datetime import pytest @@ -20,6 +21,11 @@ from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + @pytest.fixture def conversation_messages(): """Fixture providing a multi-turn conversation for testing chat templates""" @@ -80,6 +86,12 @@ def get_format_with_simple_role_header(messages): @pytest.mark.needs_hf_token def test_get_tokenizer_no_chat_template(conversation_messages): """Test get_tokenizer when no chat template is specified in config""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} tokenizer = get_tokenizer(config) @@ -93,6 +105,12 @@ def test_get_tokenizer_no_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_default_chat_template(conversation_messages): """Test get_tokenizer when chat_template is 'default' in config""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"} tokenizer = get_tokenizer(config) @@ -105,6 +123,12 @@ def test_get_tokenizer_default_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_null_chat_template(conversation_messages): """Test get_tokenizer when chat_template is None in config""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None} tokenizer = get_tokenizer(config) @@ -119,6 +143,12 @@ def test_get_tokenizer_null_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_custom_jinja_template(conversation_messages): """Test get_tokenizer when a custom jinja template is specified""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + custom_template = COMMON_CHAT_TEMPLATES.simple_role_header config = { "name": "meta-llama/Llama-3.2-1B-Instruct", From 09936ca5776060727b2c10a87791baf7af519d26 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:22:34 +0000 Subject: [PATCH 05/22] wip Signed-off-by: Terry Kong --- .../data/hf_datasets/test_oai_format_dataset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py index 4ba75a6a1d..9b53ea03a2 100644 --- a/tests/unit/data/hf_datasets/test_oai_format_dataset.py +++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py @@ -13,11 +13,18 @@ # limitations under the License. 
import json +import os import tempfile import pytest from transformers import AutoTokenizer + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.oai_format_dataset import ( OpenAIFormatDataset, @@ -87,8 +94,15 @@ def test_custom_keys(sample_data): assert dataset.system_prompt == "You are a helpful assistant." +@pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True) def test_message_formatting(sample_data): + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" + ) + train_path, val_path = sample_data dataset = OpenAIFormatDataset( train_path, val_path, chat_key="messages", system_key="system_key" From a63d0e766d6fcbb319f7e0af7281250f1fcc8a3e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:25:42 +0000 Subject: [PATCH 06/22] fix Signed-off-by: Terry Kong --- .../unit/data/hf_datasets/test_prompt_response.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py index 8ff7f5c5f6..1742dd3c86 100644 --- a/tests/unit/data/hf_datasets/test_prompt_response.py +++ b/tests/unit/data/hf_datasets/test_prompt_response.py @@ -13,11 +13,18 @@ # limitations under the License. import json +import os import tempfile import pytest from transformers import AutoTokenizer + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.prompt_response_dataset import ( PromptResponseDataset, @@ -76,8 +83,15 @@ def test_custom_keys(sample_data): assert dataset.output_key == "answer" +@pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True) def test_message_formatting(sample_data): + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" + ) + train_path, val_path = sample_data dataset = PromptResponseDataset( train_path, val_path, input_key="question", output_key="answer" From 8147d6c733c520ada6d094bd6c51a4f2f8285ae6 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:29:39 +0000 Subject: [PATCH 07/22] wip Signed-off-by: Terry Kong --- tests/unit/data/test_llm_message_utils.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 6f9c3f3129..7ce59a5986 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -13,10 +13,18 @@ # limitations under the License. 
+import os + import pytest import torch from transformers import AutoTokenizer + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, TaskDataSpec from nemo_rl.data.llm_message_utils import ( @@ -328,9 +336,16 @@ def test_batch_pad_message_log_custom_pad_value( ) +@pytest.mark.needs_hf_token def test_get_formatted_message_log_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" + ) + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -372,9 +387,16 @@ def test_get_formatted_message_log_llama( assert actual_text == expected_text +@pytest.mark.needs_hf_token def test_get_formatted_message_log_add_generation_prompt_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" + ) + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -499,7 +521,14 @@ def test_get_formatted_message_log_add_generation_prompt_qwen( assert actual_text == expected_text +@pytest.mark.needs_hf_token def test_formatted_message_log_empty_message(): + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" + ) + message_logs = [ [ {"role": "system", "content": "You are a helpful assistant."}, From 19dcb012ea6709e9657f81062ba8baf97a78d613 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:36:42 +0000 Subject: [PATCH 08/22] ok Signed-off-by: Terry Kong --- tests/unit/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f17be7611d..79c9cf0def 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -37,6 +37,7 @@ def has_hf_token(): TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") +os.makedirs(TEST_ASSETS_DIR, exist_ok=True) UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json") UNIT_RESULTS_FILE_DATED = os.path.join( dir_path, f"unit_results/{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" From 03ef590409cd339d51ed373ca611ce53d72904af Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:45:12 +0000 Subject: [PATCH 09/22] fix Signed-off-by: Terry Kong --- tests/unit/models/dtensor/test_parallelize.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/unit/models/dtensor/test_parallelize.py b/tests/unit/models/dtensor/test_parallelize.py index 5acb7addc4..0c3de464f6 100644 --- a/tests/unit/models/dtensor/test_parallelize.py +++ b/tests/unit/models/dtensor/test_parallelize.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os from itertools import product from unittest.mock import MagicMock @@ -19,6 +20,12 @@ from torch.distributed.tensor.parallel import ParallelStyle, parallelize_module from transformers import AutoModelForCausalLM + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.models.dtensor.parallelize import ( _parallelize_gemma3, _parallelize_llama, @@ -26,6 +33,7 @@ ) +@pytest.mark.needs_hf_token @pytest.mark.parametrize( "model_name, parallelize_func, sequence_parallel", [ @@ -44,6 +52,17 @@ ) def test_parallelize_plan_keys(model_name, parallelize_func, sequence_parallel): """Tests that the keys in the parallelization plans are valid by mocking parallel styles.""" + # List of known gated models that require HF token + gated_models = [ + "google/gemma-3-1b-it", + "google/gemma-3-4b-it", + "meta-llama/Llama-3.2-1B-Instruct", + ] + + # Skip if testing a gated model and no HF token available + if model_name in gated_models and not has_hf_token() and not os.getenv("CI"): + pytest.skip(f"HuggingFace token not available for gated model: {model_name}") + model = AutoModelForCausalLM.from_pretrained(model_name) parallel_plan = parallelize_func(model, sequence_parallel=sequence_parallel) From ac0b41579f108403df9fd9668cf4c18d4fce3d58 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 03:06:17 +0000 Subject: [PATCH 10/22] try cleaning up Signed-off-by: Terry Kong --- tests/unit/algorithms/test_utils.py | 30 ---------------- tests/unit/conftest.py | 36 ++++++++----------- .../hf_datasets/test_oai_format_dataset.py | 13 ------- .../data/hf_datasets/test_prompt_response.py | 13 ------- tests/unit/data/test_llm_message_utils.py | 26 -------------- tests/unit/models/dtensor/test_parallelize.py | 18 ---------- 6 files changed, 15 insertions(+), 121 deletions(-) diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py index d1940e8374..8bd45c2349 100755 --- a/tests/unit/algorithms/test_utils.py +++ b/tests/unit/algorithms/test_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from datetime import datetime import pytest @@ -21,11 +20,6 @@ from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - @pytest.fixture def conversation_messages(): """Fixture providing a multi-turn conversation for testing chat templates""" @@ -86,12 +80,6 @@ def get_format_with_simple_role_header(messages): @pytest.mark.needs_hf_token def test_get_tokenizer_no_chat_template(conversation_messages): """Test get_tokenizer when no chat template is specified in config""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} tokenizer = get_tokenizer(config) @@ -105,12 +93,6 @@ def test_get_tokenizer_no_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_default_chat_template(conversation_messages): """Test get_tokenizer when chat_template is 'default' in config""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"} tokenizer = get_tokenizer(config) @@ -123,12 +105,6 @@ def test_get_tokenizer_default_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_null_chat_template(conversation_messages): """Test get_tokenizer when chat_template is None in config""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None} tokenizer = get_tokenizer(config) @@ -143,12 +119,6 @@ def test_get_tokenizer_null_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_custom_jinja_template(conversation_messages): """Test get_tokenizer when a custom jinja template is specified""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - custom_template = COMMON_CHAT_TEMPLATES.simple_role_header config = { "name": "meta-llama/Llama-3.2-1B-Instruct", diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 79c9cf0def..830e04fb97 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -31,9 +31,21 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None +def pytest_runtest_setup(item): + """Automatically skip tests that require HF tokens unless explicitly requested.""" + needs_hf_token = item.get_closest_marker("needs_hf_token") + if needs_hf_token: + # Check if the test should run based on pytest marker selection + # If -m needs_hf_token is specified, run the test + # Otherwise, skip it by default + config = item.session.config + + # Check if needs_hf_token marker is explicitly selected + marker_expr = config.getoption("-m", default="") + if "needs_hf_token" not in marker_expr: + pytest.skip( + "Skipping test that requires HF token (use 
-m needs_hf_token to run)" + ) TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") @@ -389,12 +401,6 @@ def tiny_llama_model_path(): from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" - ) - model_path = os.path.join(TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer) @@ -423,12 +429,6 @@ def tiny_llama_tied_model_path(): from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" - ) - model_path = os.path.join( TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer" ) @@ -515,12 +515,6 @@ def tiny_gemma3_model_path(): from transformers import AutoTokenizer, Gemma3ForCausalLM, Gemma3TextConfig - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: google/gemma-3-1b-it" - ) - model_path = os.path.join(TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=262144 so we can re-use gemma-3-1b tokenizer diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py index 9b53ea03a2..6438c52fe2 100644 --- a/tests/unit/data/hf_datasets/test_oai_format_dataset.py +++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py @@ -13,18 +13,11 @@ # limitations under the License. import json -import os import tempfile import pytest from transformers import AutoTokenizer - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.oai_format_dataset import ( OpenAIFormatDataset, @@ -97,12 +90,6 @@ def test_custom_keys(sample_data): @pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True) def test_message_formatting(sample_data): - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" - ) - train_path, val_path = sample_data dataset = OpenAIFormatDataset( train_path, val_path, chat_key="messages", system_key="system_key" diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py index 1742dd3c86..d6b908a1a5 100644 --- a/tests/unit/data/hf_datasets/test_prompt_response.py +++ b/tests/unit/data/hf_datasets/test_prompt_response.py @@ -13,18 +13,11 @@ # limitations under the License. 
import json -import os import tempfile import pytest from transformers import AutoTokenizer - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.prompt_response_dataset import ( PromptResponseDataset, @@ -86,12 +79,6 @@ def test_custom_keys(sample_data): @pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True) def test_message_formatting(sample_data): - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" - ) - train_path, val_path = sample_data dataset = PromptResponseDataset( train_path, val_path, input_key="question", output_key="answer" diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 7ce59a5986..8b329e8072 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -13,18 +13,10 @@ # limitations under the License. -import os - import pytest import torch from transformers import AutoTokenizer - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, TaskDataSpec from nemo_rl.data.llm_message_utils import ( @@ -340,12 +332,6 @@ def test_batch_pad_message_log_custom_pad_value( def test_get_formatted_message_log_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" - ) - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -391,12 +377,6 @@ def test_get_formatted_message_log_llama( def test_get_formatted_message_log_add_generation_prompt_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" - ) - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -523,12 +503,6 @@ def test_get_formatted_message_log_add_generation_prompt_qwen( @pytest.mark.needs_hf_token def test_formatted_message_log_empty_message(): - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" - ) - message_logs = [ [ {"role": "system", "content": "You are a helpful assistant."}, diff --git a/tests/unit/models/dtensor/test_parallelize.py b/tests/unit/models/dtensor/test_parallelize.py index 0c3de464f6..013c5256ab 100644 --- a/tests/unit/models/dtensor/test_parallelize.py +++ b/tests/unit/models/dtensor/test_parallelize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from itertools import product from unittest.mock import MagicMock @@ -20,12 +19,6 @@ from torch.distributed.tensor.parallel import ParallelStyle, parallelize_module from transformers import AutoModelForCausalLM - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.models.dtensor.parallelize import ( _parallelize_gemma3, _parallelize_llama, @@ -52,17 +45,6 @@ def has_hf_token(): ) def test_parallelize_plan_keys(model_name, parallelize_func, sequence_parallel): """Tests that the keys in the parallelization plans are valid by mocking parallel styles.""" - # List of known gated models that require HF token - gated_models = [ - "google/gemma-3-1b-it", - "google/gemma-3-4b-it", - "meta-llama/Llama-3.2-1B-Instruct", - ] - - # Skip if testing a gated model and no HF token available - if model_name in gated_models and not has_hf_token() and not os.getenv("CI"): - pytest.skip(f"HuggingFace token not available for gated model: {model_name}") - model = AutoModelForCausalLM.from_pretrained(model_name) parallel_plan = parallelize_func(model, sequence_parallel=sequence_parallel) From 8f5da92537111e3d37b2f250dba5fd42151a8f7e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 04:58:09 +0000 Subject: [PATCH 11/22] try qwen3 Signed-off-by: Terry Kong --- tests/unit/models/generation/test_vllm_generation.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 8a38e5c61e..94280a03c5 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -1620,13 +1620,15 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data): # Enable packing during test os.environ["NEMO_RL_MEGATRON_IPC_TENSOR_PACKING_THRESHOLD"] = "1" - # Both policies must use the same model (Qwen2.5-0.5B) for weight transfer compatibility - model_name = "Qwen/Qwen2.5-0.5B" + # Both policies must use the same model for weight transfer compatibility + # NOTE: We have tried using Qwen/Qwen2.5-0.5B, but some small models exhibit variance depending + # on which hardware it is run on. + model_name = "Qwen/Qwen3-0.6B" tokenizer = get_tokenizer({"name": model_name}) # Create Policy megatron_config = get_basic_megatron_test_config( - tp=1, pp=1, precision="float32" + tp=1, pp=1, precision="bfloat16" ) megatron_config["model_name"] = model_name megatron_config["tokenizer"]["name"] = model_name @@ -1653,8 +1655,8 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data): output_ids = outputs["output_ids"] generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) assert generated_texts == [ - "Hello, my name is John. I am a", - "The capital of France is Paris. It is the", + "Hello, my name is Lina. I'm", + "The capital of France is Paris. 
The capital of",
], "Output should be the same as the expected output"
finally:

From cf0c0477557f71266ae55736e42552a1eb43042a2 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 05:34:02 +0000
Subject: [PATCH 12/22] fix

Signed-off-by: Terry Kong
---
 tests/unit/models/huggingface/test_common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/models/huggingface/test_common.py b/tests/unit/models/huggingface/test_common.py
index faf06fbdb7..d183958bb8 100644
--- a/tests/unit/models/huggingface/test_common.py
+++ b/tests/unit/models/huggingface/test_common.py
@@ -17,6 +17,7 @@
 from nemo_rl.models.huggingface.common import ModelFlag, is_gemma_model
 
 
+@pytest.mark.needs_hf_token
 @pytest.mark.parametrize(
     "model_name",
     [
@@ -42,6 +43,7 @@ def test_gemma_models(model_name):
     assert ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(model_name)
 
 
+@pytest.mark.needs_hf_token
 @pytest.mark.parametrize(
     "model_name",
     [

From 029c0a8e2e78d8c9572db541ede60f833f77a7e0 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 05:49:13 +0000
Subject: [PATCH 13/22] one more

Signed-off-by: Terry Kong
---
 tests/unit/data/hf_datasets/test_squad.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/data/hf_datasets/test_squad.py b/tests/unit/data/hf_datasets/test_squad.py
index 5e736ee8ac..8c7705f558 100644
--- a/tests/unit/data/hf_datasets/test_squad.py
+++ b/tests/unit/data/hf_datasets/test_squad.py
@@ -17,6 +17,7 @@
 from nemo_rl.data.hf_datasets.squad import SquadDataset
 
 
+@pytest.mark.needs_hf_token
 @pytest.mark.skip(reason="dataset download is flaky")
 def test_squad_dataset():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

From 370766c760353d4a0f2822d9f1aee31791e8ac0c Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 06:11:59 +0000
Subject: [PATCH 14/22] try fixing the mcore marker

Signed-off-by: Terry Kong
---
 tests/unit/conftest.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 830e04fb97..2b1494952b 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -32,20 +32,23 @@
 def pytest_runtest_setup(item):
-    """Automatically skip tests that require HF tokens unless explicitly requested."""
+    """Automatically skip tests that require special dependencies unless explicitly requested."""
+    config = item.session.config
+    marker_expr = config.getoption("-m", default="")
+
+    # Check for needs_hf_token marker
     needs_hf_token = item.get_closest_marker("needs_hf_token")
-    if needs_hf_token:
-        # Check if the test should run based on pytest marker selection
-        # If -m needs_hf_token is specified, run the test
-        # Otherwise, skip it by default
-        config = item.session.config
-
-        # Check if needs_hf_token marker is explicitly selected
-        marker_expr = config.getoption("-m", default="")
-        if "needs_hf_token" not in marker_expr:
-            pytest.skip(
-                "Skipping test that requires HF token (use -m needs_hf_token to run)"
-            )
+    if needs_hf_token and "needs_hf_token" not in marker_expr:
+        pytest.skip(
+            "Skipping test that requires HF token (use -m needs_hf_token to run)"
+        )
+
+    # Check for mcore marker
+    mcore = item.get_closest_marker("mcore")
+    if mcore and "mcore" not in marker_expr:
+        pytest.skip(
+            "Skipping test that requires mcore dependencies (use -m mcore to run)"
+        )
 
 
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")

From 81bb823406ab81d8e9d4a4cf69b87e4b0c7bd71b Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 06:17:48 +0000
Subject: [PATCH 15/22] change up the ci markers

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2c299a8998..c8f803b2cf 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -221,7 +221,7 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore and needs_hf_token\"
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"needs_hf_token\"
           uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and needs_hf_token\"
         else
           echo Skipping unit tests for docs-only level

From 707c44fb991af8ff814035843ef4f545233347b3 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 06:47:00 +0000
Subject: [PATCH 16/22] hf_gate

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml                     |  4 ++--
 pyproject.toml                                      |  2 +-
 tests/unit/algorithms/test_utils.py                 |  8 ++++----
 tests/unit/conftest.py                              | 10 ++++------
 .../data/hf_datasets/test_oai_format_dataset.py     |  2 +-
 .../unit/data/hf_datasets/test_prompt_response.py   |  2 +-
 tests/unit/data/hf_datasets/test_squad.py           |  2 +-
 tests/unit/data/test_llm_message_utils.py           |  6 +++---
 tests/unit/models/dtensor/test_parallelize.py       |  2 +-
 .../models/generation/test_vllm_large_model.py      |  2 +-
 tests/unit/models/huggingface/test_common.py        |  4 ++--
 tests/unit/models/policy/test_dtensor_worker.py     |  9 +++++----
 tests/unit/models/policy/test_megatron_worker.py    | 14 +++++++-------
 13 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index c8f803b2cf..67f1bc8500 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -221,8 +221,8 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"needs_hf_token\"
-          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and needs_hf_token\"
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"hf_gated\"
+          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and hf_gated\"
         else
           echo Skipping unit tests for docs-only level
diff --git a/pyproject.toml b/pyproject.toml
index c985cd1351..643f1cdb1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -174,7 +174,7 @@ testpaths = ["tests"]
 python_files = "test_*.py"
 markers = [
     "mcore: marks tests that require the mcore extra",
-    "needs_hf_token: marks tests that require HuggingFace token access for gated models",
+    "hf_gated: marks tests that require HuggingFace token access for gated models",
 ]
 
 [tool.coverage.run]
diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py
index 8bd45c2349..2dd00a5eeb 100755
--- a/tests/unit/algorithms/test_utils.py
+++ b/tests/unit/algorithms/test_utils.py
@@ -77,7 +77,7 @@ def get_format_with_simple_role_header(messages):
     return message
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_no_chat_template(conversation_messages):
     """Test get_tokenizer when no chat template is specified in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct"}
@@ -90,7 +90,7 @@ def test_get_tokenizer_no_chat_template(conversation_messages):
     assert formatted == expected
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_default_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is 'default' in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"}
@@ -102,7 +102,7 @@ def test_get_tokenizer_default_chat_template(conversation_messages):
     assert formatted == expected
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_null_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is None in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None}
@@ -116,7 +116,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages):
     assert formatted == expected
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_custom_jinja_template(conversation_messages):
     """Test get_tokenizer when a custom jinja template is specified"""
     custom_template = COMMON_CHAT_TEMPLATES.simple_role_header
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 2b1494952b..e3a809709a 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -36,12 +36,10 @@ def pytest_runtest_setup(item):
     config = item.session.config
     marker_expr = config.getoption("-m", default="")
 
-    # Check for needs_hf_token marker
-    needs_hf_token = item.get_closest_marker("needs_hf_token")
-    if needs_hf_token and "needs_hf_token" not in marker_expr:
-        pytest.skip(
-            "Skipping test that requires HF token (use -m needs_hf_token to run)"
-        )
+    # Check for hf_gated marker
+    hf_gated = item.get_closest_marker("hf_gated")
+    if hf_gated and "hf_gated" not in marker_expr:
+        pytest.skip("Skipping test that requires HF token (use -m hf_gated to run)")
 
     # Check for mcore marker
     mcore = item.get_closest_marker("mcore")
diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py
index 6438c52fe2..ae6b878779 100644
--- a/tests/unit/data/hf_datasets/test_oai_format_dataset.py
+++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py
@@ -87,7 +87,7 @@ def test_custom_keys(sample_data):
     assert dataset.system_prompt == "You are a helpful assistant."
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py
index d6b908a1a5..cbf18977a4 100644
--- a/tests/unit/data/hf_datasets/test_prompt_response.py
+++ b/tests/unit/data/hf_datasets/test_prompt_response.py
@@ -76,7 +76,7 @@ def test_custom_keys(sample_data):
     assert dataset.output_key == "answer"
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
diff --git a/tests/unit/data/hf_datasets/test_squad.py b/tests/unit/data/hf_datasets/test_squad.py
index 8c7705f558..f5e01b250a 100644
--- a/tests/unit/data/hf_datasets/test_squad.py
+++ b/tests/unit/data/hf_datasets/test_squad.py
@@ -17,7 +17,7 @@
 from nemo_rl.data.hf_datasets.squad import SquadDataset
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.skip(reason="dataset download is flaky")
 def test_squad_dataset():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py
index 8b329e8072..91ae2e41b7 100644
--- a/tests/unit/data/test_llm_message_utils.py
+++ b/tests/unit/data/test_llm_message_utils.py
@@ -328,7 +328,7 @@ def test_batch_pad_message_log_custom_pad_value(
     )
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -373,7 +373,7 @@ def test_get_formatted_message_log_llama(
     assert actual_text == expected_text
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_add_generation_prompt_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -501,7 +501,7 @@ def test_get_formatted_message_log_add_generation_prompt_qwen(
     assert actual_text == expected_text
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_formatted_message_log_empty_message():
     message_logs = [
         [
diff --git a/tests/unit/models/dtensor/test_parallelize.py b/tests/unit/models/dtensor/test_parallelize.py
index 013c5256ab..192fa354ac 100644
--- a/tests/unit/models/dtensor/test_parallelize.py
+++ b/tests/unit/models/dtensor/test_parallelize.py
@@ -26,7 +26,7 @@
 )
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name, parallelize_func, sequence_parallel",
     [
diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py
index 03c6822ecb..7b93ef46d1 100644
--- a/tests/unit/models/generation/test_vllm_large_model.py
+++ b/tests/unit/models/generation/test_vllm_large_model.py
@@ -125,7 +125,7 @@ def test_input_data(tokenizer):
 
 # skip this test for now
 @pytest.mark.skip(reason="Skipping large model test until we have resources in CI.")
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tensor_parallel_size", [4, 8])
 @pytest.mark.parametrize("pipeline_parallel_size", [2])
diff --git a/tests/unit/models/huggingface/test_common.py b/tests/unit/models/huggingface/test_common.py
index d183958bb8..95da64b0b4 100644
--- a/tests/unit/models/huggingface/test_common.py
+++ b/tests/unit/models/huggingface/test_common.py
@@ -17,7 +17,7 @@
 from nemo_rl.models.huggingface.common import ModelFlag, is_gemma_model
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [
@@ -43,7 +43,7 @@ def test_gemma_models(model_name):
     assert ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(model_name)
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [
diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py
index a8e5bc59e4..33a91c37eb 100644
--- a/tests/unit/models/policy/test_dtensor_worker.py
+++ b/tests/unit/models/policy/test_dtensor_worker.py
@@ -159,6 +159,7 @@ def policy_setup(two_gpu_virtual_cluster, tiny_llama_model_path):
     policy.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(180)
 def test_lm_policy_init(policy_setup):
     policy = policy_setup
@@ -306,7 +307,7 @@ def training_setup(request, two_gpu_virtual_cluster):
     policy.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(60)
 @pytest.mark.parametrize(
     "training_setup",
@@ -465,7 +466,7 @@ def logprob_setup(request, two_gpu_virtual_cluster):
     policy.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(360)
 @pytest.mark.parametrize(
     "logprob_setup",
@@ -508,7 +509,7 @@ def test_dtensor_worker_logprob_tp2_or_cp2_matches_unsharded(logprob_setup):
     )
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(
     two_gpu_virtual_cluster, tiny_llama_tied_model_path
 ):
@@ -554,7 +555,7 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(
     policy.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(180)
 def test_dtensor_loss_independent_of_microbatch_size_two_gpus(
     two_gpu_virtual_cluster, tiny_llama_model_path
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 76753e60d0..30c4e52d13 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -319,7 +319,7 @@ def training_setup(request):
     cluster.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 @pytest.mark.parametrize(
     "training_setup",
@@ -622,7 +622,7 @@ def logprob_setup(request):
 
 
 @pytest.mark.timeout(180)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "logprob_setup",
     [
@@ -663,7 +663,7 @@ def test_megatron_policy_logprobs(logprob_setup):
 
 
 @pytest.mark.timeout(240)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_loss_independent_of_microbatch_size(tiny_llama_model_path):
     """Test that changing microbatch size while keeping global batch size constant does not affect loss values."""
     num_gpus = 2
@@ -782,7 +782,7 @@ def test_megatron_loss_independent_of_microbatch_size(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_reference_policy_functionality(tiny_llama_model_path):
     """Test Megatron reference policy functionality."""
     num_gpus = 2
@@ -903,7 +903,7 @@ def test_megatron_reference_policy_functionality(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(400)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "num_gpus,tp,pp",
     [
@@ -1158,7 +1158,7 @@ def test_megatron_checkpoint_save_kill_and_restore(
 
 
 @pytest.mark.timeout(300)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_dpo_training(tiny_llama_model_path):
     """Test DPO training with Megatron backend."""
     num_gpus = 2
@@ -1255,7 +1255,7 @@ def test_megatron_dpo_training(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_sft_training(tiny_llama_model_path):
     """Test SFT training with Megatron backend."""
     num_gpus = 2

From 4bffe0ab70bf6f13554e10d5a6c2dd750d590578 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:01:56 +0000
Subject: [PATCH 17/22] notes

Signed-off-by: Terry Kong
---
 docs/testing.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/testing.md b/docs/testing.md
index 4d44b141fb..3b2e5208f7 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -9,8 +9,14 @@ Unit tests require 2 GPUs to test the full suite.
 :::
 
 ```sh
-# Run the unit tests using local GPUs
+# Run the unit tests using local GPUs (by default tests requiring gated HF repos are skipped)
 uv run --group test bash tests/run_unit.sh
+
+# Run the unit tests and include tests requiring HF gated repos
+uv run --group test bash tests/run_unit.sh -m "hf_gated"
+
+# Run the mcore and HF dated repo tests
+uv run --extra mcore --group test bash tests/run_unit.sh -m "hf_gated and mcore"
 ```
 
 :::{note}

From 9ecf0aadbac73a245fc4bcbce7e3e8a9cc3e6a97 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:02:25 +0000
Subject: [PATCH 18/22] notes

Signed-off-by: Terry Kong
---
 docs/testing.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/docs/testing.md b/docs/testing.md
index 3b2e5208f7..f2d90928ff 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -19,12 +19,6 @@ uv run --group test bash tests/run_unit.sh -m "hf_gated"
 uv run --extra mcore --group test bash tests/run_unit.sh -m "hf_gated and mcore"
 ```
 
-:::{note}
-Tests can also be run on Slurm with `ray.sub`, but note that some tests will be skipped
-due to no GPUs being located on the head node. To run the full suite of tests, please
-launch on a regular GPU allocation.
-:::
-
 ### Run Unit Tests in a Hermetic Environment
 
 For environments lacking necessary dependencies (e.g., `gcc`, `nvcc`)

From 27078a6ce33fe9d49bd7e4aa1390668db4c049ff Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:30:18 +0000
Subject: [PATCH 19/22] fix

Signed-off-by: Terry Kong
---
 .../models/policy/test_megatron_worker.py | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 30c4e52d13..75b735ac0d 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -151,7 +151,7 @@ def gc_collect():
 
 
 @pytest.fixture
-def policy_setup(request):
+def policy_setup(request, tiny_llama_model_path):
     """Setup and teardown for policy tests - creates a virtual cluster and policy."""
     # Get parameters from request
     if hasattr(request, "param") and request.param is not None:
@@ -176,7 +176,7 @@ def policy_setup(request, tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config = create_megatron_test_config(tp=tp, pp=pp)
+    config = create_megatron_test_config(tiny_llama_model_path, tp=tp, pp=pp)
     tokenizer = get_tokenizer(config["tokenizer"])
     config["generation"] = configure_generation_config(
         config["generation"], tokenizer
@@ -389,7 +389,7 @@ def verify_loss_tensor(loss_tensor):
 
 
 @pytest.fixture
-def generation_setup(request):
+def generation_setup(request, tiny_llama_model_path):
     """Setup and teardown specifically for generation tests."""
     # Parse parameters: (num_gpus, tp, pp, generation_backend)
     if hasattr(request, "param") and request.param is not None:
@@ -418,6 +418,7 @@ def generation_setup(request, tiny_llama_model_path):
     )
 
     config = create_megatron_test_config(
+        tiny_llama_model_path,
         tp=tp,
         pp=pp,
         generation_backend=generation_backend,
@@ -1338,7 +1338,7 @@ def test_megatron_sft_training(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-def test_megatron_context_parallel_logprob_agreement():
+def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce identical logprobs with sequence packing enabled."""
     num_gpus = 2
     batch_size = 4
@@ -1376,7 +1377,9 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_no_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_no_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
 
     # Ensure context parallel is disabled
     config_no_cp["megatron_cfg"]["context_parallel_size"] = 1
@@ -1456,7 +1459,9 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
 
     # Enable context parallel
     config_cp["megatron_cfg"]["context_parallel_size"] = 2
@@ -1536,7 +1541,7 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-def test_megatron_context_parallel_training_agreement():
+def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce consistent training results with ClippedPG loss and sequence packing."""
     num_gpus = 2
     batch_size = 2
@@ -1594,7 +1599,9 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_no_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_no_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Ensure context parallel is disabled
     config_no_cp["megatron_cfg"]["context_parallel_size"] = 1
     config_no_cp["train_global_batch_size"] = 2
@@ -1656,7 +1663,9 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Enable context parallel
     config_cp["megatron_cfg"]["context_parallel_size"] = 2
     config_cp["train_global_batch_size"] = 2

From cbe6f95c491b21797722341faac8f2d7e85e8617 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:32:45 +0000
Subject: [PATCH 20/22] ifix

Signed-off-by: Terry Kong
---
 tests/unit/models/policy/test_megatron_worker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 75b735ac0d..38607ba59f 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -1338,6 +1338,7 @@ def test_megatron_sft_training(tiny_llama_model_path):
     cluster.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce identical logprobs with sequence packing enabled."""
@@ -1540,6 +1541,7 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
     )
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce consistent training results with ClippedPG loss and sequence packing."""

From 08b1625942f2ee0708fb4b0d9fdeabcbcba17aff Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 23:50:17 +0000
Subject: [PATCH 21/22] fix

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml |  4 +--
 docs/testing.md                 | 15 +++++---
 tests/unit/conftest.py          | 64 ++++++++++++++++++++++++++-------
 3 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 67f1bc8500..42dc8d015b 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -221,8 +221,8 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"hf_gated\"
-          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and hf_gated\"
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl --hf-gated
+          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only
         else
           echo Skipping unit tests for docs-only level
diff --git a/docs/testing.md b/docs/testing.md
index f2d90928ff..8ce97346b9 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -9,14 +9,19 @@ Unit tests require 2 GPUs to test the full suite.
 :::
 
 ```sh
-# Run the unit tests using local GPUs (by default tests requiring gated HF repos are skipped)
+# Run the unit tests using local GPUs
+
+# Configuration 1: Default tests only - excludes both hf_gated and mcore tests
 uv run --group test bash tests/run_unit.sh
 
-# Run the unit tests and include tests requiring HF gated repos
-uv run --group test bash tests/run_unit.sh -m "hf_gated"
+# Configuration 2: Default + HF gated tests, excluding mcore tests
+uv run --group test bash tests/run_unit.sh --hf-gated
+
+# Configuration 3: ONLY mcore tests, excluding ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only
 
-# Run the mcore and HF dated repo tests
-uv run --extra mcore --group test bash tests/run_unit.sh -m "hf_gated and mcore"
+# Configuration 4: ONLY mcore tests, including ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only --hf-gated
 ```
 
 ### Run Unit Tests in a Hermetic Environment
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index e3a809709a..9e14889b45 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -31,22 +31,60 @@
 dir_path = os.path.dirname(os.path.abspath(__file__))
 
 
-def pytest_runtest_setup(item):
-    """Automatically skip tests that require special dependencies unless explicitly requested."""
-    config = item.session.config
+def pytest_addoption(parser):
+    """Add custom command line options for controlling test execution."""
+    parser.addoption(
+        "--hf-gated",
+        action="store_true",
+        default=False,
+        help="Include tests that require HuggingFace token access",
+    )
+    parser.addoption(
+        "--mcore-only",
+        action="store_true",
+        default=False,
+        help="Run ONLY mcore tests (combine with --hf-gated to include mcore+hf_gated tests)",
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    """Modify test collection to skip tests based on markers unless explicitly requested."""
+    run_hf_gated = config.getoption("--hf-gated")
+    run_mcore_only = config.getoption("--mcore-only")
     marker_expr = config.getoption("-m", default="")
 
-    # Check for hf_gated marker
-    hf_gated = item.get_closest_marker("hf_gated")
-    if hf_gated and "hf_gated" not in marker_expr:
-        pytest.skip("Skipping test that requires HF token (use -m hf_gated to run)")
+    # If user specified -m marker expressions, let pytest handle everything normally
+    if marker_expr:
+        return
 
-    # Check for mcore marker
-    mcore = item.get_closest_marker("mcore")
-    if mcore and "mcore" not in marker_expr:
-        pytest.skip(
-            "Skipping test that requires mcore dependencies (use -m mcore to run)"
-        )
+    # Filter tests based on the desired configurations
+    new_items = []
+
+    if run_mcore_only and run_hf_gated:
+        # Configuration 4: Only mcore tests, including ones with hf_gated
+        new_items = [item for item in items if item.get_closest_marker("mcore")]
+    elif run_mcore_only:
+        # Configuration 3: Only mcore tests, excluding ones with hf_gated
+        new_items = [
+            item
+            for item in items
+            if item.get_closest_marker("mcore")
+            and not item.get_closest_marker("hf_gated")
+        ]
+    elif run_hf_gated:
+        # Configuration 2: Default tests + hf_gated tests, excluding mcore
+        new_items = [item for item in items if not item.get_closest_marker("mcore")]
+    else:
+        # Configuration 1: Default only - exclude both hf_gated and mcore
+        new_items = [
+            item
+            for item in items
+            if not item.get_closest_marker("hf_gated")
+            and not item.get_closest_marker("mcore")
+        ]
+
+    # Update the items list in-place
+    items[:] = new_items
 
 
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")

From e3b3e2828fad00c8445545ed57e8b08a8dd08a67 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 23:53:23 +0000
Subject: [PATCH 22/22] move mkdir out of module scope

Signed-off-by: Terry Kong
---
 tests/unit/conftest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 9e14889b45..1346a1173d 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -88,7 +88,6 @@ def pytest_collection_modifyitems(config, items):
 
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")
-os.makedirs(TEST_ASSETS_DIR, exist_ok=True)
 UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
 UNIT_RESULTS_FILE_DATED = os.path.join(
     dir_path, f"unit_results/{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
 )
@@ -237,6 +236,10 @@ def log_max_mem(self, metric_name: str):
         session_data["metrics"][qualified_name]["_elapsed"] = end_time - start_time
 
 
+def pytest_sessionstart(session):
+    os.makedirs(TEST_ASSETS_DIR, exist_ok=True)
+
+
 def pytest_sessionfinish(session, exitstatus):
     if not hasattr(session.config, "_unit_test_data"):
         return