4 changes: 2 additions & 2 deletions .github/workflows/cicd-main.yml
@@ -221,8 +221,8 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore\"
-          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m mcore
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl --hf-gated
+          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only
         else
           echo Skipping unit tests for docs-only level
         fi
17 changes: 11 additions & 6 deletions docs/testing.md
@@ -10,14 +10,19 @@ Unit tests require 2 GPUs to test the full suite.
 
 ```sh
 # Run the unit tests using local GPUs
+
+# Configuration 1: Default tests only - excludes both hf_gated and mcore tests
 uv run --group test bash tests/run_unit.sh
-```
 
-:::{note}
-Tests can also be run on Slurm with `ray.sub`, but note that some tests will be skipped
-due to no GPUs being located on the head node. To run the full suite of tests, please
-launch on a regular GPU allocation.
-:::
+# Configuration 2: Default + HF gated tests, excluding mcore tests
+uv run --group test bash tests/run_unit.sh --hf-gated
+
+# Configuration 3: ONLY mcore tests, excluding ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only
+
+# Configuration 4: ONLY mcore tests, including ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only --hf-gated
+```
 
 ### Run Unit Tests in a Hermetic Environment
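For orientation, the four configurations these flags select reduce to a single marker predicate. The sketch below restates the collection filter this PR adds in `tests/unit/conftest.py` as a standalone function; the name `is_selected` and the `markers` set are illustrative only, not part of the diff:

```python
# Standalone restatement of the collection filter added in tests/unit/conftest.py.
# `markers` is the set of marker names on a test item; the flags mirror the
# --mcore-only / --hf-gated CLI options.
def is_selected(markers: set[str], mcore_only: bool = False, hf_gated: bool = False) -> bool:
    if mcore_only and hf_gated:
        return "mcore" in markers  # Configuration 4
    if mcore_only:
        return "mcore" in markers and "hf_gated" not in markers  # Configuration 3
    if hf_gated:
        return "mcore" not in markers  # Configuration 2
    return "mcore" not in markers and "hf_gated" not in markers  # Configuration 1
```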
1 change: 1 addition & 0 deletions pyproject.toml
@@ -174,6 +174,7 @@ testpaths = ["tests"]
 python_files = "test_*.py"
 markers = [
     "mcore: marks tests that require the mcore extra",
+    "hf_gated: marks tests that require HuggingFace token access for gated models",
 ]
 
 [tool.coverage.run]
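Registering the marker here keeps `--strict-markers` runs clean and makes it usable with pytest's standard `-m` selection, which (per the conftest.py change below) bypasses the custom filtering entirely. A minimal usage sketch; the test name and gated-checkpoint access are assumptions, not part of the diff:

```python
import pytest


@pytest.mark.hf_gated  # assumes HF_TOKEN grants access to the gated repo
def test_gated_tokenizer_loads():
    from transformers import AutoTokenizer

    # meta-llama checkpoints are gated, so this only passes with token access
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    assert tokenizer is not None
```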
4 changes: 4 additions & 0 deletions tests/unit/algorithms/test_utils.py
@@ -77,6 +77,7 @@ def get_format_with_simple_role_header(messages):
     return message
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_no_chat_template(conversation_messages):
     """Test get_tokenizer when no chat template is specified in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct"}
@@ -89,6 +90,7 @@ def test_get_tokenizer_no_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_default_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is 'default' in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"}
@@ -100,6 +102,7 @@ def test_get_tokenizer_default_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_null_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is None in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None}
@@ -113,6 +116,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_custom_jinja_template(conversation_messages):
     """Test get_tokenizer when a custom jinja template is specified"""
     custom_template = COMMON_CHAT_TEMPLATES.simple_role_header
102 changes: 73 additions & 29 deletions tests/unit/conftest.py
@@ -30,32 +30,70 @@
 
 dir_path = os.path.dirname(os.path.abspath(__file__))
 
+
+def pytest_addoption(parser):
+    """Add custom command line options for controlling test execution."""
+    parser.addoption(
+        "--hf-gated",
+        action="store_true",
+        default=False,
+        help="Include tests that require HuggingFace token access",
+    )
+    parser.addoption(
+        "--mcore-only",
+        action="store_true",
+        default=False,
+        help="Run ONLY mcore tests (combine with --hf-gated to include mcore+hf_gated tests)",
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    """Modify test collection to skip tests based on markers unless explicitly requested."""
+    run_hf_gated = config.getoption("--hf-gated")
+    run_mcore_only = config.getoption("--mcore-only")
+    marker_expr = config.getoption("-m", default="")
+
+    # If user specified -m marker expressions, let pytest handle everything normally
+    if marker_expr:
+        return
+
+    # Filter tests based on the desired configurations
+    new_items = []
+
+    if run_mcore_only and run_hf_gated:
+        # Configuration 4: Only mcore tests, including ones with hf_gated
+        new_items = [item for item in items if item.get_closest_marker("mcore")]
+    elif run_mcore_only:
+        # Configuration 3: Only mcore tests, excluding ones with hf_gated
+        new_items = [
+            item
+            for item in items
+            if item.get_closest_marker("mcore")
+            and not item.get_closest_marker("hf_gated")
+        ]
+    elif run_hf_gated:
+        # Configuration 2: Default tests + hf_gated tests, excluding mcore
+        new_items = [item for item in items if not item.get_closest_marker("mcore")]
+    else:
+        # Configuration 1: Default only - exclude both hf_gated and mcore
+        new_items = [
+            item
+            for item in items
+            if not item.get_closest_marker("hf_gated")
+            and not item.get_closest_marker("mcore")
+        ]
+
+    # Update the items list in-place
+    items[:] = new_items
+
+
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")
 UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
 UNIT_RESULTS_FILE_DATED = os.path.join(
     dir_path, f"unit_results/{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
 )
 
-
-# Mapping between asset and absolute path (each are populated from a session level fixture)
-class TEST_ASSETS:
-    TINY_LLAMA_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer"
-    )
-    TINY_LLAMA_TIED_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer"
-    )
-    TINY_QWEN2_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer"
-    )
-    TINY_QWEN3_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer"
-    )
-    TINY_GEMMA3_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer"
-    )
-
 
 class UnitTestData(TypedDict):
     exit_status: int | str
     git_commit: str
@@ -198,6 +236,10 @@ def log_max_mem(self, metric_name: str):
     session_data["metrics"][qualified_name]["_elapsed"] = end_time - start_time
 
 
+def pytest_sessionstart(session):
+    os.makedirs(TEST_ASSETS_DIR, exist_ok=True)
+
+
 def pytest_sessionfinish(session, exitstatus):
     if not hasattr(session.config, "_unit_test_data"):
         return
@@ -394,14 +436,14 @@ def mock_2gpu_distributed_env():
 #######################
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_llama_model_path():
     """Fixture that returns a path to a tiny llama model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
 
-    model_path = TEST_ASSETS.TINY_LLAMA_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer)
     config = LlamaConfig(
@@ -422,14 +464,16 @@ def tiny_llama_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_llama_tied_model_path():
     """Fixture that returns a path to a tiny llama model with tied embeddings and a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
 
-    model_path = TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH
+    model_path = os.path.join(
+        TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer"
+    )
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer)
     config = LlamaConfig(
@@ -450,14 +494,14 @@ def tiny_llama_tied_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_qwen2_model_path():
     """Fixture that returns a path to a tiny qwen2 model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Qwen2Config, Qwen2ForCausalLM
 
-    model_path = TEST_ASSETS.TINY_QWEN2_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer)
     config = Qwen2Config(
@@ -478,14 +522,14 @@ def tiny_qwen2_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_qwen3_model_path():
     """Fixture that returns a path to a tiny qwen3 model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM
 
-    model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer)
     config = Qwen3Config(
@@ -506,14 +550,14 @@ def tiny_qwen3_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_gemma3_model_path():
     """Fixture that returns a path to a tiny gemma3 model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Gemma3ForCausalLM, Gemma3TextConfig
 
-    model_path = TEST_ASSETS.TINY_GEMMA3_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=262144 so we can re-use gemma-3-1b tokenizer
     config = Gemma3TextConfig(
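With `autouse=True` dropped, the tiny-model checkpoints are built only when a test requests the corresponding fixture, while the new `pytest_sessionstart` hook guarantees the assets directory exists up front. A hypothetical consumer, assuming the fixture saves a tokenizer alongside the model as its asset name suggests:

```python
# Hypothetical test: with autouse removed, the session-scoped fixture runs
# (and builds the checkpoint under tests/unit/test_assets/) only on request.
from transformers import AutoTokenizer


def test_tiny_llama_assets(tiny_llama_model_path):
    # Later tests in the same session reuse the already-built path.
    tokenizer = AutoTokenizer.from_pretrained(tiny_llama_model_path)
    assert tokenizer.vocab_size > 0
```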
1 change: 1 addition & 0 deletions tests/unit/data/hf_datasets/test_oai_format_dataset.py
@@ -87,6 +87,7 @@ def test_custom_keys(sample_data):
     assert dataset.system_prompt == "You are a helpful assistant."
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
1 change: 1 addition & 0 deletions tests/unit/data/hf_datasets/test_prompt_response.py
@@ -76,6 +76,7 @@ def test_custom_keys(sample_data):
     assert dataset.output_key == "answer"
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
1 change: 1 addition & 0 deletions tests/unit/data/hf_datasets/test_squad.py
@@ -17,6 +17,7 @@
 from nemo_rl.data.hf_datasets.squad import SquadDataset
 
 
+@pytest.mark.hf_gated
 @pytest.mark.skip(reason="dataset download is flaky")
 def test_squad_dataset():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
3 changes: 3 additions & 0 deletions tests/unit/data/test_llm_message_utils.py
@@ -328,6 +328,7 @@ def test_batch_pad_message_log_custom_pad_value(
     )
 
 
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -372,6 +373,7 @@ def test_get_formatted_message_log_llama(
     assert actual_text == expected_text
 
 
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_add_generation_prompt_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -499,6 +501,7 @@ def test_get_formatted_message_log_add_generation_prompt_qwen(
     assert actual_text == expected_text
 
 
+@pytest.mark.hf_gated
 def test_formatted_message_log_empty_message():
     message_logs = [
         [
1 change: 1 addition & 0 deletions tests/unit/models/dtensor/test_parallelize.py
@@ -26,6 +26,7 @@
 )
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name, parallelize_func, sequence_parallel",
     [
12 changes: 7 additions & 5 deletions tests/unit/models/generation/test_vllm_generation.py
@@ -1620,13 +1620,15 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data):
         # Enable packing during test
         os.environ["NEMO_RL_MEGATRON_IPC_TENSOR_PACKING_THRESHOLD"] = "1"
 
-        # Both policies must use the same model (Qwen2.5-0.5B) for weight transfer compatibility
-        model_name = "Qwen/Qwen2.5-0.5B"
+        # Both policies must use the same model for weight transfer compatibility
+        # NOTE: We have tried using Qwen/Qwen2.5-0.5B, but some small models exhibit variance
+        # depending on which hardware they are run on.
+        model_name = "Qwen/Qwen3-0.6B"
         tokenizer = get_tokenizer({"name": model_name})
 
         # Create Policy
         megatron_config = get_basic_megatron_test_config(
-            tp=1, pp=1, precision="float32"
+            tp=1, pp=1, precision="bfloat16"
         )
         megatron_config["model_name"] = model_name
         megatron_config["tokenizer"]["name"] = model_name
@@ -1653,8 +1655,8 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data):
         output_ids = outputs["output_ids"]
         generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
         assert generated_texts == [
-            "Hello, my name is John. I am a",
-            "The capital of France is Paris. It is the",
+            "Hello, my name is Lina. I'm",
+            "The capital of France is Paris. The capital of",
         ], "Output should be the same as the expected output"
 
     finally:
1 change: 1 addition & 0 deletions tests/unit/models/generation/test_vllm_large_model.py
@@ -125,6 +125,7 @@ def test_input_data(tokenizer):
 
 # skip this test for now
 @pytest.mark.skip(reason="Skipping large model test until we have resources in CI.")
+@pytest.mark.hf_gated
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tensor_parallel_size", [4, 8])
 @pytest.mark.parametrize("pipeline_parallel_size", [2])
2 changes: 2 additions & 0 deletions tests/unit/models/huggingface/test_common.py
@@ -17,6 +17,7 @@
 from nemo_rl.models.huggingface.common import ModelFlag, is_gemma_model
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [
@@ -42,6 +43,7 @@ def test_gemma_models(model_name):
     assert ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(model_name)
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [