From e5b31ca668f2aba57b16dad477e576287682a455 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 24 Jul 2025 22:20:39 +0000 Subject: [PATCH 01/22] test: filter out tests that needs_hf_token by default Signed-off-by: Terry Kong --- .github/workflows/cicd-main.yml | 4 ++-- pyproject.toml | 1 + tests/unit/models/generation/test_vllm_large_model.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 3b10108e6a..2c299a8998 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -221,8 +221,8 @@ jobs: UNIT_TEST_SCRIPT: | cd /opt/nemo-rl if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then - uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore\" - uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m mcore + uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore and needs_hf_token\" + uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and needs_hf_token\" else echo Skipping unit tests for docs-only level fi diff --git a/pyproject.toml b/pyproject.toml index cddda79abe..c985cd1351 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -174,6 +174,7 @@ testpaths = ["tests"] python_files = "test_*.py" markers = [ "mcore: marks tests that require the mcore extra", + "needs_hf_token: marks tests that require HuggingFace token access for gated models", ] [tool.coverage.run] diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py index d24a0c0f31..03c6822ecb 100644 --- a/tests/unit/models/generation/test_vllm_large_model.py +++ b/tests/unit/models/generation/test_vllm_large_model.py @@ -125,6 +125,7 @@ def test_input_data(tokenizer): # skip this test for now @pytest.mark.skip(reason="Skipping large model test until we have resources in CI.") +@pytest.mark.needs_hf_token @pytest.mark.asyncio @pytest.mark.parametrize("tensor_parallel_size", [4, 8]) @pytest.mark.parametrize("pipeline_parallel_size", [2]) From 13b71aa92ae26e9632863ace99ad0b2bba998b9b Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 01:30:04 +0000 Subject: [PATCH 02/22] wip Signed-off-by: Terry Kong --- tests/unit/conftest.py | 65 +++++---- .../unit/models/policy/test_dtensor_worker.py | 125 +++++++++++------- .../models/policy/test_megatron_worker.py | 80 ++++++----- 3 files changed, 157 insertions(+), 113 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3197fa2d57..f17be7611d 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -30,6 +30,12 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json") UNIT_RESULTS_FILE_DATED = os.path.join( @@ -37,25 +43,6 @@ ) -# Mapping between asset and absolute path (each are populated from a session level fixture) -class TEST_ASSETS: - TINY_LLAMA_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer" - ) - TINY_LLAMA_TIED_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer" - ) - TINY_QWEN2_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, 
"tiny_qwen2_with_qwen2_tokenizer" - ) - TINY_QWEN3_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer" - ) - TINY_GEMMA3_MODEL_PATH = os.path.join( - TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer" - ) - - class UnitTestData(TypedDict): exit_status: int | str git_commit: str @@ -394,14 +381,20 @@ def mock_2gpu_distributed_env(): ####################### -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_llama_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - model_path = TEST_ASSETS.TINY_LLAMA_MODEL_PATH + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" + ) + + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer) config = LlamaConfig( @@ -422,14 +415,22 @@ def tiny_llama_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_llama_tied_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - model_path = TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" + ) + + model_path = os.path.join( + TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer" + ) # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer) config = LlamaConfig( @@ -450,14 +451,14 @@ def tiny_llama_tied_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_qwen2_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, Qwen2Config, Qwen2ForCausalLM - model_path = TEST_ASSETS.TINY_QWEN2_MODEL_PATH + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) config = Qwen2Config( @@ -478,14 +479,14 @@ def tiny_qwen2_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_qwen3_model_path(): """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM - model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) config = Qwen3Config( @@ -506,14 +507,20 @@ def tiny_qwen3_model_path(): yield model_path -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def tiny_gemma3_model_path(): """Fixture that 
returns a path to a tiny llama model with a dummy tokenizer.""" import shutil from transformers import AutoTokenizer, Gemma3ForCausalLM, Gemma3TextConfig - model_path = TEST_ASSETS.TINY_GEMMA3_MODEL_PATH + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: google/gemma-3-1b-it" + ) + + model_path = os.path.join(TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=262144 so we can re-use gemma-3-1b tokenizer config = Gemma3TextConfig( diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index c176082698..a8e5bc59e4 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -31,12 +31,11 @@ from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.lm_policy import Policy -from tests.unit.conftest import TEST_ASSETS from tests.unit.test_utils import SimpleLoss def create_test_config( - model_name: str = TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + model_name: str, tp: int = 1, cp: int = 1, sequence_parallel: bool = False, @@ -145,9 +144,9 @@ def gc_collect(): @pytest.fixture -def policy_setup(two_gpu_virtual_cluster): +def policy_setup(two_gpu_virtual_cluster, tiny_llama_model_path): """Setup and teardown for policy tests - creates a virtual cluster and policy.""" - config = create_test_config() + config = create_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) config["generation"] = configure_generation_config(config["generation"], tokenizer) @@ -240,9 +239,17 @@ def test_lm_policy_init(policy_setup): @pytest.fixture def training_setup(request, two_gpu_virtual_cluster): """Setup and teardown specifically for training tests.""" - model_name, tp, cp, sequence_parallel, cpu_offload, activation_checkpointing = ( - request.param - ) + ( + model_fixture_name, + tp, + cp, + sequence_parallel, + cpu_offload, + activation_checkpointing, + ) = request.param + + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) policy = None data = None loss_fn = None @@ -299,37 +306,38 @@ def training_setup(request, two_gpu_virtual_cluster): policy.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(60) @pytest.mark.parametrize( "training_setup", [ - # model_name tp cp sp cpu act - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, True, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, True, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, False, True), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 
2, False, False, False), + # model_fixture_name tp cp sp cpu act + ("tiny_llama_model_path", 1, 1, False, False, False), + ("tiny_llama_model_path", 1, 1, True, False, False), + ("tiny_llama_model_path", 1, 1, False, True, False), + ("tiny_llama_model_path", 1, 1, False, False, True), + ("tiny_llama_model_path", 1, 2, False, False, False), + ("tiny_qwen2_model_path", 1, 1, True, True, False), + ("tiny_qwen2_model_path", 1, 1, True, False, True), + ("tiny_qwen2_model_path", 1, 1, False, True, True), + ("tiny_qwen2_model_path", 1, 1, True, True, True), + ("tiny_qwen2_model_path", 1, 2, False, False, False), + ("tiny_qwen3_model_path", 1, 1, True, True, False), + ("tiny_qwen3_model_path", 1, 1, True, False, True), + ("tiny_qwen3_model_path", 1, 1, False, True, True), + ("tiny_qwen3_model_path", 1, 1, True, True, True), + ("tiny_qwen3_model_path", 1, 2, False, False, False), ( - TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, + "tiny_gemma3_model_path", 1, 1, True, True, False, ), # gemma3 doesn't support spda - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, True, True, True), + ("tiny_gemma3_model_path", 1, 1, True, False, True), + ("tiny_gemma3_model_path", 1, 1, False, True, True), + ("tiny_gemma3_model_path", 1, 1, True, True, True), # CP doesn't support gemma3 due to spda input has attent_mask != None. ], indirect=True, @@ -372,9 +380,17 @@ def verify_loss_tensor(loss_tensor): @pytest.fixture def logprob_setup(request, two_gpu_virtual_cluster): """Setup and teardown specifically for training tests.""" - model_name, tp, cp, sequence_parallel, cpu_offload, activation_checkpointing = ( - request.param - ) + ( + model_fixture_name, + tp, + cp, + sequence_parallel, + cpu_offload, + activation_checkpointing, + ) = request.param + + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) policy = None data = None @@ -449,28 +465,29 @@ def logprob_setup(request, two_gpu_virtual_cluster): policy.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(360) @pytest.mark.parametrize( "logprob_setup", [ # TP=2, CP=1 - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 2, 1, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, 1, False, False, False), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 2, 1, False, True, False), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 2, 1, False, False, False), + ("tiny_qwen2_model_path", 2, 1, False, True, False), + ("tiny_qwen2_model_path", 2, 1, False, False, False), + ("tiny_llama_model_path", 2, 1, False, False, False), + ("tiny_llama_model_path", 2, 1, False, True, False), + ("tiny_llama_model_path", 2, 1, False, True, True), + ("tiny_qwen3_model_path", 2, 1, False, True, False), + ("tiny_qwen3_model_path", 2, 1, False, False, False), + ("tiny_gemma3_model_path", 2, 1, False, True, False), + ("tiny_gemma3_model_path", 2, 1, False, False, False), # TP=1, CP=2 - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, True, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 
1, 2, False, True, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 2, False, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 2, False, False, False), + ("tiny_qwen2_model_path", 1, 2, False, True, False), + ("tiny_qwen2_model_path", 1, 2, False, False, False), + ("tiny_llama_model_path", 1, 2, False, False, False), + ("tiny_llama_model_path", 1, 2, False, True, False), + ("tiny_llama_model_path", 1, 2, False, True, True), + ("tiny_qwen3_model_path", 1, 2, False, True, False), + ("tiny_qwen3_model_path", 1, 2, False, False, False), ], indirect=True, ) @@ -491,7 +508,10 @@ def test_dtensor_worker_logprob_tp2_or_cp2_matches_unsharded(logprob_setup): ) -def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_cluster): +@pytest.mark.needs_hf_token +def test_dtensor_tp_and_tied_model_with_custom_parallel_plan( + two_gpu_virtual_cluster, tiny_llama_tied_model_path +): """Test that DTensor with a tp > 1 and a tied model with a custom parallel plan works.""" from torch.distributed.tensor.parallel import ColwiseParallel from torch.distributed.tensor.placement_types import Replicate @@ -501,7 +521,7 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_clu "model.embed_tokens": ColwiseParallel(output_layouts=Replicate()), } config = create_test_config( - model_name=TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH, + model_name=tiny_llama_tied_model_path, tp=2, cp=1, sequence_parallel=False, @@ -534,8 +554,11 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_clu policy.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(180) -def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cluster): +def test_dtensor_loss_independent_of_microbatch_size_two_gpus( + two_gpu_virtual_cluster, tiny_llama_model_path +): """Tests that changing microbatch size while keeping global batch size constant does not affect loss values in DTensor.""" # Create test batch with global batch size of 8 global_batch_size = 8 @@ -569,7 +592,7 @@ def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cl ) # Test with mbs=1, 2 microbatches per GPU - config = create_test_config() + config = create_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) print("Creating training Policy with mbs=1...") @@ -605,7 +628,7 @@ def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cl policy_mbs1.worker_group.shutdown() # Test with mbs=2, 1 microbatch per GPU - config = create_test_config() + config = create_test_config(tiny_llama_model_path) config["train_micro_batch_size"] = 2 config["generation"] = configure_generation_config(config["generation"], tokenizer) diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index a399bca0d5..76753e60d0 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -28,12 +28,11 @@ from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.lm_policy import Policy -from tests.unit.conftest import TEST_ASSETS from tests.unit.test_utils import SimpleLoss def create_megatron_test_config( - model_name: str = TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + model_name: str, tp: int = 1, pp: int = 1, precision: str = "float32", @@ -199,18 +198,21 @@ def policy_setup(request): @pytest.fixture 
def training_setup(request): """Setup and teardown specifically for training tests.""" - # Parse parameters: (num_gpus, tp, pp, model_name, config_updates) + # Parse parameters: (num_gpus, tp, pp, model_fixture_name, config_updates) if hasattr(request, "param") and request.param is not None: - num_gpus, tp, pp, model_name, config_updates = request.param + num_gpus, tp, pp, model_fixture_name, config_updates = request.param else: - num_gpus, tp, pp, model_name, config_updates = ( + num_gpus, tp, pp, model_fixture_name, config_updates = ( 2, 1, 1, - TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + "tiny_llama_model_path", {}, ) + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) + policy = None cluster = None data = None @@ -317,24 +319,25 @@ def training_setup(request): cluster.shutdown() +@pytest.mark.needs_hf_token @pytest.mark.timeout(300) @pytest.mark.parametrize( "training_setup", [ - # (num_gpus, tp, pp, model_name, config_updates) - (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {}), - (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {}), - (2, 1, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH, {}), - (2, 2, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH, {}), - (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {"precision": "bfloat16"}), + # (num_gpus, tp, pp, model_fixture_name, config_updates) + (2, 1, 1, "tiny_llama_model_path", {}), + (2, 2, 1, "tiny_llama_model_path", {}), + (2, 1, 1, "tiny_qwen2_model_path", {}), + (2, 2, 1, "tiny_qwen2_model_path", {}), + (2, 1, 1, "tiny_llama_model_path", {"precision": "bfloat16"}), ( 2, 1, 1, - TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + "tiny_llama_model_path", {"activation_checkpointing": True}, ), - (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {"sequence_parallel": True}), + (2, 2, 1, "tiny_llama_model_path", {"sequence_parallel": True}), ], indirect=True, ids=[ @@ -536,11 +539,14 @@ def test_megatron_policy_generation(generation_setup): @pytest.fixture def logprob_setup(request): """Setup and teardown specifically for logprob tests.""" - # Parse parameters: (num_gpus, tp, pp, model_name) + # Parse parameters: (num_gpus, tp, pp, model_fixture_name) if hasattr(request, "param") and request.param is not None: - num_gpus, tp, pp, model_name = request.param + num_gpus, tp, pp, model_fixture_name = request.param else: - num_gpus, tp, pp, model_name = 2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH + num_gpus, tp, pp, model_fixture_name = 2, 1, 1, "tiny_llama_model_path" + + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) policy = None cluster = None @@ -616,14 +622,15 @@ def logprob_setup(request): @pytest.mark.timeout(180) +@pytest.mark.needs_hf_token @pytest.mark.parametrize( "logprob_setup", [ - # (num_gpus, tp, pp, model_name) - (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH), - (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH), - (2, 1, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH), - (2, 2, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH), + # (num_gpus, tp, pp, model_fixture_name) + (2, 1, 1, "tiny_llama_model_path"), + (2, 2, 1, "tiny_llama_model_path"), + (2, 1, 1, "tiny_qwen2_model_path"), + (2, 2, 1, "tiny_qwen2_model_path"), ], indirect=True, ids=["2gpu_dp2_llama", "2gpu_tp2_llama", "2gpu_dp2_qwen2", "2gpu_tp2_qwen2"], @@ -656,7 +663,8 @@ def test_megatron_policy_logprobs(logprob_setup): @pytest.mark.timeout(240) -def test_megatron_loss_independent_of_microbatch_size(): +@pytest.mark.needs_hf_token +def test_megatron_loss_independent_of_microbatch_size(tiny_llama_model_path): """Test 
that changing microbatch size while keeping global batch size constant does not affect loss values.""" num_gpus = 2 global_batch_size = 8 @@ -697,7 +705,7 @@ def test_megatron_loss_independent_of_microbatch_size(): max_colocated_worker_groups=1, ) - config1 = create_megatron_test_config() + config1 = create_megatron_test_config(tiny_llama_model_path) config1["train_micro_batch_size"] = 1 tokenizer = get_tokenizer(config1["tokenizer"]) config1["generation"] = configure_generation_config( @@ -745,7 +753,7 @@ def test_megatron_loss_independent_of_microbatch_size(): max_colocated_worker_groups=1, ) - config2 = create_megatron_test_config() + config2 = create_megatron_test_config(tiny_llama_model_path) config2["train_micro_batch_size"] = 2 config2["generation"] = configure_generation_config( config2["generation"], tokenizer @@ -774,7 +782,8 @@ def test_megatron_loss_independent_of_microbatch_size(): @pytest.mark.timeout(300) -def test_megatron_reference_policy_functionality(): +@pytest.mark.needs_hf_token +def test_megatron_reference_policy_functionality(tiny_llama_model_path): """Test Megatron reference policy functionality.""" num_gpus = 2 @@ -786,7 +795,7 @@ def test_megatron_reference_policy_functionality(): max_colocated_worker_groups=1, ) - config = create_megatron_test_config() + config = create_megatron_test_config(tiny_llama_model_path) config["megatron_cfg"]["optimizer"]["lr"] = 1e-2 # Increase from 5e-6 to 1e-2 config["megatron_cfg"]["optimizer"]["min_lr"] = 1e-3 # Increase min_lr as well @@ -894,6 +903,7 @@ def test_megatron_reference_policy_functionality(): @pytest.mark.timeout(400) +@pytest.mark.needs_hf_token @pytest.mark.parametrize( "num_gpus,tp,pp", [ @@ -903,12 +913,14 @@ def test_megatron_reference_policy_functionality(): ], ids=["2gpu_dp2_save_restore", "2gpu_pp2_save_restore", "2gpu_tp2_save_restore"], ) -def test_megatron_checkpoint_save_kill_and_restore(num_gpus, tp, pp): +def test_megatron_checkpoint_save_kill_and_restore( + num_gpus, tp, pp, tiny_llama_model_path +): """Test full checkpoint save/restore cycle: save -> kill worker -> restart -> verify restore.""" from copy import deepcopy # Use tiny model for faster testing - model_name = TEST_ASSETS.TINY_LLAMA_MODEL_PATH + model_name = tiny_llama_model_path tokenizer = get_tokenizer({"name": model_name}) with tempfile.TemporaryDirectory(prefix="megatron_save_restore_") as temp_dir: @@ -1146,7 +1158,8 @@ def test_megatron_checkpoint_save_kill_and_restore(num_gpus, tp, pp): @pytest.mark.timeout(300) -def test_megatron_dpo_training(): +@pytest.mark.needs_hf_token +def test_megatron_dpo_training(tiny_llama_model_path): """Test DPO training with Megatron backend.""" num_gpus = 2 batch_size = 8 @@ -1184,7 +1197,7 @@ def test_megatron_dpo_training(): max_colocated_worker_groups=1, ) - config = create_megatron_test_config() + config = create_megatron_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) policy = Policy( @@ -1242,7 +1255,8 @@ def test_megatron_dpo_training(): @pytest.mark.timeout(300) -def test_megatron_sft_training(): +@pytest.mark.needs_hf_token +def test_megatron_sft_training(tiny_llama_model_path): """Test SFT training with Megatron backend.""" num_gpus = 2 batch_size = 8 @@ -1277,7 +1291,7 @@ def test_megatron_sft_training(): max_colocated_worker_groups=1, ) - config = create_megatron_test_config() + config = create_megatron_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) policy = Policy( From e2f06bc17de7e63bb6bc489910c267f3bcb60743 Mon Sep 17 
00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 01:45:37 +0000 Subject: [PATCH 03/22] fix Signed-off-by: Terry Kong --- tests/unit/algorithms/test_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py index 82338de026..8bd45c2349 100755 --- a/tests/unit/algorithms/test_utils.py +++ b/tests/unit/algorithms/test_utils.py @@ -77,6 +77,7 @@ def get_format_with_simple_role_header(messages): return message +@pytest.mark.needs_hf_token def test_get_tokenizer_no_chat_template(conversation_messages): """Test get_tokenizer when no chat template is specified in config""" config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} @@ -89,6 +90,7 @@ def test_get_tokenizer_no_chat_template(conversation_messages): assert formatted == expected +@pytest.mark.needs_hf_token def test_get_tokenizer_default_chat_template(conversation_messages): """Test get_tokenizer when chat_template is 'default' in config""" config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"} @@ -100,6 +102,7 @@ def test_get_tokenizer_default_chat_template(conversation_messages): assert formatted == expected +@pytest.mark.needs_hf_token def test_get_tokenizer_null_chat_template(conversation_messages): """Test get_tokenizer when chat_template is None in config""" config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None} @@ -113,6 +116,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages): assert formatted == expected +@pytest.mark.needs_hf_token def test_get_tokenizer_custom_jinja_template(conversation_messages): """Test get_tokenizer when a custom jinja template is specified""" custom_template = COMMON_CHAT_TEMPLATES.simple_role_header From fa548a890b29f783417a8b56cdae2198d877c2d1 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:19:46 +0000 Subject: [PATCH 04/22] wip Signed-off-by: Terry Kong --- tests/unit/algorithms/test_utils.py | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py index 8bd45c2349..d1940e8374 100755 --- a/tests/unit/algorithms/test_utils.py +++ b/tests/unit/algorithms/test_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os from datetime import datetime import pytest @@ -20,6 +21,11 @@ from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + @pytest.fixture def conversation_messages(): """Fixture providing a multi-turn conversation for testing chat templates""" @@ -80,6 +86,12 @@ def get_format_with_simple_role_header(messages): @pytest.mark.needs_hf_token def test_get_tokenizer_no_chat_template(conversation_messages): """Test get_tokenizer when no chat template is specified in config""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} tokenizer = get_tokenizer(config) @@ -93,6 +105,12 @@ def test_get_tokenizer_no_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_default_chat_template(conversation_messages): """Test get_tokenizer when chat_template is 'default' in config""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"} tokenizer = get_tokenizer(config) @@ -105,6 +123,12 @@ def test_get_tokenizer_default_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_null_chat_template(conversation_messages): """Test get_tokenizer when chat_template is None in config""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None} tokenizer = get_tokenizer(config) @@ -119,6 +143,12 @@ def test_get_tokenizer_null_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_custom_jinja_template(conversation_messages): """Test get_tokenizer when a custom jinja template is specified""" + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" + ) + custom_template = COMMON_CHAT_TEMPLATES.simple_role_header config = { "name": "meta-llama/Llama-3.2-1B-Instruct", From 09936ca5776060727b2c10a87791baf7af519d26 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:22:34 +0000 Subject: [PATCH 05/22] wip Signed-off-by: Terry Kong --- .../data/hf_datasets/test_oai_format_dataset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py index 4ba75a6a1d..9b53ea03a2 100644 --- a/tests/unit/data/hf_datasets/test_oai_format_dataset.py +++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py @@ -13,11 +13,18 @@ # limitations under the License. 
import json +import os import tempfile import pytest from transformers import AutoTokenizer + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.oai_format_dataset import ( OpenAIFormatDataset, @@ -87,8 +94,15 @@ def test_custom_keys(sample_data): assert dataset.system_prompt == "You are a helpful assistant." +@pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True) def test_message_formatting(sample_data): + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" + ) + train_path, val_path = sample_data dataset = OpenAIFormatDataset( train_path, val_path, chat_key="messages", system_key="system_key" From a63d0e766d6fcbb319f7e0af7281250f1fcc8a3e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:25:42 +0000 Subject: [PATCH 06/22] fix Signed-off-by: Terry Kong --- .../unit/data/hf_datasets/test_prompt_response.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py index 8ff7f5c5f6..1742dd3c86 100644 --- a/tests/unit/data/hf_datasets/test_prompt_response.py +++ b/tests/unit/data/hf_datasets/test_prompt_response.py @@ -13,11 +13,18 @@ # limitations under the License. import json +import os import tempfile import pytest from transformers import AutoTokenizer + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.prompt_response_dataset import ( PromptResponseDataset, @@ -76,8 +83,15 @@ def test_custom_keys(sample_data): assert dataset.output_key == "answer" +@pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True) def test_message_formatting(sample_data): + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" + ) + train_path, val_path = sample_data dataset = PromptResponseDataset( train_path, val_path, input_key="question", output_key="answer" From 8147d6c733c520ada6d094bd6c51a4f2f8285ae6 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:29:39 +0000 Subject: [PATCH 07/22] wip Signed-off-by: Terry Kong --- tests/unit/data/test_llm_message_utils.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 6f9c3f3129..7ce59a5986 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -13,10 +13,18 @@ # limitations under the License. 
+import os + import pytest import torch from transformers import AutoTokenizer + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, TaskDataSpec from nemo_rl.data.llm_message_utils import ( @@ -328,9 +336,16 @@ def test_batch_pad_message_log_custom_pad_value( ) +@pytest.mark.needs_hf_token def test_get_formatted_message_log_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" + ) + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -372,9 +387,16 @@ def test_get_formatted_message_log_llama( assert actual_text == expected_text +@pytest.mark.needs_hf_token def test_get_formatted_message_log_add_generation_prompt_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" + ) + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -499,7 +521,14 @@ def test_get_formatted_message_log_add_generation_prompt_qwen( assert actual_text == expected_text +@pytest.mark.needs_hf_token def test_formatted_message_log_empty_message(): + # Skip if no HF token available for gated model + if not has_hf_token() and not os.getenv("CI"): + pytest.skip( + "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" + ) + message_logs = [ [ {"role": "system", "content": "You are a helpful assistant."}, From 19dcb012ea6709e9657f81062ba8baf97a78d613 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:36:42 +0000 Subject: [PATCH 08/22] ok Signed-off-by: Terry Kong --- tests/unit/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f17be7611d..79c9cf0def 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -37,6 +37,7 @@ def has_hf_token(): TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") +os.makedirs(TEST_ASSETS_DIR, exist_ok=True) UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json") UNIT_RESULTS_FILE_DATED = os.path.join( dir_path, f"unit_results/{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" From 03ef590409cd339d51ed373ca611ce53d72904af Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 02:45:12 +0000 Subject: [PATCH 09/22] fix Signed-off-by: Terry Kong --- tests/unit/models/dtensor/test_parallelize.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/unit/models/dtensor/test_parallelize.py b/tests/unit/models/dtensor/test_parallelize.py index 5acb7addc4..0c3de464f6 100644 --- a/tests/unit/models/dtensor/test_parallelize.py +++ b/tests/unit/models/dtensor/test_parallelize.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os from itertools import product from unittest.mock import MagicMock @@ -19,6 +20,12 @@ from torch.distributed.tensor.parallel import ParallelStyle, parallelize_module from transformers import AutoModelForCausalLM + +def has_hf_token(): + """Check if HuggingFace token is available.""" + return os.getenv("HF_TOKEN") is not None + + from nemo_rl.models.dtensor.parallelize import ( _parallelize_gemma3, _parallelize_llama, @@ -26,6 +33,7 @@ ) +@pytest.mark.needs_hf_token @pytest.mark.parametrize( "model_name, parallelize_func, sequence_parallel", [ @@ -44,6 +52,17 @@ ) def test_parallelize_plan_keys(model_name, parallelize_func, sequence_parallel): """Tests that the keys in the parallelization plans are valid by mocking parallel styles.""" + # List of known gated models that require HF token + gated_models = [ + "google/gemma-3-1b-it", + "google/gemma-3-4b-it", + "meta-llama/Llama-3.2-1B-Instruct", + ] + + # Skip if testing a gated model and no HF token available + if model_name in gated_models and not has_hf_token() and not os.getenv("CI"): + pytest.skip(f"HuggingFace token not available for gated model: {model_name}") + model = AutoModelForCausalLM.from_pretrained(model_name) parallel_plan = parallelize_func(model, sequence_parallel=sequence_parallel) From ac0b41579f108403df9fd9668cf4c18d4fce3d58 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 03:06:17 +0000 Subject: [PATCH 10/22] try cleaning up Signed-off-by: Terry Kong --- tests/unit/algorithms/test_utils.py | 30 ---------------- tests/unit/conftest.py | 36 ++++++++----------- .../hf_datasets/test_oai_format_dataset.py | 13 ------- .../data/hf_datasets/test_prompt_response.py | 13 ------- tests/unit/data/test_llm_message_utils.py | 26 -------------- tests/unit/models/dtensor/test_parallelize.py | 18 ---------- 6 files changed, 15 insertions(+), 121 deletions(-) diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py index d1940e8374..8bd45c2349 100755 --- a/tests/unit/algorithms/test_utils.py +++ b/tests/unit/algorithms/test_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from datetime import datetime import pytest @@ -21,11 +20,6 @@ from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - @pytest.fixture def conversation_messages(): """Fixture providing a multi-turn conversation for testing chat templates""" @@ -86,12 +80,6 @@ def get_format_with_simple_role_header(messages): @pytest.mark.needs_hf_token def test_get_tokenizer_no_chat_template(conversation_messages): """Test get_tokenizer when no chat template is specified in config""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - config = {"name": "meta-llama/Llama-3.2-1B-Instruct"} tokenizer = get_tokenizer(config) @@ -105,12 +93,6 @@ def test_get_tokenizer_no_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_default_chat_template(conversation_messages): """Test get_tokenizer when chat_template is 'default' in config""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"} tokenizer = get_tokenizer(config) @@ -123,12 +105,6 @@ def test_get_tokenizer_default_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_null_chat_template(conversation_messages): """Test get_tokenizer when chat_template is None in config""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None} tokenizer = get_tokenizer(config) @@ -143,12 +119,6 @@ def test_get_tokenizer_null_chat_template(conversation_messages): @pytest.mark.needs_hf_token def test_get_tokenizer_custom_jinja_template(conversation_messages): """Test get_tokenizer when a custom jinja template is specified""" - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B-Instruct" - ) - custom_template = COMMON_CHAT_TEMPLATES.simple_role_header config = { "name": "meta-llama/Llama-3.2-1B-Instruct", diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 79c9cf0def..830e04fb97 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -31,9 +31,21 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None +def pytest_runtest_setup(item): + """Automatically skip tests that require HF tokens unless explicitly requested.""" + needs_hf_token = item.get_closest_marker("needs_hf_token") + if needs_hf_token: + # Check if the test should run based on pytest marker selection + # If -m needs_hf_token is specified, run the test + # Otherwise, skip it by default + config = item.session.config + + # Check if needs_hf_token marker is explicitly selected + marker_expr = config.getoption("-m", default="") + if "needs_hf_token" not in marker_expr: + pytest.skip( + "Skipping test that requires HF token (use 
-m needs_hf_token to run)" + ) TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets") @@ -389,12 +401,6 @@ def tiny_llama_model_path(): from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" - ) - model_path = os.path.join(TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer) @@ -423,12 +429,6 @@ def tiny_llama_tied_model_path(): from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Llama-3.2-1B" - ) - model_path = os.path.join( TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer" ) @@ -515,12 +515,6 @@ def tiny_gemma3_model_path(): from transformers import AutoTokenizer, Gemma3ForCausalLM, Gemma3TextConfig - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: google/gemma-3-1b-it" - ) - model_path = os.path.join(TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer") # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) # vocab_size=262144 so we can re-use gemma-3-1b tokenizer diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py index 9b53ea03a2..6438c52fe2 100644 --- a/tests/unit/data/hf_datasets/test_oai_format_dataset.py +++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py @@ -13,18 +13,11 @@ # limitations under the License. import json -import os import tempfile import pytest from transformers import AutoTokenizer - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.oai_format_dataset import ( OpenAIFormatDataset, @@ -97,12 +90,6 @@ def test_custom_keys(sample_data): @pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True) def test_message_formatting(sample_data): - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" - ) - train_path, val_path = sample_data dataset = OpenAIFormatDataset( train_path, val_path, chat_key="messages", system_key="system_key" diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py index 1742dd3c86..d6b908a1a5 100644 --- a/tests/unit/data/hf_datasets/test_prompt_response.py +++ b/tests/unit/data/hf_datasets/test_prompt_response.py @@ -13,18 +13,11 @@ # limitations under the License. 
import json -import os import tempfile import pytest from transformers import AutoTokenizer - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.prompt_response_dataset import ( PromptResponseDataset, @@ -86,12 +79,6 @@ def test_custom_keys(sample_data): @pytest.mark.needs_hf_token @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True) def test_message_formatting(sample_data): - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: Meta-Llama/Meta-Llama-3-8B-Instruct" - ) - train_path, val_path = sample_data dataset = PromptResponseDataset( train_path, val_path, input_key="question", output_key="answer" diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 7ce59a5986..8b329e8072 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -13,18 +13,10 @@ # limitations under the License. -import os - import pytest import torch from transformers import AutoTokenizer - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, TaskDataSpec from nemo_rl.data.llm_message_utils import ( @@ -340,12 +332,6 @@ def test_batch_pad_message_log_custom_pad_value( def test_get_formatted_message_log_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" - ) - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -391,12 +377,6 @@ def test_get_formatted_message_log_llama( def test_get_formatted_message_log_add_generation_prompt_llama( raw_chat_message_log: LLMMessageLogType, ) -> None: - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" - ) - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") ## get expected result @@ -523,12 +503,6 @@ def test_get_formatted_message_log_add_generation_prompt_qwen( @pytest.mark.needs_hf_token def test_formatted_message_log_empty_message(): - # Skip if no HF token available for gated model - if not has_hf_token() and not os.getenv("CI"): - pytest.skip( - "HuggingFace token not available for gated model: meta-llama/Meta-Llama-3-8B-Instruct" - ) - message_logs = [ [ {"role": "system", "content": "You are a helpful assistant."}, diff --git a/tests/unit/models/dtensor/test_parallelize.py b/tests/unit/models/dtensor/test_parallelize.py index 0c3de464f6..013c5256ab 100644 --- a/tests/unit/models/dtensor/test_parallelize.py +++ b/tests/unit/models/dtensor/test_parallelize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from itertools import product from unittest.mock import MagicMock @@ -20,12 +19,6 @@ from torch.distributed.tensor.parallel import ParallelStyle, parallelize_module from transformers import AutoModelForCausalLM - -def has_hf_token(): - """Check if HuggingFace token is available.""" - return os.getenv("HF_TOKEN") is not None - - from nemo_rl.models.dtensor.parallelize import ( _parallelize_gemma3, _parallelize_llama, @@ -52,17 +45,6 @@ def has_hf_token(): ) def test_parallelize_plan_keys(model_name, parallelize_func, sequence_parallel): """Tests that the keys in the parallelization plans are valid by mocking parallel styles.""" - # List of known gated models that require HF token - gated_models = [ - "google/gemma-3-1b-it", - "google/gemma-3-4b-it", - "meta-llama/Llama-3.2-1B-Instruct", - ] - - # Skip if testing a gated model and no HF token available - if model_name in gated_models and not has_hf_token() and not os.getenv("CI"): - pytest.skip(f"HuggingFace token not available for gated model: {model_name}") - model = AutoModelForCausalLM.from_pretrained(model_name) parallel_plan = parallelize_func(model, sequence_parallel=sequence_parallel) From 8f5da92537111e3d37b2f250dba5fd42151a8f7e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Jul 2025 04:58:09 +0000 Subject: [PATCH 11/22] try qwen3 Signed-off-by: Terry Kong --- tests/unit/models/generation/test_vllm_generation.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 8a38e5c61e..94280a03c5 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -1620,13 +1620,15 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data): # Enable packing during test os.environ["NEMO_RL_MEGATRON_IPC_TENSOR_PACKING_THRESHOLD"] = "1" - # Both policies must use the same model (Qwen2.5-0.5B) for weight transfer compatibility - model_name = "Qwen/Qwen2.5-0.5B" + # Both policies must use the same model for weight transfer compatibility + # NOTE: We have tried using Qwen/Qwen2.5-0.5B, but some small models exhibit variance depending + # on which hardware it is run on. + model_name = "Qwen/Qwen3-0.6B" tokenizer = get_tokenizer({"name": model_name}) # Create Policy megatron_config = get_basic_megatron_test_config( - tp=1, pp=1, precision="float32" + tp=1, pp=1, precision="bfloat16" ) megatron_config["model_name"] = model_name megatron_config["tokenizer"]["name"] = model_name @@ -1653,8 +1655,8 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data): output_ids = outputs["output_ids"] generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) assert generated_texts == [ - "Hello, my name is John. I am a", - "The capital of France is Paris. It is the", + "Hello, my name is Lina. I'm", + "The capital of France is Paris. 
The capital of",
], "Output should be the same as the expected output"
finally:

From cf0c0477557f71266ae55736e42552a1eb43042a2 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 05:34:02 +0000
Subject: [PATCH 12/22] fix

Signed-off-by: Terry Kong
---
 tests/unit/models/huggingface/test_common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/models/huggingface/test_common.py b/tests/unit/models/huggingface/test_common.py
index faf06fbdb7..d183958bb8 100644
--- a/tests/unit/models/huggingface/test_common.py
+++ b/tests/unit/models/huggingface/test_common.py
@@ -17,6 +17,7 @@
 from nemo_rl.models.huggingface.common import ModelFlag, is_gemma_model
 
 
+@pytest.mark.needs_hf_token
 @pytest.mark.parametrize(
     "model_name",
     [
@@ -42,6 +43,7 @@ def test_gemma_models(model_name):
     assert ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(model_name)
 
 
+@pytest.mark.needs_hf_token
 @pytest.mark.parametrize(
     "model_name",
     [

From 029c0a8e2e78d8c9572db541ede60f833f77a7e0 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 05:49:13 +0000
Subject: [PATCH 13/22] one more

Signed-off-by: Terry Kong
---
 tests/unit/data/hf_datasets/test_squad.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/data/hf_datasets/test_squad.py b/tests/unit/data/hf_datasets/test_squad.py
index 5e736ee8ac..8c7705f558 100644
--- a/tests/unit/data/hf_datasets/test_squad.py
+++ b/tests/unit/data/hf_datasets/test_squad.py
@@ -17,6 +17,7 @@
 from nemo_rl.data.hf_datasets.squad import SquadDataset
 
 
+@pytest.mark.needs_hf_token
 @pytest.mark.skip(reason="dataset download is flaky")
 def test_squad_dataset():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

From 370766c760353d4a0f2822d9f1aee31791e8ac0c Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 06:11:59 +0000
Subject: [PATCH 14/22] try fixing the mcore marker

Signed-off-by: Terry Kong
---
 tests/unit/conftest.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 830e04fb97..2b1494952b 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -32,20 +32,23 @@
 def pytest_runtest_setup(item):
-    """Automatically skip tests that require HF tokens unless explicitly requested."""
+    """Automatically skip tests that require special dependencies unless explicitly requested."""
+    config = item.session.config
+    marker_expr = config.getoption("-m", default="")
+
+    # Check for needs_hf_token marker
     needs_hf_token = item.get_closest_marker("needs_hf_token")
-    if needs_hf_token:
-        # Check if the test should run based on pytest marker selection
-        # If -m needs_hf_token is specified, run the test
-        # Otherwise, skip it by default
-        config = item.session.config
-
-        # Check if needs_hf_token marker is explicitly selected
-        marker_expr = config.getoption("-m", default="")
-        if "needs_hf_token" not in marker_expr:
-            pytest.skip(
-                "Skipping test that requires HF token (use -m needs_hf_token to run)"
-            )
+    if needs_hf_token and "needs_hf_token" not in marker_expr:
+        pytest.skip(
+            "Skipping test that requires HF token (use -m needs_hf_token to run)"
+        )
+
+    # Check for mcore marker
+    mcore = item.get_closest_marker("mcore")
+    if mcore and "mcore" not in marker_expr:
+        pytest.skip(
+            "Skipping test that requires mcore dependencies (use -m mcore to run)"
+        )
 
 
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")

From 81bb823406ab81d8e9d4a4cf69b87e4b0c7bd71b Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 06:17:48 +0000
Subject: [PATCH 15/22] change up the ci markers

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2c299a8998..c8f803b2cf 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -221,7 +221,7 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore and needs_hf_token\"
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"needs_hf_token\"
           uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and needs_hf_token\"
         else
           echo Skipping unit tests for docs-only level

From 707c44fb991af8ff814035843ef4f545233347b3 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 06:47:00 +0000
Subject: [PATCH 16/22] hf_gate

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml                     |  4 ++--
 pyproject.toml                                      |  2 +-
 tests/unit/algorithms/test_utils.py                 |  8 ++++----
 tests/unit/conftest.py                              | 10 ++++------
 .../data/hf_datasets/test_oai_format_dataset.py     |  2 +-
 .../unit/data/hf_datasets/test_prompt_response.py   |  2 +-
 tests/unit/data/hf_datasets/test_squad.py           |  2 +-
 tests/unit/data/test_llm_message_utils.py           |  6 +++---
 tests/unit/models/dtensor/test_parallelize.py       |  2 +-
 .../models/generation/test_vllm_large_model.py      |  2 +-
 tests/unit/models/huggingface/test_common.py        |  4 ++--
 tests/unit/models/policy/test_dtensor_worker.py     |  9 +++++----
 tests/unit/models/policy/test_megatron_worker.py    | 14 +++++++-------
 13 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index c8f803b2cf..67f1bc8500 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -221,8 +221,8 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"needs_hf_token\"
-          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and needs_hf_token\"
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"hf_gated\"
+          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and hf_gated\"
         else
           echo Skipping unit tests for docs-only level
diff --git a/pyproject.toml b/pyproject.toml
index c985cd1351..643f1cdb1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -174,7 +174,7 @@ testpaths = ["tests"]
 python_files = "test_*.py"
 markers = [
     "mcore: marks tests that require the mcore extra",
-    "needs_hf_token: marks tests that require HuggingFace token access for gated models",
+    "hf_gated: marks tests that require HuggingFace token access for gated models",
 ]
 
 [tool.coverage.run]
diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py
index 8bd45c2349..2dd00a5eeb 100755
--- a/tests/unit/algorithms/test_utils.py
+++ b/tests/unit/algorithms/test_utils.py
@@ -77,7 +77,7 @@ def get_format_with_simple_role_header(messages):
     return message
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_no_chat_template(conversation_messages):
     """Test get_tokenizer when no chat template is specified in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct"}
@@ -90,7 +90,7 @@ def test_get_tokenizer_no_chat_template(conversation_messages):
     assert formatted == expected
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_default_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is 'default' in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"}
@@ -102,7 +102,7 @@ def test_get_tokenizer_default_chat_template(conversation_messages):
     assert formatted == expected
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_null_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is None in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None}
@@ -116,7 +116,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages):
     assert formatted == expected
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_tokenizer_custom_jinja_template(conversation_messages):
     """Test get_tokenizer when a custom jinja template is specified"""
     custom_template = COMMON_CHAT_TEMPLATES.simple_role_header
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 2b1494952b..e3a809709a 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -36,12 +36,10 @@ def pytest_runtest_setup(item):
     config = item.session.config
     marker_expr = config.getoption("-m", default="")
 
-    # Check for needs_hf_token marker
-    needs_hf_token = item.get_closest_marker("needs_hf_token")
-    if needs_hf_token and "needs_hf_token" not in marker_expr:
-        pytest.skip(
-            "Skipping test that requires HF token (use -m needs_hf_token to run)"
-        )
+    # Check for hf_gated marker
+    hf_gated = item.get_closest_marker("hf_gated")
+    if hf_gated and "hf_gated" not in marker_expr:
+        pytest.skip("Skipping test that requires HF token (use -m hf_gated to run)")
 
     # Check for mcore marker
     mcore = item.get_closest_marker("mcore")
diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py
index 6438c52fe2..ae6b878779 100644
--- a/tests/unit/data/hf_datasets/test_oai_format_dataset.py
+++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py
@@ -87,7 +87,7 @@ def test_custom_keys(sample_data):
     assert dataset.system_prompt == "You are a helpful assistant."
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py
index d6b908a1a5..cbf18977a4 100644
--- a/tests/unit/data/hf_datasets/test_prompt_response.py
+++ b/tests/unit/data/hf_datasets/test_prompt_response.py
@@ -76,7 +76,7 @@ def test_custom_keys(sample_data):
     assert dataset.output_key == "answer"
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
diff --git a/tests/unit/data/hf_datasets/test_squad.py b/tests/unit/data/hf_datasets/test_squad.py
index 8c7705f558..f5e01b250a 100644
--- a/tests/unit/data/hf_datasets/test_squad.py
+++ b/tests/unit/data/hf_datasets/test_squad.py
@@ -17,7 +17,7 @@
 from nemo_rl.data.hf_datasets.squad import SquadDataset
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.skip(reason="dataset download is flaky")
 def test_squad_dataset():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py
index 8b329e8072..91ae2e41b7 100644
--- a/tests/unit/data/test_llm_message_utils.py
+++ b/tests/unit/data/test_llm_message_utils.py
@@ -328,7 +328,7 @@ def test_batch_pad_message_log_custom_pad_value(
     )
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -373,7 +373,7 @@ def test_get_formatted_message_log_llama(
     assert actual_text == expected_text
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_add_generation_prompt_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -501,7 +501,7 @@ def test_get_formatted_message_log_add_generation_prompt_qwen(
     assert actual_text == expected_text
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_formatted_message_log_empty_message():
     message_logs = [
         [
diff --git a/tests/unit/models/dtensor/test_parallelize.py b/tests/unit/models/dtensor/test_parallelize.py
index 013c5256ab..192fa354ac 100644
--- a/tests/unit/models/dtensor/test_parallelize.py
+++ b/tests/unit/models/dtensor/test_parallelize.py
@@ -26,7 +26,7 @@
 )
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name, parallelize_func, sequence_parallel",
     [
diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py
index 03c6822ecb..7b93ef46d1 100644
--- a/tests/unit/models/generation/test_vllm_large_model.py
+++ b/tests/unit/models/generation/test_vllm_large_model.py
@@ -125,7 +125,7 @@ def test_input_data(tokenizer):
 
 # skip this test for now
 @pytest.mark.skip(reason="Skipping large model test until we have resources in CI.")
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tensor_parallel_size", [4, 8])
 @pytest.mark.parametrize("pipeline_parallel_size", [2])
diff --git a/tests/unit/models/huggingface/test_common.py b/tests/unit/models/huggingface/test_common.py
index d183958bb8..95da64b0b4 100644
--- a/tests/unit/models/huggingface/test_common.py
+++ b/tests/unit/models/huggingface/test_common.py
@@ -17,7 +17,7 @@
 from nemo_rl.models.huggingface.common import ModelFlag, is_gemma_model
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [
@@ -43,7 +43,7 @@ def test_gemma_models(model_name):
     assert ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(model_name)
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [
diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py
index a8e5bc59e4..33a91c37eb 100644
--- a/tests/unit/models/policy/test_dtensor_worker.py
+++ b/tests/unit/models/policy/test_dtensor_worker.py
@@ -159,6 +159,7 @@ def policy_setup(two_gpu_virtual_cluster, tiny_llama_model_path):
     policy.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(180)
 def test_lm_policy_init(policy_setup):
     policy = policy_setup
@@ -306,7 +307,7 @@ def training_setup(request, two_gpu_virtual_cluster):
     policy.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(60)
 @pytest.mark.parametrize(
     "training_setup",
@@ -465,7 +466,7 @@ def logprob_setup(request, two_gpu_virtual_cluster):
     policy.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(360)
 @pytest.mark.parametrize(
     "logprob_setup",
@@ -508,7 +509,7 @@ def test_dtensor_worker_logprob_tp2_or_cp2_matches_unsharded(logprob_setup):
     )
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(
     two_gpu_virtual_cluster, tiny_llama_tied_model_path
 ):
@@ -554,7 +555,7 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(
     policy.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(180)
 def test_dtensor_loss_independent_of_microbatch_size_two_gpus(
     two_gpu_virtual_cluster, tiny_llama_model_path
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 76753e60d0..30c4e52d13 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -319,7 +319,7 @@ def training_setup(request):
     cluster.shutdown()
 
 
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 @pytest.mark.parametrize(
     "training_setup",
@@ -622,7 +622,7 @@ def logprob_setup(request):
 
 
 @pytest.mark.timeout(180)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "logprob_setup",
     [
@@ -663,7 +663,7 @@ def test_megatron_policy_logprobs(logprob_setup):
 
 
 @pytest.mark.timeout(240)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_loss_independent_of_microbatch_size(tiny_llama_model_path):
     """Test that changing microbatch size while keeping global batch size constant does not affect loss values."""
     num_gpus = 2
@@ -782,7 +782,7 @@ def test_megatron_loss_independent_of_microbatch_size(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_reference_policy_functionality(tiny_llama_model_path):
     """Test Megatron reference policy functionality."""
     num_gpus = 2
@@ -903,7 +903,7 @@ def test_megatron_reference_policy_functionality(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(400)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "num_gpus,tp,pp",
     [
@@ -1158,7 +1158,7 @@ def test_megatron_checkpoint_save_kill_and_restore(
 
 
 @pytest.mark.timeout(300)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_dpo_training(tiny_llama_model_path):
     """Test DPO training with Megatron backend."""
     num_gpus = 2
@@ -1255,7 +1255,7 @@ def test_megatron_dpo_training(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-@pytest.mark.needs_hf_token
+@pytest.mark.hf_gated
 def test_megatron_sft_training(tiny_llama_model_path):
     """Test SFT training with Megatron backend."""
     num_gpus = 2

From 4bffe0ab70bf6f13554e10d5a6c2dd750d590578 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:01:56 +0000
Subject: [PATCH 17/22] notes

Signed-off-by: Terry Kong
---
 docs/testing.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/testing.md b/docs/testing.md
index 4d44b141fb..3b2e5208f7 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -9,8 +9,14 @@ Unit tests require 2 GPUs to test the full suite.
 :::
 
 ```sh
-# Run the unit tests using local GPUs
+# Run the unit tests using local GPUs (by default tests requiring gated HF repos are skipped)
 uv run --group test bash tests/run_unit.sh
+
+# Run the unit tests and include tests requiring HF gated repos
+uv run --group test bash tests/run_unit.sh -m "hf_gated"
+
+# Run the mcore and HF dated repo tests
+uv run --extra mcore --group test bash tests/run_unit.sh -m "hf_gated and mcore"
 ```
 
 :::{note}

From 9ecf0aadbac73a245fc4bcbce7e3e8a9cc3e6a97 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:02:25 +0000
Subject: [PATCH 18/22] notes

Signed-off-by: Terry Kong
---
 docs/testing.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/docs/testing.md b/docs/testing.md
index 3b2e5208f7..f2d90928ff 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -19,12 +19,6 @@ uv run --group test bash tests/run_unit.sh -m "hf_gated"
 uv run --extra mcore --group test bash tests/run_unit.sh -m "hf_gated and mcore"
 ```
 
-:::{note}
-Tests can also be run on Slurm with `ray.sub`, but note that some tests will be skipped
-due to no GPUs being located on the head node. To run the full suite of tests, please
-launch on a regular GPU allocation.
-:::
-
 ### Run Unit Tests in a Hermetic Environment
 
 For environments lacking necessary dependencies (e.g., `gcc`, `nvcc`)

From 27078a6ce33fe9d49bd7e4aa1390668db4c049ff Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:30:18 +0000
Subject: [PATCH 19/22] fix

Signed-off-by: Terry Kong
---
 .../models/policy/test_megatron_worker.py | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 30c4e52d13..75b735ac0d 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -151,7 +151,7 @@ def gc_collect():
 
 
 @pytest.fixture
-def policy_setup(request):
+def policy_setup(request, tiny_llama_model_path):
     """Setup and teardown for policy tests - creates a virtual cluster and policy."""
     # Get parameters from request
     if hasattr(request, "param") and request.param is not None:
@@ -176,7 +176,7 @@ def policy_setup(request, tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config = create_megatron_test_config(tp=tp, pp=pp)
+    config = create_megatron_test_config(tiny_llama_model_path, tp=tp, pp=pp)
     tokenizer = get_tokenizer(config["tokenizer"])
     config["generation"] = configure_generation_config(
         config["generation"], tokenizer
@@ -389,7 +389,7 @@ def verify_loss_tensor(loss_tensor):
 
 
 @pytest.fixture
-def generation_setup(request):
+def generation_setup(request, tiny_llama_model_path):
     """Setup and teardown specifically for generation tests."""
     # Parse parameters: (num_gpus, tp, pp, generation_backend)
     if hasattr(request, "param") and request.param is not None:
@@ -418,6 +418,7 @@ def generation_setup(request, tiny_llama_model_path):
     )
 
     config = create_megatron_test_config(
+        tiny_llama_model_path,
         tp=tp,
         pp=pp,
         generation_backend=generation_backend,
@@ -1338,7 +1338,7 @@ def test_megatron_sft_training(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-def test_megatron_context_parallel_logprob_agreement():
+def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce identical logprobs with sequence packing enabled."""
     num_gpus = 2
     batch_size = 4
@@ -1376,7 +1377,9 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_no_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_no_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
 
     # Ensure context parallel is disabled
     config_no_cp["megatron_cfg"]["context_parallel_size"] = 1
@@ -1456,7 +1459,9 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
 
     # Enable context parallel
     config_cp["megatron_cfg"]["context_parallel_size"] = 2
@@ -1536,7 +1541,7 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
 
 
 @pytest.mark.timeout(300)
-def test_megatron_context_parallel_training_agreement():
+def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce consistent training results with ClippedPG loss and sequence packing."""
     num_gpus = 2
     batch_size = 2
@@ -1594,7 +1599,9 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_no_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_no_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Ensure context parallel is disabled
     config_no_cp["megatron_cfg"]["context_parallel_size"] = 1
     config_no_cp["train_global_batch_size"] = 2
@@ -1656,7 +1663,9 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
         max_colocated_worker_groups=1,
     )
 
-    config_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Enable context parallel
     config_cp["megatron_cfg"]["context_parallel_size"] = 2
     config_cp["train_global_batch_size"] = 2

From cbe6f95c491b21797722341faac8f2d7e85e8617 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 07:32:45 +0000
Subject: [PATCH 20/22] ifix

Signed-off-by: Terry Kong
---
 tests/unit/models/policy/test_megatron_worker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 75b735ac0d..38607ba59f 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -1338,6 +1338,7 @@ def test_megatron_sft_training(tiny_llama_model_path):
     cluster.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce identical logprobs with sequence packing enabled."""
@@ -1540,6 +1541,7 @@ def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
     )
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce consistent training results with ClippedPG loss and sequence packing."""

From 08b1625942f2ee0708fb4b0d9fdeabcbcba17aff Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 23:50:17 +0000
Subject: [PATCH 21/22] fix

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml |  4 +--
 docs/testing.md                 | 15 +++++---
 tests/unit/conftest.py          | 64 ++++++++++++++++++++++++++-------
 3 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 67f1bc8500..42dc8d015b 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -221,8 +221,8 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"hf_gated\"
-          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m \"mcore and hf_gated\"
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl --hf-gated
+          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only
         else
           echo Skipping unit tests for docs-only level
diff --git a/docs/testing.md b/docs/testing.md
index f2d90928ff..8ce97346b9 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -9,14 +9,19 @@ Unit tests require 2 GPUs to test the full suite.
 :::
 
 ```sh
-# Run the unit tests using local GPUs (by default tests requiring gated HF repos are skipped)
+# Run the unit tests using local GPUs
+
+# Configuration 1: Default tests only - excludes both hf_gated and mcore tests
 uv run --group test bash tests/run_unit.sh
 
-# Run the unit tests and include tests requiring HF gated repos
-uv run --group test bash tests/run_unit.sh -m "hf_gated"
+# Configuration 2: Default + HF gated tests, excluding mcore tests
+uv run --group test bash tests/run_unit.sh --hf-gated
+
+# Configuration 3: ONLY mcore tests, excluding ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only
 
-# Run the mcore and HF dated repo tests
-uv run --extra mcore --group test bash tests/run_unit.sh -m "hf_gated and mcore"
+# Configuration 4: ONLY mcore tests, including ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only --hf-gated
 ```
 
 ### Run Unit Tests in a Hermetic Environment
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index e3a809709a..9e14889b45 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -31,22 +31,60 @@
 dir_path = os.path.dirname(os.path.abspath(__file__))
 
 
-def pytest_runtest_setup(item):
-    """Automatically skip tests that require special dependencies unless explicitly requested."""
-    config = item.session.config
+def pytest_addoption(parser):
+    """Add custom command line options for controlling test execution."""
+    parser.addoption(
+        "--hf-gated",
+        action="store_true",
+        default=False,
+        help="Include tests that require HuggingFace token access",
+    )
+    parser.addoption(
+        "--mcore-only",
+        action="store_true",
+        default=False,
+        help="Run ONLY mcore tests (combine with --hf-gated to include mcore+hf_gated tests)",
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    """Modify test collection to skip tests based on markers unless explicitly requested."""
+    run_hf_gated = config.getoption("--hf-gated")
+    run_mcore_only = config.getoption("--mcore-only")
     marker_expr = config.getoption("-m", default="")
 
-    # Check for hf_gated marker
-    hf_gated = item.get_closest_marker("hf_gated")
-    if hf_gated and "hf_gated" not in marker_expr:
-        pytest.skip("Skipping test that requires HF token (use -m hf_gated to run)")
+    # If user specified -m marker expressions, let pytest handle everything normally
+    if marker_expr:
+        return
 
-    # Check for mcore marker
-    mcore = item.get_closest_marker("mcore")
-    if mcore and "mcore" not in marker_expr:
-        pytest.skip(
-            "Skipping test that requires mcore dependencies (use -m mcore to run)"
-        )
+    # Filter tests based on the desired configurations
+    new_items = []
+
+    if run_mcore_only and run_hf_gated:
+        # Configuration 4: Only mcore tests, including ones with hf_gated
+        new_items = [item for item in items if item.get_closest_marker("mcore")]
+    elif run_mcore_only:
+        # Configuration 3: Only mcore tests, excluding ones with hf_gated
+        new_items = [
+            item
+            for item in items
+            if item.get_closest_marker("mcore")
+            and not item.get_closest_marker("hf_gated")
+        ]
+    elif run_hf_gated:
+        # Configuration 2: Default tests + hf_gated tests, excluding mcore
+        new_items = [item for item in items if not item.get_closest_marker("mcore")]
+    else:
+        # Configuration 1: Default only - exclude both hf_gated and mcore
+        new_items = [
+            item
+            for item in items
+            if not item.get_closest_marker("hf_gated")
+            and not item.get_closest_marker("mcore")
+        ]
+
+    # Update the items list in-place
+    items[:] = new_items
 
 
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")

From e3b3e2828fad00c8445545ed57e8b08a8dd08a67 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Jul 2025 23:53:23 +0000
Subject: [PATCH 22/22] move mkdir out of module scope

Signed-off-by: Terry Kong
---
 tests/unit/conftest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 9e14889b45..1346a1173d 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -88,7 +88,6 @@ def pytest_collection_modifyitems(config, items):
 
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")
-os.makedirs(TEST_ASSETS_DIR, exist_ok=True)
 UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
 UNIT_RESULTS_FILE_DATED = os.path.join(
     dir_path, f"unit_results/{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
 )
@@ -237,6 +236,10 @@ def log_max_mem(self, metric_name: str):
         session_data["metrics"][qualified_name]["_elapsed"] = end_time - start_time
 
 
+def pytest_sessionstart(session):
+    os.makedirs(TEST_ASSETS_DIR, exist_ok=True)
+
+
 def pytest_sessionfinish(session, exitstatus):
     if not hasattr(session.config, "_unit_test_data"):
         return