diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 3b10108e6a..42dc8d015b 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -221,8 +221,8 @@ jobs:
           UNIT_TEST_SCRIPT: |
             cd /opt/nemo-rl
             if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-              uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore\"
-              uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m mcore
+              uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl --hf-gated
+              uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only
             else
               echo Skipping unit tests for docs-only level
             fi
diff --git a/docs/testing.md b/docs/testing.md
index 4d44b141fb..8ce97346b9 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -10,14 +10,19 @@ Unit tests require 2 GPUs to test the full suite.
 
 ```sh
 # Run the unit tests using local GPUs
+
+# Configuration 1: Default tests only - excludes both hf_gated and mcore tests
 uv run --group test bash tests/run_unit.sh
-```
-
-:::{note}
-Tests can also be run on Slurm with `ray.sub`, but note that some tests will be skipped
-due to no GPUs being located on the head node. To run the full suite of tests, please
-launch on a regular GPU allocation.
-:::
+
+# Configuration 2: Default + HF gated tests, excluding mcore tests
+uv run --group test bash tests/run_unit.sh --hf-gated
+
+# Configuration 3: ONLY mcore tests, excluding ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only
+
+# Configuration 4: ONLY mcore tests, including ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only --hf-gated
+```
 
 ### Run Unit Tests in a Hermetic Environment
diff --git a/pyproject.toml b/pyproject.toml
index cddda79abe..643f1cdb1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -174,6 +174,7 @@ testpaths = ["tests"]
 python_files = "test_*.py"
 markers = [
     "mcore: marks tests that require the mcore extra",
+    "hf_gated: marks tests that require HuggingFace token access for gated models",
 ]
 
 [tool.coverage.run]
diff --git a/tests/unit/algorithms/test_utils.py b/tests/unit/algorithms/test_utils.py
index 82338de026..2dd00a5eeb 100755
--- a/tests/unit/algorithms/test_utils.py
+++ b/tests/unit/algorithms/test_utils.py
@@ -77,6 +77,7 @@ def get_format_with_simple_role_header(messages):
     return message
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_no_chat_template(conversation_messages):
     """Test get_tokenizer when no chat template is specified in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct"}
@@ -89,6 +90,7 @@ def test_get_tokenizer_no_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_default_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is 'default' in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"}
@@ -100,6 +102,7 @@ def test_get_tokenizer_default_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_null_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is None in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None}
@@ -113,6 +116,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_custom_jinja_template(conversation_messages):
     """Test get_tokenizer when a custom jinja template is specified"""
     custom_template = COMMON_CHAT_TEMPLATES.simple_role_header
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 3197fa2d57..1346a1173d 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -30,6 +30,63 @@
 dir_path = os.path.dirname(os.path.abspath(__file__))
 
+
+def pytest_addoption(parser):
+    """Add custom command line options for controlling test execution."""
+    parser.addoption(
+        "--hf-gated",
+        action="store_true",
+        default=False,
+        help="Include tests that require HuggingFace token access",
+    )
+    parser.addoption(
+        "--mcore-only",
+        action="store_true",
+        default=False,
+        help="Run ONLY mcore tests (combine with --hf-gated to include mcore+hf_gated tests)",
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    """Modify test collection to skip tests based on markers unless explicitly requested."""
+    run_hf_gated = config.getoption("--hf-gated")
+    run_mcore_only = config.getoption("--mcore-only")
+    marker_expr = config.getoption("-m", default="")
+
+    # If user specified -m marker expressions, let pytest handle everything normally
+    if marker_expr:
+        return
+
+    # Filter tests based on the desired configurations
+    new_items = []
+
+    if run_mcore_only and run_hf_gated:
+        # Configuration 4: Only mcore tests, including ones with hf_gated
+        new_items = [item for item in items if item.get_closest_marker("mcore")]
+    elif run_mcore_only:
+        # Configuration 3: Only mcore tests, excluding ones with hf_gated
+        new_items = [
+            item
+            for item in items
+            if item.get_closest_marker("mcore")
+            and not item.get_closest_marker("hf_gated")
+        ]
+    elif run_hf_gated:
+        # Configuration 2: Default tests + hf_gated tests, excluding mcore
+        new_items = [item for item in items if not item.get_closest_marker("mcore")]
+    else:
+        # Configuration 1: Default only - exclude both hf_gated and mcore
+        new_items = [
+            item
+            for item in items
+            if not item.get_closest_marker("hf_gated")
+            and not item.get_closest_marker("mcore")
+        ]
+
+    # Update the items list in-place
+    items[:] = new_items
+
+
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")
 UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
 UNIT_RESULTS_FILE_DATED = os.path.join(
@@ -37,25 +94,6 @@
 )
 
 
-# Mapping between asset and absolute path (each are populated from a session level fixture)
-class TEST_ASSETS:
-    TINY_LLAMA_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer"
-    )
-    TINY_LLAMA_TIED_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer"
-    )
-    TINY_QWEN2_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer"
-    )
-    TINY_QWEN3_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer"
-    )
-    TINY_GEMMA3_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer"
-    )
-
-
 class UnitTestData(TypedDict):
     exit_status: int | str
     git_commit: str
@@ -198,6 +236,10 @@ def log_max_mem(self, metric_name: str):
         session_data["metrics"][qualified_name]["_elapsed"] = end_time - start_time
 
 
+def pytest_sessionstart(session):
+    os.makedirs(TEST_ASSETS_DIR, exist_ok=True)
+
+
 def pytest_sessionfinish(session, exitstatus):
     if not hasattr(session.config, "_unit_test_data"):
         return
@@ -394,14 +436,14 @@ def mock_2gpu_distributed_env():
 #######################
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_llama_model_path():
     """Fixture that returns a path to a tiny llama model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
 
-    model_path = TEST_ASSETS.TINY_LLAMA_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer)
     config = LlamaConfig(
@@ -422,14 +464,16 @@ def tiny_llama_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_llama_tied_model_path():
     """Fixture that returns a path to a tiny llama model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
 
-    model_path = TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH
+    model_path = os.path.join(
+        TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer"
+    )
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer)
     config = LlamaConfig(
@@ -450,14 +494,14 @@ def tiny_llama_tied_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_qwen2_model_path():
     """Fixture that returns a path to a tiny llama model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Qwen2Config, Qwen2ForCausalLM
 
-    model_path = TEST_ASSETS.TINY_QWEN2_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer)
     config = Qwen2Config(
@@ -478,14 +522,14 @@ def tiny_qwen2_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_qwen3_model_path():
     """Fixture that returns a path to a tiny llama model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM
 
-    model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer)
     config = Qwen3Config(
@@ -506,14 +550,14 @@ def tiny_qwen3_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_gemma3_model_path():
     """Fixture that returns a path to a tiny llama model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Gemma3ForCausalLM, Gemma3TextConfig
 
-    model_path = TEST_ASSETS.TINY_GEMMA3_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer")
    # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=262144 so we can re-use gemma-3-1b tokenizer
     config = Gemma3TextConfig(
diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py
index 4ba75a6a1d..ae6b878779 100644
--- a/tests/unit/data/hf_datasets/test_oai_format_dataset.py
+++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py
@@ -87,6 +87,7 @@ def test_custom_keys(sample_data):
     assert dataset.system_prompt == "You are a helpful assistant."
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
diff --git a/tests/unit/data/hf_datasets/test_prompt_response.py b/tests/unit/data/hf_datasets/test_prompt_response.py
index 8ff7f5c5f6..cbf18977a4 100644
--- a/tests/unit/data/hf_datasets/test_prompt_response.py
+++ b/tests/unit/data/hf_datasets/test_prompt_response.py
@@ -76,6 +76,7 @@ def test_custom_keys(sample_data):
     assert dataset.output_key == "answer"
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
diff --git a/tests/unit/data/hf_datasets/test_squad.py b/tests/unit/data/hf_datasets/test_squad.py
index 5e736ee8ac..f5e01b250a 100644
--- a/tests/unit/data/hf_datasets/test_squad.py
+++ b/tests/unit/data/hf_datasets/test_squad.py
@@ -17,6 +17,7 @@
 from nemo_rl.data.hf_datasets.squad import SquadDataset
 
 
+@pytest.mark.hf_gated
 @pytest.mark.skip(reason="dataset download is flaky")
 def test_squad_dataset():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py
index 6f9c3f3129..91ae2e41b7 100644
--- a/tests/unit/data/test_llm_message_utils.py
+++ b/tests/unit/data/test_llm_message_utils.py
@@ -328,6 +328,7 @@ def test_batch_pad_message_log_custom_pad_value(
     )
 
 
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -372,6 +373,7 @@ def test_get_formatted_message_log_llama(
     assert actual_text == expected_text
 
 
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_add_generation_prompt_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -499,6 +501,7 @@ def test_get_formatted_message_log_add_generation_prompt_qwen(
     assert actual_text == expected_text
 
 
+@pytest.mark.hf_gated
 def test_formatted_message_log_empty_message():
     message_logs = [
         [
diff --git a/tests/unit/models/dtensor/test_parallelize.py b/tests/unit/models/dtensor/test_parallelize.py
index 5acb7addc4..192fa354ac 100644
--- a/tests/unit/models/dtensor/test_parallelize.py
+++ b/tests/unit/models/dtensor/test_parallelize.py
@@ -26,6 +26,7 @@
 )
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name, parallelize_func, sequence_parallel",
     [
diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py
index 8a38e5c61e..94280a03c5 100644
--- a/tests/unit/models/generation/test_vllm_generation.py
+++ b/tests/unit/models/generation/test_vllm_generation.py
@@ -1620,13 +1620,15 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data):
         # Enable packing during test
         os.environ["NEMO_RL_MEGATRON_IPC_TENSOR_PACKING_THRESHOLD"] = "1"
 
-        # Both policies must use the same model (Qwen2.5-0.5B) for weight transfer compatibility
-        model_name = "Qwen/Qwen2.5-0.5B"
+        # Both policies must use the same model for weight transfer compatibility
+        # NOTE: We have tried using Qwen/Qwen2.5-0.5B, but some small models exhibit
+        # variance depending on which hardware they run on.
+ model_name = "Qwen/Qwen3-0.6B" tokenizer = get_tokenizer({"name": model_name}) # Create Policy megatron_config = get_basic_megatron_test_config( - tp=1, pp=1, precision="float32" + tp=1, pp=1, precision="bfloat16" ) megatron_config["model_name"] = model_name megatron_config["tokenizer"]["name"] = model_name @@ -1653,8 +1655,8 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data): output_ids = outputs["output_ids"] generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) assert generated_texts == [ - "Hello, my name is John. I am a", - "The capital of France is Paris. It is the", + "Hello, my name is Lina. I'm", + "The capital of France is Paris. The capital of", ], "Output should be the same as the expected output" finally: diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py index d24a0c0f31..7b93ef46d1 100644 --- a/tests/unit/models/generation/test_vllm_large_model.py +++ b/tests/unit/models/generation/test_vllm_large_model.py @@ -125,6 +125,7 @@ def test_input_data(tokenizer): # skip this test for now @pytest.mark.skip(reason="Skipping large model test until we have resources in CI.") +@pytest.mark.hf_gated @pytest.mark.asyncio @pytest.mark.parametrize("tensor_parallel_size", [4, 8]) @pytest.mark.parametrize("pipeline_parallel_size", [2]) diff --git a/tests/unit/models/huggingface/test_common.py b/tests/unit/models/huggingface/test_common.py index faf06fbdb7..95da64b0b4 100644 --- a/tests/unit/models/huggingface/test_common.py +++ b/tests/unit/models/huggingface/test_common.py @@ -17,6 +17,7 @@ from nemo_rl.models.huggingface.common import ModelFlag, is_gemma_model +@pytest.mark.hf_gated @pytest.mark.parametrize( "model_name", [ @@ -42,6 +43,7 @@ def test_gemma_models(model_name): assert ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(model_name) +@pytest.mark.hf_gated @pytest.mark.parametrize( "model_name", [ diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index c176082698..33a91c37eb 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -31,12 +31,11 @@ from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.lm_policy import Policy -from tests.unit.conftest import TEST_ASSETS from tests.unit.test_utils import SimpleLoss def create_test_config( - model_name: str = TEST_ASSETS.TINY_LLAMA_MODEL_PATH, + model_name: str, tp: int = 1, cp: int = 1, sequence_parallel: bool = False, @@ -145,9 +144,9 @@ def gc_collect(): @pytest.fixture -def policy_setup(two_gpu_virtual_cluster): +def policy_setup(two_gpu_virtual_cluster, tiny_llama_model_path): """Setup and teardown for policy tests - creates a virtual cluster and policy.""" - config = create_test_config() + config = create_test_config(tiny_llama_model_path) tokenizer = get_tokenizer(config["tokenizer"]) config["generation"] = configure_generation_config(config["generation"], tokenizer) @@ -160,6 +159,7 @@ def policy_setup(two_gpu_virtual_cluster): policy.shutdown() +@pytest.mark.hf_gated @pytest.mark.timeout(180) def test_lm_policy_init(policy_setup): policy = policy_setup @@ -240,9 +240,17 @@ def test_lm_policy_init(policy_setup): @pytest.fixture def training_setup(request, two_gpu_virtual_cluster): """Setup and teardown specifically for training tests.""" - model_name, tp, cp, sequence_parallel, cpu_offload, 
activation_checkpointing = ( - request.param - ) + ( + model_fixture_name, + tp, + cp, + sequence_parallel, + cpu_offload, + activation_checkpointing, + ) = request.param + + # Get the actual model path from the requested fixture + model_name = request.getfixturevalue(model_fixture_name) policy = None data = None loss_fn = None @@ -299,37 +307,38 @@ def training_setup(request, two_gpu_virtual_cluster): policy.shutdown() +@pytest.mark.hf_gated @pytest.mark.timeout(60) @pytest.mark.parametrize( "training_setup", [ - # model_name tp cp sp cpu act - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, True, False, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, True, False), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 1, False, False, True), - (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, False, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 2, False, False, False), + # model_fixture_name tp cp sp cpu act + ("tiny_llama_model_path", 1, 1, False, False, False), + ("tiny_llama_model_path", 1, 1, True, False, False), + ("tiny_llama_model_path", 1, 1, False, True, False), + ("tiny_llama_model_path", 1, 1, False, False, True), + ("tiny_llama_model_path", 1, 2, False, False, False), + ("tiny_qwen2_model_path", 1, 1, True, True, False), + ("tiny_qwen2_model_path", 1, 1, True, False, True), + ("tiny_qwen2_model_path", 1, 1, False, True, True), + ("tiny_qwen2_model_path", 1, 1, True, True, True), + ("tiny_qwen2_model_path", 1, 2, False, False, False), + ("tiny_qwen3_model_path", 1, 1, True, True, False), + ("tiny_qwen3_model_path", 1, 1, True, False, True), + ("tiny_qwen3_model_path", 1, 1, False, True, True), + ("tiny_qwen3_model_path", 1, 1, True, True, True), + ("tiny_qwen3_model_path", 1, 2, False, False, False), ( - TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, + "tiny_gemma3_model_path", 1, 1, True, True, False, ), # gemma3 doesn't support spda - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, True, False, True), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, False, True, True), - (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 1, 1, True, True, True), + ("tiny_gemma3_model_path", 1, 1, True, False, True), + ("tiny_gemma3_model_path", 1, 1, False, True, True), + ("tiny_gemma3_model_path", 1, 1, True, True, True), # CP doesn't support gemma3 due to spda input has attent_mask != None. 
     ],
     indirect=True,
@@ -372,9 +381,17 @@ def verify_loss_tensor(loss_tensor):
 
 
 @pytest.fixture
 def logprob_setup(request, two_gpu_virtual_cluster):
     """Setup and teardown specifically for training tests."""
-    model_name, tp, cp, sequence_parallel, cpu_offload, activation_checkpointing = (
-        request.param
-    )
+    (
+        model_fixture_name,
+        tp,
+        cp,
+        sequence_parallel,
+        cpu_offload,
+        activation_checkpointing,
+    ) = request.param
+
+    # Get the actual model path from the requested fixture
+    model_name = request.getfixturevalue(model_fixture_name)
     policy = None
     data = None
@@ -449,28 +466,29 @@ def logprob_setup(request, two_gpu_virtual_cluster):
     policy.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(360)
 @pytest.mark.parametrize(
     "logprob_setup",
     [
         # TP=2, CP=1
-        (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 2, 1, False, True, False),
-        (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 2, 1, False, False, False),
-        (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, False, False),
-        (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, True, False),
-        (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, 1, False, True, True),
-        (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, 1, False, True, False),
-        (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, 1, False, False, False),
-        (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 2, 1, False, True, False),
-        (TEST_ASSETS.TINY_GEMMA3_MODEL_PATH, 2, 1, False, False, False),
+        ("tiny_qwen2_model_path", 2, 1, False, True, False),
+        ("tiny_qwen2_model_path", 2, 1, False, False, False),
+        ("tiny_llama_model_path", 2, 1, False, False, False),
+        ("tiny_llama_model_path", 2, 1, False, True, False),
+        ("tiny_llama_model_path", 2, 1, False, True, True),
+        ("tiny_qwen3_model_path", 2, 1, False, True, False),
+        ("tiny_qwen3_model_path", 2, 1, False, False, False),
+        ("tiny_gemma3_model_path", 2, 1, False, True, False),
+        ("tiny_gemma3_model_path", 2, 1, False, False, False),
         # TP=1, CP=2
-        (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, True, False),
-        (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, 2, False, False, False),
-        (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, False, False),
-        (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, True, False),
-        (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 1, 2, False, True, True),
-        (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 2, False, True, False),
-        (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, 2, False, False, False),
+        ("tiny_qwen2_model_path", 1, 2, False, True, False),
+        ("tiny_qwen2_model_path", 1, 2, False, False, False),
+        ("tiny_llama_model_path", 1, 2, False, False, False),
+        ("tiny_llama_model_path", 1, 2, False, True, False),
+        ("tiny_llama_model_path", 1, 2, False, True, True),
+        ("tiny_qwen3_model_path", 1, 2, False, True, False),
+        ("tiny_qwen3_model_path", 1, 2, False, False, False),
     ],
     indirect=True,
 )
@@ -491,7 +509,10 @@ def test_dtensor_worker_logprob_tp2_or_cp2_matches_unsharded(logprob_setup):
     )
 
 
-def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_cluster):
+@pytest.mark.hf_gated
+def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(
+    two_gpu_virtual_cluster, tiny_llama_tied_model_path
+):
     """Test that DTensor with a tp > 1 and a tied model with a custom parallel plan works."""
     from torch.distributed.tensor.parallel import ColwiseParallel
     from torch.distributed.tensor.placement_types import Replicate
@@ -501,7 +522,7 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_clu
         "model.embed_tokens": ColwiseParallel(output_layouts=Replicate()),
     }
     config = create_test_config(
-        model_name=TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH,
+        model_name=tiny_llama_tied_model_path,
         tp=2,
         cp=1,
         sequence_parallel=False,
@@ -534,8 +555,11 @@ def test_dtensor_tp_and_tied_model_with_custom_parallel_plan(two_gpu_virtual_clu
     policy.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(180)
-def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cluster):
+def test_dtensor_loss_independent_of_microbatch_size_two_gpus(
+    two_gpu_virtual_cluster, tiny_llama_model_path
+):
     """Tests that changing microbatch size while keeping global batch size constant does not affect loss values in DTensor."""
     # Create test batch with global batch size of 8
     global_batch_size = 8
@@ -569,7 +593,7 @@ def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cl
     )
 
     # Test with mbs=1, 2 microbatches per GPU
-    config = create_test_config()
+    config = create_test_config(tiny_llama_model_path)
     tokenizer = get_tokenizer(config["tokenizer"])
 
     print("Creating training Policy with mbs=1...")
@@ -605,7 +629,7 @@ def test_dtensor_loss_independent_of_microbatch_size_two_gpus(two_gpu_virtual_cl
     policy_mbs1.worker_group.shutdown()
 
     # Test with mbs=2, 1 microbatch per GPU
-    config = create_test_config()
+    config = create_test_config(tiny_llama_model_path)
     config["train_micro_batch_size"] = 2
     config["generation"] = configure_generation_config(config["generation"], tokenizer)
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index a399bca0d5..38607ba59f 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -28,12 +28,11 @@
 from nemo_rl.models.generation import configure_generation_config
 from nemo_rl.models.policy import PolicyConfig
 from nemo_rl.models.policy.lm_policy import Policy
-from tests.unit.conftest import TEST_ASSETS
 from tests.unit.test_utils import SimpleLoss
 
 
 def create_megatron_test_config(
-    model_name: str = TEST_ASSETS.TINY_LLAMA_MODEL_PATH,
+    model_name: str,
     tp: int = 1,
     pp: int = 1,
     precision: str = "float32",
@@ -152,7 +151,7 @@ def gc_collect():
 
 
 @pytest.fixture
-def policy_setup(request):
+def policy_setup(request, tiny_llama_model_path):
     """Setup and teardown for policy tests - creates a virtual cluster and policy."""
     # Get parameters from request
     if hasattr(request, "param") and request.param is not None:
@@ -177,7 +176,7 @@
         max_colocated_worker_groups=1,
     )
 
-    config = create_megatron_test_config(tp=tp, pp=pp)
+    config = create_megatron_test_config(tiny_llama_model_path, tp=tp, pp=pp)
     tokenizer = get_tokenizer(config["tokenizer"])
     config["generation"] = configure_generation_config(
         config["generation"], tokenizer
@@ -199,18 +198,21 @@
 @pytest.fixture
 def training_setup(request):
     """Setup and teardown specifically for training tests."""
-    # Parse parameters: (num_gpus, tp, pp, model_name, config_updates)
+    # Parse parameters: (num_gpus, tp, pp, model_fixture_name, config_updates)
     if hasattr(request, "param") and request.param is not None:
-        num_gpus, tp, pp, model_name, config_updates = request.param
+        num_gpus, tp, pp, model_fixture_name, config_updates = request.param
     else:
-        num_gpus, tp, pp, model_name, config_updates = (
+        num_gpus, tp, pp, model_fixture_name, config_updates = (
             2,
             1,
             1,
-            TEST_ASSETS.TINY_LLAMA_MODEL_PATH,
+            "tiny_llama_model_path",
             {},
         )
 
+    # Get the actual model path from the requested fixture
+    model_name = request.getfixturevalue(model_fixture_name)
+
     policy = None
     cluster = None
     data = None
@@ -317,24 +319,25 @@ def training_setup(request):
     cluster.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 @pytest.mark.parametrize(
     "training_setup",
     [
-        # (num_gpus, tp, pp, model_name, config_updates)
-        (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {}),
-        (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {}),
-        (2, 1, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH, {}),
-        (2, 2, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH, {}),
-        (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {"precision": "bfloat16"}),
+        # (num_gpus, tp, pp, model_fixture_name, config_updates)
+        (2, 1, 1, "tiny_llama_model_path", {}),
+        (2, 2, 1, "tiny_llama_model_path", {}),
+        (2, 1, 1, "tiny_qwen2_model_path", {}),
+        (2, 2, 1, "tiny_qwen2_model_path", {}),
+        (2, 1, 1, "tiny_llama_model_path", {"precision": "bfloat16"}),
         (
             2,
             1,
             1,
-            TEST_ASSETS.TINY_LLAMA_MODEL_PATH,
+            "tiny_llama_model_path",
             {"activation_checkpointing": True},
         ),
-        (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH, {"sequence_parallel": True}),
+        (2, 2, 1, "tiny_llama_model_path", {"sequence_parallel": True}),
     ],
     indirect=True,
     ids=[
@@ -386,7 +389,7 @@ def verify_loss_tensor(loss_tensor):
 
 
 @pytest.fixture
-def generation_setup(request):
+def generation_setup(request, tiny_llama_model_path):
     """Setup and teardown specifically for generation tests."""
     # Parse parameters: (num_gpus, tp, pp, generation_backend)
     if hasattr(request, "param") and request.param is not None:
@@ -415,6 +418,7 @@
     )
 
     config = create_megatron_test_config(
+        tiny_llama_model_path,
         tp=tp,
         pp=pp,
         generation_backend=generation_backend,
@@ -536,11 +540,14 @@ def test_megatron_policy_generation(generation_setup):
 
 
 @pytest.fixture
 def logprob_setup(request):
     """Setup and teardown specifically for logprob tests."""
-    # Parse parameters: (num_gpus, tp, pp, model_name)
+    # Parse parameters: (num_gpus, tp, pp, model_fixture_name)
     if hasattr(request, "param") and request.param is not None:
-        num_gpus, tp, pp, model_name = request.param
+        num_gpus, tp, pp, model_fixture_name = request.param
     else:
-        num_gpus, tp, pp, model_name = 2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH
+        num_gpus, tp, pp, model_fixture_name = 2, 1, 1, "tiny_llama_model_path"
+
+    # Get the actual model path from the requested fixture
+    model_name = request.getfixturevalue(model_fixture_name)
 
     policy = None
     cluster = None
@@ -616,14 +623,15 @@ def logprob_setup(request):
 
 
 @pytest.mark.timeout(180)
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "logprob_setup",
     [
-        # (num_gpus, tp, pp, model_name)
-        (2, 1, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH),
-        (2, 2, 1, TEST_ASSETS.TINY_LLAMA_MODEL_PATH),
-        (2, 1, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH),
-        (2, 2, 1, TEST_ASSETS.TINY_QWEN2_MODEL_PATH),
+        # (num_gpus, tp, pp, model_fixture_name)
+        (2, 1, 1, "tiny_llama_model_path"),
+        (2, 2, 1, "tiny_llama_model_path"),
+        (2, 1, 1, "tiny_qwen2_model_path"),
+        (2, 2, 1, "tiny_qwen2_model_path"),
     ],
     indirect=True,
     ids=["2gpu_dp2_llama", "2gpu_tp2_llama", "2gpu_dp2_qwen2", "2gpu_tp2_qwen2"],
@@ -656,7 +664,8 @@ def test_megatron_policy_logprobs(logprob_setup):
 
 
 @pytest.mark.timeout(240)
-def test_megatron_loss_independent_of_microbatch_size():
+@pytest.mark.hf_gated
+def test_megatron_loss_independent_of_microbatch_size(tiny_llama_model_path):
     """Test that changing microbatch size while keeping global batch size constant does not affect loss values."""
     num_gpus = 2
     global_batch_size = 8
@@ -697,7 +706,7 @@ def test_megatron_loss_independent_of_microbatch_size():
         max_colocated_worker_groups=1,
     )
 
-    config1 = create_megatron_test_config()
+    config1 = create_megatron_test_config(tiny_llama_model_path)
     config1["train_micro_batch_size"] = 1
     tokenizer = get_tokenizer(config1["tokenizer"])
     config1["generation"] = configure_generation_config(
@@ -745,7 +754,7 @@ def test_megatron_loss_independent_of_microbatch_size():
         max_colocated_worker_groups=1,
     )
 
-    config2 = create_megatron_test_config()
+    config2 = create_megatron_test_config(tiny_llama_model_path)
     config2["train_micro_batch_size"] = 2
     config2["generation"] = configure_generation_config(
         config2["generation"], tokenizer
@@ -774,7 +783,8 @@ def test_megatron_loss_independent_of_microbatch_size():
 
 
 @pytest.mark.timeout(300)
-def test_megatron_reference_policy_functionality():
+@pytest.mark.hf_gated
+def test_megatron_reference_policy_functionality(tiny_llama_model_path):
     """Test Megatron reference policy functionality."""
     num_gpus = 2
 
@@ -786,7 +796,7 @@ def test_megatron_reference_policy_functionality():
         max_colocated_worker_groups=1,
     )
 
-    config = create_megatron_test_config()
+    config = create_megatron_test_config(tiny_llama_model_path)
     config["megatron_cfg"]["optimizer"]["lr"] = 1e-2  # Increase from 5e-6 to 1e-2
     config["megatron_cfg"]["optimizer"]["min_lr"] = 1e-3  # Increase min_lr as well
@@ -894,6 +904,7 @@ def test_megatron_reference_policy_functionality():
 
 
 @pytest.mark.timeout(400)
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "num_gpus,tp,pp",
     [
@@ -903,12 +914,14 @@
     ],
     ids=["2gpu_dp2_save_restore", "2gpu_pp2_save_restore", "2gpu_tp2_save_restore"],
 )
-def test_megatron_checkpoint_save_kill_and_restore(num_gpus, tp, pp):
+def test_megatron_checkpoint_save_kill_and_restore(
+    num_gpus, tp, pp, tiny_llama_model_path
+):
     """Test full checkpoint save/restore cycle: save -> kill worker -> restart -> verify restore."""
     from copy import deepcopy
 
     # Use tiny model for faster testing
-    model_name = TEST_ASSETS.TINY_LLAMA_MODEL_PATH
+    model_name = tiny_llama_model_path
     tokenizer = get_tokenizer({"name": model_name})
 
     with tempfile.TemporaryDirectory(prefix="megatron_save_restore_") as temp_dir:
@@ -1146,7 +1159,8 @@ def test_megatron_checkpoint_save_kill_and_restore(num_gpus, tp, pp):
 
 
 @pytest.mark.timeout(300)
-def test_megatron_dpo_training():
+@pytest.mark.hf_gated
+def test_megatron_dpo_training(tiny_llama_model_path):
     """Test DPO training with Megatron backend."""
     num_gpus = 2
     batch_size = 8
@@ -1184,7 +1198,7 @@ def test_megatron_dpo_training():
         max_colocated_worker_groups=1,
     )
 
-    config = create_megatron_test_config()
+    config = create_megatron_test_config(tiny_llama_model_path)
     tokenizer = get_tokenizer(config["tokenizer"])
 
     policy = Policy(
@@ -1242,7 +1256,8 @@ def test_megatron_dpo_training():
 
 
 @pytest.mark.timeout(300)
-def test_megatron_sft_training():
+@pytest.mark.hf_gated
+def test_megatron_sft_training(tiny_llama_model_path):
     """Test SFT training with Megatron backend."""
     num_gpus = 2
     batch_size = 8
@@ -1277,7 +1292,7 @@ def test_megatron_sft_training():
         max_colocated_worker_groups=1,
     )
 
-    config = create_megatron_test_config()
+    config = create_megatron_test_config(tiny_llama_model_path)
     tokenizer = get_tokenizer(config["tokenizer"])
 
     policy = Policy(
@@ -1323,8 +1338,9 @@
     cluster.shutdown()
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
-def test_megatron_context_parallel_logprob_agreement():
+def test_megatron_context_parallel_logprob_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce identical logprobs with sequence packing enabled."""
     num_gpus = 2
     batch_size = 4
@@ -1362,7 +1378,9 @@ def test_megatron_context_parallel_logprob_agreement():
         max_colocated_worker_groups=1,
     )
 
-    config_no_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_no_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Ensure context parallel is disabled
     config_no_cp["megatron_cfg"]["context_parallel_size"] = 1
@@ -1442,7 +1460,9 @@ def test_megatron_context_parallel_logprob_agreement():
         max_colocated_worker_groups=1,
     )
 
-    config_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Enable context parallel
     config_cp["megatron_cfg"]["context_parallel_size"] = 2
@@ -1521,8 +1541,9 @@ def test_megatron_context_parallel_logprob_agreement():
     )
 
 
+@pytest.mark.hf_gated
 @pytest.mark.timeout(300)
-def test_megatron_context_parallel_training_agreement():
+def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
     """Test that CP and non-CP models produce consistent training results with ClippedPG loss and sequence packing."""
     num_gpus = 2
     batch_size = 2
@@ -1580,7 +1601,9 @@ def test_megatron_context_parallel_training_agreement():
         max_colocated_worker_groups=1,
     )
 
-    config_no_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_no_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Ensure context parallel is disabled
     config_no_cp["megatron_cfg"]["context_parallel_size"] = 1
     config_no_cp["train_global_batch_size"] = 2
@@ -1642,7 +1665,9 @@ def test_megatron_context_parallel_training_agreement():
         max_colocated_worker_groups=1,
     )
 
-    config_cp = create_megatron_test_config(tp=1, pp=1, precision="bfloat16")
+    config_cp = create_megatron_test_config(
+        tiny_llama_model_path, tp=1, pp=1, precision="bfloat16"
+    )
     # Enable context parallel
     config_cp["megatron_cfg"]["context_parallel_size"] = 2
     config_cp["train_global_batch_size"] = 2
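
For illustration only (not part of the patch), a minimal sketch of how a test opts into
the collection scheme introduced above. The test names, model choice, and assertions
below are hypothetical; only the markers and CLI flags come from this change:

    import pytest
    from transformers import AutoTokenizer


    @pytest.mark.hf_gated  # dropped in Configuration 1; collected when --hf-gated is passed
    def test_gated_tokenizer_loads():
        # meta-llama checkpoints are gated, so this download requires HF token access
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
        assert tokenizer.bos_token is not None


    @pytest.mark.mcore
    @pytest.mark.hf_gated  # collected only in Configuration 4 (--mcore-only --hf-gated)
    def test_gated_mcore_feature():
        ...

Note that pytest_collection_modifyitems returns early when an explicit -m expression is
given, so invocations such as `uv run pytest -m hf_gated` keep stock pytest marker
behavior.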