4 changes: 2 additions & 2 deletions .github/workflows/cicd-main.yml
@@ -221,8 +221,8 @@ jobs:
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
-          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl -m \"not mcore\"
-          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json -m mcore
+          uv run --no-sync bash -x ./tests/run_unit.sh --cov=nemo_rl --hf-gated
+          uv run --extra mcore bash -x ./tests/run_unit.sh --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only
         else
           echo Skipping unit tests for docs-only level
         fi
17 changes: 11 additions & 6 deletions docs/testing.md
@@ -10,14 +10,19 @@ Unit tests require 2 GPUs to test the full suite.
 
 ```sh
 # Run the unit tests using local GPUs
+
+# Configuration 1: Default tests only - excludes both hf_gated and mcore tests
 uv run --group test bash tests/run_unit.sh
-```
 
-:::{note}
-Tests can also be run on Slurm with `ray.sub`, but note that some tests will be skipped
-due to no GPUs being located on the head node. To run the full suite of tests, please
-launch on a regular GPU allocation.
-:::
+# Configuration 2: Default + HF gated tests, excluding mcore tests
+uv run --group test bash tests/run_unit.sh --hf-gated
+
+# Configuration 3: ONLY mcore tests, excluding ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only
+
+# Configuration 4: ONLY mcore tests, including ones with hf_gated
+uv run --extra mcore --group test bash tests/run_unit.sh --mcore-only --hf-gated
+```
 
 ### Run Unit Tests in a Hermetic Environment
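For orientation, the four configurations these flags select reduce to a single marker predicate. The sketch below restates the collection filter this PR adds in `tests/unit/conftest.py` as a standalone function; the name `is_selected` and the `markers` set are illustrative only, not part of the diff:

```python
# Standalone restatement of the collection filter added in tests/unit/conftest.py.
# `markers` is the set of marker names on a test item; the flags mirror the
# --mcore-only / --hf-gated CLI options.
def is_selected(markers: set[str], mcore_only: bool = False, hf_gated: bool = False) -> bool:
    if mcore_only and hf_gated:
        return "mcore" in markers  # Configuration 4
    if mcore_only:
        return "mcore" in markers and "hf_gated" not in markers  # Configuration 3
    if hf_gated:
        return "mcore" not in markers  # Configuration 2
    return "mcore" not in markers and "hf_gated" not in markers  # Configuration 1
```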
1 change: 1 addition & 0 deletions pyproject.toml
@@ -174,6 +174,7 @@ testpaths = ["tests"]
 python_files = "test_*.py"
 markers = [
     "mcore: marks tests that require the mcore extra",
+    "hf_gated: marks tests that require HuggingFace token access for gated models",
 ]
 
 [tool.coverage.run]
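Registering the marker here keeps `--strict-markers` runs clean and makes it usable with pytest's standard `-m` selection, which (per the conftest.py change below) bypasses the custom filtering entirely. A minimal usage sketch; the test name and gated-checkpoint access are assumptions, not part of the diff:

```python
import pytest


@pytest.mark.hf_gated  # assumes HF_TOKEN grants access to the gated repo
def test_gated_tokenizer_loads():
    from transformers import AutoTokenizer

    # meta-llama checkpoints are gated, so this only passes with token access
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    assert tokenizer is not None
```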
4 changes: 4 additions & 0 deletions tests/unit/algorithms/test_utils.py
@@ -77,6 +77,7 @@ def get_format_with_simple_role_header(messages):
     return message
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_no_chat_template(conversation_messages):
     """Test get_tokenizer when no chat template is specified in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct"}
@@ -89,6 +90,7 @@ def test_get_tokenizer_no_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_default_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is 'default' in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": "default"}
@@ -100,6 +102,7 @@ def test_get_tokenizer_default_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_null_chat_template(conversation_messages):
     """Test get_tokenizer when chat_template is None in config"""
     config = {"name": "meta-llama/Llama-3.2-1B-Instruct", "chat_template": None}
@@ -113,6 +116,7 @@ def test_get_tokenizer_null_chat_template(conversation_messages):
     assert formatted == expected
 
 
+@pytest.mark.hf_gated
 def test_get_tokenizer_custom_jinja_template(conversation_messages):
     """Test get_tokenizer when a custom jinja template is specified"""
     custom_template = COMMON_CHAT_TEMPLATES.simple_role_header
102 changes: 73 additions & 29 deletions tests/unit/conftest.py
@@ -30,32 +30,70 @@
 
 dir_path = os.path.dirname(os.path.abspath(__file__))
 
+
+def pytest_addoption(parser):
+    """Add custom command line options for controlling test execution."""
+    parser.addoption(
+        "--hf-gated",
+        action="store_true",
+        default=False,
+        help="Include tests that require HuggingFace token access",
+    )
+    parser.addoption(
+        "--mcore-only",
+        action="store_true",
+        default=False,
+        help="Run ONLY mcore tests (combine with --hf-gated to include mcore+hf_gated tests)",
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    """Modify test collection to skip tests based on markers unless explicitly requested."""
+    run_hf_gated = config.getoption("--hf-gated")
+    run_mcore_only = config.getoption("--mcore-only")
+    marker_expr = config.getoption("-m", default="")
+
+    # If user specified -m marker expressions, let pytest handle everything normally
+    if marker_expr:
+        return
+
+    # Filter tests based on the desired configurations
+    new_items = []
+
+    if run_mcore_only and run_hf_gated:
+        # Configuration 4: Only mcore tests, including ones with hf_gated
+        new_items = [item for item in items if item.get_closest_marker("mcore")]
+    elif run_mcore_only:
+        # Configuration 3: Only mcore tests, excluding ones with hf_gated
+        new_items = [
+            item
+            for item in items
+            if item.get_closest_marker("mcore")
+            and not item.get_closest_marker("hf_gated")
+        ]
+    elif run_hf_gated:
+        # Configuration 2: Default tests + hf_gated tests, excluding mcore
+        new_items = [item for item in items if not item.get_closest_marker("mcore")]
+    else:
+        # Configuration 1: Default only - exclude both hf_gated and mcore
+        new_items = [
+            item
+            for item in items
+            if not item.get_closest_marker("hf_gated")
+            and not item.get_closest_marker("mcore")
+        ]
+
+    # Update the items list in-place
+    items[:] = new_items
+
+
 TEST_ASSETS_DIR = os.path.join(dir_path, "test_assets")
 UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
 UNIT_RESULTS_FILE_DATED = os.path.join(
     dir_path, f"unit_results/{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
 )
 
-
-# Mapping between asset and absolute path (each are populated from a session level fixture)
-class TEST_ASSETS:
-    TINY_LLAMA_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer"
-    )
-    TINY_LLAMA_TIED_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer"
-    )
-    TINY_QWEN2_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer"
-    )
-    TINY_QWEN3_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer"
-    )
-    TINY_GEMMA3_MODEL_PATH = os.path.join(
-        TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer"
-    )
-
 
 class UnitTestData(TypedDict):
     exit_status: int | str
     git_commit: str
@@ -198,6 +236,10 @@ def log_max_mem(self, metric_name: str):
     session_data["metrics"][qualified_name]["_elapsed"] = end_time - start_time
 
 
+def pytest_sessionstart(session):
+    os.makedirs(TEST_ASSETS_DIR, exist_ok=True)
+
+
 def pytest_sessionfinish(session, exitstatus):
     if not hasattr(session.config, "_unit_test_data"):
         return
@@ -394,14 +436,14 @@ def mock_2gpu_distributed_env():
 #######################
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_llama_model_path():
     """Fixture that returns a path to a tiny llama model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
 
-    model_path = TEST_ASSETS.TINY_LLAMA_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_llama_with_llama3.2_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer)
     config = LlamaConfig(
@@ -422,14 +464,16 @@ def tiny_llama_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_llama_tied_model_path():
     """Fixture that returns a path to a tiny llama model with tied embeddings and a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
 
-    model_path = TEST_ASSETS.TINY_LLAMA_TIED_MODEL_PATH
+    model_path = os.path.join(
+        TEST_ASSETS_DIR, "tiny_llama_tied_with_llama3.2_tokenizer"
+    )
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=128256 (so we can re-use llama3.2 1b tokenizer)
     config = LlamaConfig(
@@ -450,14 +494,14 @@ def tiny_llama_tied_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_qwen2_model_path():
     """Fixture that returns a path to a tiny qwen2 model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Qwen2Config, Qwen2ForCausalLM
 
-    model_path = TEST_ASSETS.TINY_QWEN2_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer)
     config = Qwen2Config(
@@ -478,14 +522,14 @@ def tiny_qwen2_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_qwen3_model_path():
     """Fixture that returns a path to a tiny qwen3 model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM
 
-    model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer)
     config = Qwen3Config(
@@ -506,14 +550,14 @@ def tiny_qwen3_model_path():
     yield model_path
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="session")
 def tiny_gemma3_model_path():
     """Fixture that returns a path to a tiny gemma3 model with a dummy tokenizer."""
     import shutil
 
     from transformers import AutoTokenizer, Gemma3ForCausalLM, Gemma3TextConfig
 
-    model_path = TEST_ASSETS.TINY_GEMMA3_MODEL_PATH
+    model_path = os.path.join(TEST_ASSETS_DIR, "tiny_gemma3_with_gemma3_tokenizer")
     # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention)
     # vocab_size=262144 so we can re-use gemma-3-1b tokenizer
     config = Gemma3TextConfig(
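With `autouse=True` dropped, the tiny-model checkpoints are built only when a test requests the corresponding fixture, while the new `pytest_sessionstart` hook guarantees the assets directory exists up front. A hypothetical consumer, assuming the fixture saves a tokenizer alongside the model as its asset name suggests:

```python
# Hypothetical test: with autouse removed, the session-scoped fixture runs
# (and builds the checkpoint under tests/unit/test_assets/) only on request.
from transformers import AutoTokenizer


def test_tiny_llama_assets(tiny_llama_model_path):
    # Later tests in the same session reuse the already-built path.
    tokenizer = AutoTokenizer.from_pretrained(tiny_llama_model_path)
    assert tokenizer.vocab_size > 0
```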
1 change: 1 addition & 0 deletions tests/unit/data/hf_datasets/test_oai_format_dataset.py
@@ -87,6 +87,7 @@ def test_custom_keys(sample_data):
     assert dataset.system_prompt == "You are a helpful assistant."
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
1 change: 1 addition & 0 deletions tests/unit/data/hf_datasets/test_prompt_response.py
@@ -76,6 +76,7 @@ def test_custom_keys(sample_data):
     assert dataset.output_key == "answer"
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize("sample_data", [("question", "answer")], indirect=True)
 def test_message_formatting(sample_data):
     train_path, val_path = sample_data
1 change: 1 addition & 0 deletions tests/unit/data/hf_datasets/test_squad.py
@@ -17,6 +17,7 @@
 from nemo_rl.data.hf_datasets.squad import SquadDataset
 
 
+@pytest.mark.hf_gated
 @pytest.mark.skip(reason="dataset download is flaky")
 def test_squad_dataset():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
3 changes: 3 additions & 0 deletions tests/unit/data/test_llm_message_utils.py
@@ -328,6 +328,7 @@ def test_batch_pad_message_log_custom_pad_value(
     )
 
 
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -372,6 +373,7 @@ def test_get_formatted_message_log_llama(
     assert actual_text == expected_text
 
 
+@pytest.mark.hf_gated
 def test_get_formatted_message_log_add_generation_prompt_llama(
     raw_chat_message_log: LLMMessageLogType,
 ) -> None:
@@ -499,6 +501,7 @@ def test_get_formatted_message_log_add_generation_prompt_qwen(
     assert actual_text == expected_text
 
 
+@pytest.mark.hf_gated
 def test_formatted_message_log_empty_message():
     message_logs = [
         [
1 change: 1 addition & 0 deletions tests/unit/models/dtensor/test_parallelize.py
@@ -26,6 +26,7 @@
 )
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name, parallelize_func, sequence_parallel",
     [
12 changes: 7 additions & 5 deletions tests/unit/models/generation/test_vllm_generation.py
@@ -1620,13 +1620,15 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data):
         # Enable packing during test
         os.environ["NEMO_RL_MEGATRON_IPC_TENSOR_PACKING_THRESHOLD"] = "1"
 
-        # Both policies must use the same model (Qwen2.5-0.5B) for weight transfer compatibility
-        model_name = "Qwen/Qwen2.5-0.5B"
+        # Both policies must use the same model for weight transfer compatibility
+        # NOTE: We have tried using Qwen/Qwen2.5-0.5B, but some small models exhibit variance
+        # depending on which hardware they are run on.
+        model_name = "Qwen/Qwen3-0.6B"
         tokenizer = get_tokenizer({"name": model_name})
 
         # Create Policy
         megatron_config = get_basic_megatron_test_config(
-            tp=1, pp=1, precision="float32"
+            tp=1, pp=1, precision="bfloat16"
         )
         megatron_config["model_name"] = model_name
         megatron_config["tokenizer"]["name"] = model_name
@@ -1653,8 +1655,8 @@ def test_vllm_megatron_weight_update_with_packing(cluster, test_input_data):
         output_ids = outputs["output_ids"]
         generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
         assert generated_texts == [
-            "Hello, my name is John. I am a",
-            "The capital of France is Paris. It is the",
+            "Hello, my name is Lina. I'm",
+            "The capital of France is Paris. The capital of",
         ], "Output should be the same as the expected output"
 
     finally:
1 change: 1 addition & 0 deletions tests/unit/models/generation/test_vllm_large_model.py
@@ -125,6 +125,7 @@ def test_input_data(tokenizer):
 
 # skip this test for now
 @pytest.mark.skip(reason="Skipping large model test until we have resources in CI.")
+@pytest.mark.hf_gated
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tensor_parallel_size", [4, 8])
 @pytest.mark.parametrize("pipeline_parallel_size", [2])
2 changes: 2 additions & 0 deletions tests/unit/models/huggingface/test_common.py
@@ -17,6 +17,7 @@
 from nemo_rl.models.huggingface.common import ModelFlag, is_gemma_model
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [
@@ -42,6 +43,7 @@ def test_gemma_models(model_name):
     assert ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(model_name)
 
 
+@pytest.mark.hf_gated
 @pytest.mark.parametrize(
     "model_name",
     [