diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..ef8d92f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,137 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +The RAPIDS CLI is a command-line tool for performing common RAPIDS operations, primarily focused on +health checks (`rapids doctor`) and debugging (`rapids debug`). It uses a plugin system that allows +RAPIDS libraries to register their own health checks via Python entry points. + +## Common Commands + +### Development Setup + +```bash +# Install in editable mode +pip install -e . + +# Install with test dependencies +pip install -e .[test] +``` + +### Testing + +```bash +# Run all tests (coverage reporting is automatic via pyproject.toml) +pytest + +# Run a specific test file +pytest rapids_cli/tests/test_gpu.py + +# Run a specific test function +pytest rapids_cli/tests/test_gpu.py::test_gpu_check_success + +# Generate coverage report without running tests +coverage report + +# View detailed HTML coverage report +coverage html && open htmlcov/index.html +``` + +### Linting and Pre-commit + +```bash +# Install pre-commit hooks +pre-commit install + +# Run all pre-commit checks +pre-commit run --all-files + +# Individual linters +black . # Format code +ruff check --fix . # Lint with ruff +mypy rapids_cli/ # Type checking +``` + +### Running the CLI + +```bash +# Run doctor checks +rapids doctor +rapids doctor --verbose +rapids doctor --dry-run + +# Run debug command +rapids debug +rapids debug --json +``` + +## Architecture + +### CLI Structure + +- **Entry point**: `rapids_cli/cli.py` defines the main CLI group and subcommands using rich-click +- **Doctor command**: `rapids_cli/doctor/doctor.py` contains the health check orchestration logic +- **Debug command**: `rapids_cli/debug/debug.py` gathers system/environment information +- **Checks**: Individual checks live in `rapids_cli/doctor/checks/` (gpu.py, cuda_driver.py, memory.py, + nvlink.py) + +### Plugin System + +The doctor command discovers and runs checks via Python entry points defined in `pyproject.toml`: + +- Entry point group: `rapids_doctor_check` +- Built-in checks are registered in `[project.entry-points.rapids_doctor_check]` +- External packages can register additional checks by adding their own entry points +- Check functions receive `verbose` kwarg and should accept `**kwargs` for forward compatibility +- Checks pass by returning successfully (any return value) and fail by raising exceptions +- Checks can issue warnings using Python's `warnings.warn()` which are caught and displayed + +### Check Function Contract + +- Accept `verbose=False` and `**kwargs` parameters +- Raise exceptions with helpful error messages for failures +- Return successfully for passing checks (return value is optional string for verbose output) +- Use `warnings.warn()` for non-fatal issues + +### Key Dependencies + +- `rich` and `rich-click` for terminal output and CLI interface +- `pynvml` (nvidia-ml-py) for GPU information +- `cuda-pathfinder` for locating CUDA installations +- `psutil` for system memory checks + +### Configuration + +- Package configuration in `pyproject.toml` (build system, dependencies, entry points) +- CLI settings in `rapids_cli/config.yml` (loaded via `config.py`) +- Dependencies managed via `dependencies.yaml` and `rapids-dependency-file-generator` + +## Code Style + +- Python 3.10+ (minimum version) +- Line length: 120 characters +- Use Google-style docstrings (enforced by ruff with pydocstyle convention) +- Enforce type hints (checked by mypy) +- SPDX license headers required on all files (enforced by pre-commit hook) +- All commits must be signed off with `-s` flag + +## Testing Notes + +Tests are located in `rapids_cli/tests/`. The test suite runs quickly with 53 tests covering all +modules. GPU-based tests run in CI on actual GPU hardware (L4 instances). + +### Coverage Requirements + +- Minimum coverage threshold: **95%** +- Coverage is automatically measured when running `pytest` +- Coverage reports are generated in XML format for CI and terminal format for local development +- Test files and `_version.py` are excluded from coverage measurements + +## CI/CD + +- Pre-commit checks run on all PRs (black, ruff, mypy, shellcheck, etc.) +- Builds both conda packages (noarch: python) and wheels (pure Python) +- Tests run on GPU nodes with CUDA available +- Uses RAPIDS shared workflows for build and test automation diff --git a/conda/recipes/rapids-cli/recipe.yaml b/conda/recipes/rapids-cli/recipe.yaml index 6041cb5..4e3ad4a 100644 --- a/conda/recipes/rapids-cli/recipe.yaml +++ b/conda/recipes/rapids-cli/recipe.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. # SPDX-License-Identifier: Apache-2.0 schema_version: 1 @@ -35,6 +35,7 @@ requirements: - nvidia-ml-py >=12.0 - packaging - psutil + - pyyaml - rich - rich-click diff --git a/dependencies.yaml b/dependencies.yaml index ee2c3f8..63e2900 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + # Dependency list for https://github.com/rapidsai/dependency-file-generator files: py_run_rapids_cli: @@ -62,6 +65,7 @@ dependencies: - cuda-pathfinder >=1.2.3 - packaging - psutil + - pyyaml - rich - rich-click - output_types: [conda] @@ -76,3 +80,4 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - pytest + - pytest-cov diff --git a/pyproject.toml b/pyproject.toml index aa598d6..10f13aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "nvidia-ml-py>=12.0", "packaging", "psutil", + "pyyaml", "rich", "rich-click", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. @@ -19,6 +20,7 @@ dependencies = [ [project.optional-dependencies] test = [ "pytest", + "pytest-cov", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project.scripts] @@ -63,8 +65,9 @@ select = [ # PyPI hard limit is 1GiB, but try to keep this as small as possible max_allowed_size_compressed = '10Mi' -[tool.pytest] -testpaths = ["tests"] +[tool.pytest.ini_options] +testpaths = ["rapids_cli/tests"] +addopts = "--cov=rapids_cli --cov-report=term-missing --cov-report=xml --cov-fail-under=95" [tool.ruff] # Exclude a variety of commonly ignored directories. @@ -140,3 +143,21 @@ convention = "google" [tool.mypy] exclude = ["examples", "venv", "ci", "docs", "conftest.py"] ignore_missing_imports = true + +[tool.coverage.run] +source = ["rapids_cli"] +omit = [ + "rapids_cli/tests/*", + "rapids_cli/_version.py", +] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", + "@abstractmethod", +] diff --git a/rapids_cli/tests/test_cli.py b/rapids_cli/tests/test_cli.py new file mode 100644 index 0000000..6526de6 --- /dev/null +++ b/rapids_cli/tests/test_cli.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import patch + +from click.testing import CliRunner + +from rapids_cli.cli import debug, doctor, rapids + + +def test_rapids_cli_help(): + """Test rapids CLI help output.""" + runner = CliRunner() + result = runner.invoke(rapids, ["--help"]) + assert result.exit_code == 0 + assert "The Rapids CLI is a command-line interface for RAPIDS" in result.output + + +def test_doctor_command_help(): + """Test doctor command help output.""" + runner = CliRunner() + result = runner.invoke(rapids, ["doctor", "--help"]) + assert result.exit_code == 0 + assert "Run health checks" in result.output + + +def test_debug_command_help(): + """Test debug command help output.""" + runner = CliRunner() + result = runner.invoke(rapids, ["debug", "--help"]) + assert result.exit_code == 0 + assert "Gather debugging information" in result.output + + +def test_doctor_command_success(): + """Test doctor command with successful checks.""" + runner = CliRunner() + with patch("rapids_cli.cli.doctor_check", return_value=True): + result = runner.invoke(rapids, ["doctor"]) + assert result.exit_code == 0 + + +def test_doctor_command_failure(): + """Test doctor command with failed checks.""" + runner = CliRunner() + with patch("rapids_cli.cli.doctor_check", return_value=False): + result = runner.invoke(rapids, ["doctor"]) + assert result.exit_code == 1 + assert "Health checks failed" in result.output + + +def test_doctor_command_verbose(): + """Test doctor command with verbose flag.""" + runner = CliRunner() + with patch("rapids_cli.cli.doctor_check", return_value=True) as mock_check: + result = runner.invoke(rapids, ["doctor", "--verbose"]) + assert result.exit_code == 0 + mock_check.assert_called_once_with(True, False, ()) + + +def test_doctor_command_dry_run(): + """Test doctor command with dry-run flag.""" + runner = CliRunner() + with patch("rapids_cli.cli.doctor_check", return_value=True) as mock_check: + result = runner.invoke(rapids, ["doctor", "--dry-run"]) + assert result.exit_code == 0 + mock_check.assert_called_once_with(False, True, ()) + + +def test_doctor_command_with_filters(): + """Test doctor command with filters.""" + runner = CliRunner() + with patch("rapids_cli.cli.doctor_check", return_value=True) as mock_check: + result = runner.invoke(rapids, ["doctor", "cudf", "cuml"]) + assert result.exit_code == 0 + mock_check.assert_called_once_with(False, False, ("cudf", "cuml")) + + +def test_debug_command_console(): + """Test debug command with console output.""" + runner = CliRunner() + with patch("rapids_cli.cli.run_debug") as mock_debug: + result = runner.invoke(rapids, ["debug"]) + assert result.exit_code == 0 + mock_debug.assert_called_once_with(output_format="console") + + +def test_debug_command_json(): + """Test debug command with JSON output.""" + runner = CliRunner() + with patch("rapids_cli.cli.run_debug") as mock_debug: + result = runner.invoke(rapids, ["debug", "--json"]) + assert result.exit_code == 0 + mock_debug.assert_called_once_with(output_format="json") + + +def test_doctor_standalone(): + """Test doctor command as standalone function.""" + runner = CliRunner() + with patch("rapids_cli.cli.doctor_check", return_value=True): + result = runner.invoke(doctor) + assert result.exit_code == 0 + + +def test_debug_standalone(): + """Test debug command as standalone function.""" + runner = CliRunner() + with patch("rapids_cli.cli.run_debug"): + result = runner.invoke(debug) + assert result.exit_code == 0 diff --git a/rapids_cli/tests/test_config.py b/rapids_cli/tests/test_config.py new file mode 100644 index 0000000..fdb2ae2 --- /dev/null +++ b/rapids_cli/tests/test_config.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from rapids_cli.config import config + + +def test_config_loaded(): + """Test that config is loaded successfully.""" + assert config is not None + assert isinstance(config, dict) + + +def test_config_has_min_supported_versions(): + """Test that config contains minimum supported versions.""" + assert "min_supported_versions" in config + assert "gpu_compute_requirement" in config["min_supported_versions"] + + +def test_config_has_valid_subcommands(): + """Test that config contains valid subcommands.""" + assert "valid_subcommands" in config + assert "VALID_SUBCOMMANDS" in config["valid_subcommands"] + + +def test_config_has_os_requirements(): + """Test that config contains OS requirements.""" + assert "os_requirements" in config + assert "VALID_LINUX_OS_VERSIONS" in config["os_requirements"] + assert "OS_TO_MIN_SUPPORTED_VERSION" in config["os_requirements"] + + +def test_config_has_cudf_section(): + """Test that config contains cuDF section.""" + assert "cudf" in config + assert "cuda_requirement" in config["cudf"] + assert "driver_requirement" in config["cudf"] + assert "compute_requirement" in config["cudf"] + assert "links" in config["cudf"] + assert "description" in config["cudf"] + + +def test_config_has_cuml_section(): + """Test that config contains cuML section.""" + assert "cuml" in config + assert "links" in config["cuml"] + assert "description" in config["cuml"] diff --git a/rapids_cli/tests/test_debug.py b/rapids_cli/tests/test_debug.py new file mode 100644 index 0000000..91c330c --- /dev/null +++ b/rapids_cli/tests/test_debug.py @@ -0,0 +1,126 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +import json +from unittest.mock import MagicMock, patch + +from rapids_cli.debug.debug import ( + gather_command_output, + gather_cuda_version, + gather_package_versions, + gather_tools, + run_debug, +) + + +def test_gather_cuda_version(): + """Test CUDA version gathering.""" + with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040): + result = gather_cuda_version() + assert result == "12.4" + + +def test_gather_cuda_version_with_patch(): + """Test CUDA version with patch number.""" + with patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12345): + result = gather_cuda_version() + assert result == "12.34.5" + + +def test_gather_package_versions(): + """Test package version gathering.""" + result = gather_package_versions() + assert isinstance(result, dict) + assert len(result) > 0 + # Check that rapids-cli is in the installed packages + assert "rapids-cli" in result + + +def test_gather_command_output_success(): + """Test successful command output gathering.""" + result = gather_command_output(["echo", "test"]) + assert result == "test" + + +def test_gather_command_output_with_fallback(): + """Test command output with fallback.""" + result = gather_command_output(["nonexistent_command"], fallback_output="fallback") + assert result == "fallback" + + +def test_gather_command_output_no_fallback(): + """Test command output without fallback.""" + result = gather_command_output(["nonexistent_command"]) + assert result is None + + +def test_gather_tools(): + """Test tools gathering.""" + with ( + patch( + "rapids_cli.debug.debug.gather_command_output", + side_effect=lambda cmd, **kwargs: f"{cmd[0]} version", + ), + ): + result = gather_tools() + assert isinstance(result, dict) + assert "pip" in result + assert "conda" in result + assert "g++" in result + + +def test_run_debug_console(capsys): + """Test run_debug with console output.""" + mock_vm = MagicMock() + mock_vm.total = 32 * 1024**3 + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), + patch( + "cuda.pathfinder.find_nvidia_header_directory", + return_value="/usr/local/cuda/include", + ), + patch("pathlib.Path.glob", return_value=[]), + patch("rapids_cli.debug.debug.gather_package_versions", return_value={}), + patch("rapids_cli.debug.debug.gather_command_output", return_value=None), + patch("rapids_cli.debug.debug.gather_tools", return_value={}), + patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), + ): + run_debug(output_format="console") + + captured = capsys.readouterr() + assert "RAPIDS Debug Information" in captured.out + + +def test_run_debug_json(capsys): + """Test run_debug with JSON output.""" + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54.15"), + patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12040), + patch( + "cuda.pathfinder.find_nvidia_header_directory", + return_value="/usr/local/cuda/include", + ), + patch("pathlib.Path.glob", return_value=[]), + patch( + "rapids_cli.debug.debug.gather_package_versions", + return_value={"test": "1.0"}, + ), + patch( + "rapids_cli.debug.debug.gather_command_output", return_value="test output" + ), + patch("rapids_cli.debug.debug.gather_tools", return_value={"pip": "pip 23.0"}), + patch("pathlib.Path.read_text", return_value='NAME="Ubuntu"\nVERSION="22.04"'), + ): + run_debug(output_format="json") + + captured = capsys.readouterr() + output = json.loads(captured.out) + assert isinstance(output, dict) + assert "date" in output + assert "platform" in output + assert "driver_version" in output + assert "cuda_version" in output + assert "package_versions" in output diff --git a/rapids_cli/tests/test_doctor.py b/rapids_cli/tests/test_doctor.py new file mode 100644 index 0000000..7b9eb89 --- /dev/null +++ b/rapids_cli/tests/test_doctor.py @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +import warnings +from unittest.mock import MagicMock, patch + +from rapids_cli.doctor.doctor import CheckResult, doctor_check + + +def mock_passing_check(verbose=False, **kwargs): + """Mock check that passes.""" + return "Check passed" + + +def mock_failing_check(verbose=False, **kwargs): + """Mock check that fails.""" + raise ValueError("Check failed") + + +def mock_warning_check(verbose=False, **kwargs): + """Mock check that issues a warning.""" + warnings.warn("This is a warning", stacklevel=2) + return True + + +def test_doctor_check_all_pass(capsys): + """Test doctor_check with all checks passing.""" + mock_ep1 = MagicMock() + mock_ep1.name = "test_check_1" + mock_ep1.value = "test.module:check1" + mock_ep1.load.return_value = mock_passing_check + + mock_ep2 = MagicMock() + mock_ep2.name = "test_check_2" + mock_ep2.value = "test.module:check2" + mock_ep2.load.return_value = mock_passing_check + + with patch( + "rapids_cli.doctor.doctor.entry_points", return_value=[mock_ep1, mock_ep2] + ): + result = doctor_check(verbose=False, dry_run=False) + assert result is True + + captured = capsys.readouterr() + assert "All checks passed!" in captured.out + + +def test_doctor_check_with_failure(capsys): + """Test doctor_check with one check failing.""" + mock_ep1 = MagicMock() + mock_ep1.name = "passing_check" + mock_ep1.value = "test.module:check1" + mock_ep1.load.return_value = mock_passing_check + + mock_ep2 = MagicMock() + mock_ep2.name = "failing_check" + mock_ep2.value = "test.module:check2" + mock_ep2.load.return_value = mock_failing_check + + with patch( + "rapids_cli.doctor.doctor.entry_points", return_value=[mock_ep1, mock_ep2] + ): + result = doctor_check(verbose=False, dry_run=False) + assert result is False + + captured = capsys.readouterr() + assert "failing_check failed" in captured.out + + +def test_doctor_check_verbose(capsys): + """Test doctor_check with verbose flag.""" + mock_ep = MagicMock() + mock_ep.name = "test_check" + mock_ep.value = "test.module:check" + mock_ep.load.return_value = mock_passing_check + + with patch("rapids_cli.doctor.doctor.entry_points", return_value=[mock_ep]): + result = doctor_check(verbose=True, dry_run=False) + assert result is True + + captured = capsys.readouterr() + assert "Discovering checks" in captured.out + assert "Found check 'test_check'" in captured.out + assert "Discovered 1 checks" in captured.out + + +def test_doctor_check_dry_run(capsys): + """Test doctor_check with dry_run flag.""" + mock_ep = MagicMock() + mock_ep.name = "test_check" + mock_ep.value = "test.module:check" + mock_ep.load.return_value = mock_passing_check + + with patch("rapids_cli.doctor.doctor.entry_points", return_value=[mock_ep]): + result = doctor_check(verbose=False, dry_run=True) + assert result is True + + captured = capsys.readouterr() + assert "Dry run, skipping checks" in captured.out + + +def test_doctor_check_with_filters(capsys): + """Test doctor_check with filters.""" + mock_ep1 = MagicMock() + mock_ep1.name = "cudf_check" + mock_ep1.value = "cudf.module:check" + mock_ep1.load.return_value = mock_passing_check + + mock_ep2 = MagicMock() + mock_ep2.name = "cuml_check" + mock_ep2.value = "cuml.module:check" + mock_ep2.load.return_value = mock_passing_check + + with patch( + "rapids_cli.doctor.doctor.entry_points", return_value=[mock_ep1, mock_ep2] + ): + result = doctor_check(verbose=False, dry_run=False, filters=["cudf"]) + assert result is True + + +def test_doctor_check_with_warnings(capsys): + """Test doctor_check with checks that issue warnings.""" + mock_ep = MagicMock() + mock_ep.name = "warning_check" + mock_ep.value = "test.module:check" + mock_ep.load.return_value = mock_warning_check + + with patch("rapids_cli.doctor.doctor.entry_points", return_value=[mock_ep]): + result = doctor_check(verbose=False, dry_run=False) + assert result is True + + captured = capsys.readouterr() + assert "Warning" in captured.out + assert "This is a warning" in captured.out + + +def test_check_result_creation(): + """Test CheckResult dataclass creation.""" + result = CheckResult( + name="test_check", + description="Test check description", + status=True, + value="Success", + error=None, + warnings=None, + ) + assert result.name == "test_check" + assert result.description == "Test check description" + assert result.status is True + assert result.value == "Success" + assert result.error is None + assert result.warnings is None + + +def test_doctor_check_import_error(): + """Test that import errors are suppressed during check discovery.""" + mock_ep = MagicMock() + mock_ep.name = "broken_check" + mock_ep.value = "broken.module:check" + mock_ep.load.side_effect = ImportError("Module not found") + + with patch("rapids_cli.doctor.doctor.entry_points", return_value=[mock_ep]): + result = doctor_check(verbose=False, dry_run=False) + # Should still pass with no checks discovered + assert result is True diff --git a/rapids_cli/tests/test_gpu.py b/rapids_cli/tests/test_gpu.py new file mode 100644 index 0000000..a895bc2 --- /dev/null +++ b/rapids_cli/tests/test_gpu.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import patch + +import pytest + +from rapids_cli.doctor.checks.gpu import ( + REQUIRED_COMPUTE_CAPABILITY, + check_gpu_compute_capability, + gpu_check, +) + + +def test_gpu_check_success(): + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=2), + ): + result = gpu_check(verbose=True) + assert result == "GPU(s) detected: 2" + + +def test_gpu_check_no_gpus(): + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=0), + ): + with pytest.raises(AssertionError, match="No GPUs detected"): + gpu_check(verbose=False) + + +def test_gpu_check_nvml_error(): + import pynvml + + with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): + with pytest.raises(ValueError, match="No available GPUs detected"): + gpu_check(verbose=False) + + +def test_check_gpu_compute_capability_success(): + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=2), + patch("pynvml.nvmlDeviceGetHandleByIndex"), + patch( + "pynvml.nvmlDeviceGetCudaComputeCapability", + return_value=(REQUIRED_COMPUTE_CAPABILITY, 5), + ), + ): + result = check_gpu_compute_capability(verbose=True) + assert result is True + + +def test_check_gpu_compute_capability_insufficient(): + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlDeviceGetHandleByIndex"), + patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(6, 0)), + ): + with pytest.raises( + ValueError, + match=f"GPU 0 requires compute capability {REQUIRED_COMPUTE_CAPABILITY}", + ): + check_gpu_compute_capability(verbose=False) + + +def test_check_gpu_compute_capability_no_gpu(): + import pynvml + + with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): + with pytest.raises( + ValueError, match="No GPU - cannot determine GPU Compute Capability" + ): + check_gpu_compute_capability(verbose=False) diff --git a/rapids_cli/tests/test_memory.py b/rapids_cli/tests/test_memory.py new file mode 100644 index 0000000..572df33 --- /dev/null +++ b/rapids_cli/tests/test_memory.py @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import MagicMock, patch + +import pytest + +from rapids_cli.doctor.checks.memory import ( + check_memory_to_gpu_ratio, + get_gpu_memory, + get_system_memory, +) + + +def test_get_system_memory(): + mock_vm = MagicMock() + mock_vm.total = 32 * 1024**3 # 32 GB in bytes + with patch("psutil.virtual_memory", return_value=mock_vm): + result = get_system_memory(verbose=False) + assert result == 32.0 + + +def test_get_gpu_memory_single_gpu(): + mock_handle = MagicMock() + mock_memory_info = MagicMock() + mock_memory_info.total = 16 * 1024**3 # 16 GB in bytes + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info), + patch("pynvml.nvmlShutdown"), + ): + result = get_gpu_memory(verbose=False) + assert result == 16.0 + + +def test_get_gpu_memory_multiple_gpus(): + mock_handle = MagicMock() + mock_memory_info = MagicMock() + mock_memory_info.total = 16 * 1024**3 # 16 GB per GPU + + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=4), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory_info), + patch("pynvml.nvmlShutdown"), + ): + result = get_gpu_memory(verbose=False) + assert result == 64.0 # 16 GB * 4 GPUs + + +def test_check_memory_to_gpu_ratio_good_ratio(): + with ( + patch("pynvml.nvmlInit"), + patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=64.0), + patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0), + ): + result = check_memory_to_gpu_ratio(verbose=True) + assert result is True + + +def test_check_memory_to_gpu_ratio_warning(): + with ( + patch("pynvml.nvmlInit"), + patch("rapids_cli.doctor.checks.memory.get_system_memory", return_value=32.0), + patch("rapids_cli.doctor.checks.memory.get_gpu_memory", return_value=32.0), + ): + with pytest.warns(UserWarning, match="System Memory to total GPU Memory ratio"): + result = check_memory_to_gpu_ratio(verbose=True) + assert result is True + + +def test_check_memory_to_gpu_ratio_no_gpu(): + import pynvml + + with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): + with pytest.raises( + ValueError, match="GPU not found. Please ensure GPUs are installed." + ): + check_memory_to_gpu_ratio(verbose=False) diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py new file mode 100644 index 0000000..e2d82c7 --- /dev/null +++ b/rapids_cli/tests/test_nvlink.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import MagicMock, patch + +import pytest + +from rapids_cli.doctor.checks.nvlink import check_nvlink_status + + +def test_check_nvlink_status_success(): + mock_handle = MagicMock() + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=2), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1), + ): + result = check_nvlink_status(verbose=True) + assert result is True + + +def test_check_nvlink_status_single_gpu(): + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=1), + ): + result = check_nvlink_status(verbose=False) + assert result is False + + +def test_check_nvlink_status_no_gpu(): + import pynvml + + with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)): + with pytest.raises( + ValueError, match="GPU not found. Please ensure GPUs are installed." + ): + check_nvlink_status(verbose=False) + + +def test_check_nvlink_status_nvml_error(): + import pynvml + + mock_handle = MagicMock() + with ( + patch("pynvml.nvmlInit"), + patch("pynvml.nvmlDeviceGetCount", return_value=2), + patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), + patch( + "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported + ), + ): + with pytest.raises(ValueError, match="NVLink 0 Status Check Failed"): + check_nvlink_status(verbose=False)