diff --git a/README.md b/README.md index 9b28e0b..712cdd7 100644 --- a/README.md +++ b/README.md @@ -70,3 +70,9 @@ poetry run pytest ## Output Training logs report the task id, training/test accuracy, and replay-memory accuracy every five epochs. Accuracy is computed via `test(...)` on both the current task and the accumulated memory set. + +## Deployment + +Platform-specific deployment guides: + +- [OLCF Frontier](./src/deployment/frontier/README.md) diff --git a/src/deployment/__init__.py b/src/deployment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md new file mode 100644 index 0000000..c95a1ad --- /dev/null +++ b/src/deployment/frontier/README.md @@ -0,0 +1,60 @@ +# Deployment + +## OLCF Frontier + +### Setup + +Clone the repo into your scratch directory and run the install script: + +```bash +cd $MEMBERWORK +git clone https://github.com/AI-ModCon/BaseSim_Framework.git +cd BaseSim_Framework +source ./src/deployment/frontier/install_venv.sh +``` + +`install_venv.sh` creates a virtual environment, installs Poetry, and uses it to resolve and install project dependencies. The environment is saved to `.venv` in the project root. The script runs the following: + +```bash +module load PrgEnv-gnu +module load python/3.13.0 +module load gcc/12.2.0 +module load rocm/6.4.2 + +python -m venv .venv # Create a virtual environment +source ./.venv/bin/activate # Activate environment +pip install poetry # Install poetry +poetry lock # Resolve and lock dependencies +poetry install --no-cache # Install project dependencies + +poetry run pip install --force-reinstall \ torch==2.9.1+rocm6.4 \ torchvision==0.24.1+rocm6.4 \ --index-url https://download.pytorch.org/whl/rocm6.4 +``` + +Prior to running experiments, test ROCm support from the project root: +```bash +poetry run pytest tests/test_rocm.py +``` + +### Submitting a Job + +> **Note:** The MNIST example requires the dataset, which is downloaded on first run. 
Download it before submitting a batch job: +> +> ```bash +> poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" +> ``` + +The virtual environment can be sourced directly at the top of your SLURM script (`source .venv/bin/activate`), so Poetry is not needed at runtime — jobs run against the installed environment. + +From the project root: + +```bash +sbatch -A xxx src/deployment/frontier/mnist_example.sbatch +``` + +### Troubleshooting + +- **`poetry install` fails to connect to PyPI** — Run `poetry lock` first, then retry. The lock file caches package download specs and may be stale on a new host. +- **`poetry install` fails with disk quota errors** — Poetry's default cache is in the home directory, which has limited space. Retry with `poetry install --no-cache` or free up space in `$HOME`. diff --git a/src/deployment/frontier/install_venv.sh b/src/deployment/frontier/install_venv.sh new file mode 100644 index 0000000..b6b3a87 --- /dev/null +++ b/src/deployment/frontier/install_venv.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +module load PrgEnv-gnu +module load python/3.13.0 +module load gcc/12.2.0 +module load rocm/6.4.2 + +python -m venv .venv # Create a virtual environment +source ./.venv/bin/activate # Activate environment +pip install poetry # Install poetry +poetry lock # Resolve and lock dependencies +poetry install --no-cache # Install project dependencies + +poetry run pip install --force-reinstall \ torch==2.9.1+rocm6.4 \ torchvision==0.24.1+rocm6.4 \ --index-url https://download.pytorch.org/whl/rocm6.4 diff --git a/src/deployment/frontier/mnist_example.sbatch b/src/deployment/frontier/mnist_example.sbatch new file mode 100644 index 0000000..b50c9d0 --- /dev/null +++ b/src/deployment/frontier/mnist_example.sbatch @@ -0,0 +1,37 @@ +#!/bin/bash -l +#SBATCH -J modcon_basesim +#SBATCH -t 0:20:00 +#SBATCH -N 1 +#SBATCH -p batch +#SBATCH --exclusive +#SBATCH --ntasks-per-node=8 +#SBATCH -o output/mnist_example.o%j +#SBATCH -e output/mnist_example.e%j + +# 
Load required modules +module load PrgEnv-gnu +module load python/3.13.0 +module load gcc/12.2.0 +module load rocm/6.4.2 + +# Activate the pre-built project environment +source ./.venv/bin/activate + +# ROCm/MIOpen flags +ACCOUNT=$(sacct -j $SLURM_JOB_ID --format=Account --noheader | head -1 | tr -d ' ') +mkdir -p $MEMBERWORK/$ACCOUNT/miopen +export MIOPEN_USER_DB_PATH=$MEMBERWORK/$ACCOUNT/miopen +export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$ACCOUNT/miopen +export WANDB_MODE=offline + +# Print environment info +echo "==============================================" +echo "MNIST Example" +echo "==============================================" +echo "Date: $(date)" +echo "Hostname: $(hostname)" +echo "ROCM_PATH: ${ROCM_PATH}" +echo "==============================================" + +# Run example +python -m src.main --config ./examples/mnist/mnist.toml diff --git a/tests/test_rocm.py b/tests/test_rocm.py new file mode 100644 index 0000000..c15f931 --- /dev/null +++ b/tests/test_rocm.py @@ -0,0 +1,138 @@ +"""Tests to verify ROCm/CUDA installation and PyTorch GPU support.""" + +import pytest +import torch + +requires_gpu = pytest.mark.skipif( + not torch.cuda.is_available(), reason="No CUDA/ROCm GPU available" +) +requires_rocm = pytest.mark.skipif( + getattr(torch.version, "hip", None) is None, reason="ROCm/HIP not available" +) + + +@pytest.fixture +def gpu_device(): + """Return the GPU device string.""" + return "cuda" + + +def test_torch_import(): + """Test that PyTorch can be imported.""" + assert torch is not None, "PyTorch import failed" + + +def test_torchvision_import(): + """Test that torchvision can be imported.""" + import torchvision + + assert torchvision is not None, "torchvision import failed" + + +@requires_gpu +def test_rocm_available(): + """Test that CUDA/ROCm is available through PyTorch.""" + assert torch.cuda.is_available(), "CUDA/ROCm is not available" + + +@requires_gpu +def test_gpu_count(): + """Test that at least one GPU is detected.""" + gpu_count = torch.cuda.device_count() + assert 
gpu_count > 0, f"No GPUs detected, found {gpu_count}" + + +@requires_gpu +def test_gpu_properties(): + """Test that GPU properties can be queried.""" + for i in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(i) + assert props.name is not None + assert props.total_memory > 0 + + +@requires_gpu +def test_tensor_on_gpu(): + """Test that tensors can be created and moved to GPU.""" + x = torch.randn(100, 100).cuda() + assert x.is_cuda, "Tensor not on GPU" + y = x @ x.T + assert y.is_cuda, "Result tensor not on GPU" + + +@requires_gpu +def test_tensor_to_device(gpu_device): + """Test that tensor.to(device) works (used by harness and JVP tests).""" + x = torch.randn(32, 10) + x_gpu = x.to(gpu_device) + assert x_gpu.is_cuda, "tensor.to(device) failed" + assert x_gpu.shape == x.shape, "Shape changed after .to()" + + +@requires_gpu +def test_autograd_on_gpu(gpu_device): + """Test that backward pass and gradient computation work on GPU.""" + x = torch.randn(16, 4, device=gpu_device, requires_grad=True) + w = torch.randn(4, 2, device=gpu_device, requires_grad=True) + loss = (x @ w).sum() + loss.backward() + assert w.grad is not None, "Gradients not computed" + assert w.grad.is_cuda, "Gradients not on GPU" + assert w.grad.shape == w.shape, "Gradient shape mismatch" + + +@requires_gpu +def test_optimizer_step_on_gpu(gpu_device): + """Test that optimizer zero_grad/step work on GPU parameters.""" + param = torch.nn.Parameter(torch.randn(4, 4, device=gpu_device)) + optimizer = torch.optim.SGD([param], lr=0.1) + + initial = param.clone().detach() + loss = param.sum() + optimizer.zero_grad() + loss.backward() + optimizer.step() + + assert not torch.equal(param, initial), "Weights not updated after step" + + +@requires_gpu +def test_no_grad_context(gpu_device): + """Test that torch.no_grad() inference mode works on GPU.""" + w = torch.randn(4, 4, device=gpu_device, requires_grad=True) + with torch.no_grad(): + y = w @ w.T + assert y.is_cuda, "Output not 
on GPU" + assert not y.requires_grad, "Output should not require grad inside no_grad" + + +@requires_gpu +def test_tensor_clone_detach(gpu_device): + """Test that clone/detach work on GPU tensors (used for weight snapshots).""" + x = torch.randn(4, 4, device=gpu_device, requires_grad=True) + y = x.clone().detach() + assert y.is_cuda, "Cloned tensor not on GPU" + assert not y.requires_grad, "Detached tensor should not require grad" + assert torch.equal(x, y), "Cloned tensor values differ" + + +@requires_gpu +def test_torch_comparison_ops(gpu_device): + """Test torch.equal, torch.allclose, and torch.isnan on GPU tensors.""" + a = torch.tensor([1.0, 2.0, 3.0], device=gpu_device) + b = a.clone() + + assert torch.equal(a, b), "torch.equal failed on identical GPU tensors" + assert torch.allclose(a, b, atol=1e-6), "torch.allclose failed" + + c = torch.tensor([1.0, float("nan"), 3.0], device=gpu_device) + nan_mask = torch.isnan(c) + assert nan_mask[1].item(), "torch.isnan failed to detect NaN on GPU" + assert not nan_mask[0].item(), "torch.isnan false positive on GPU" + + +@requires_rocm +def test_torch_rocm_build(): + """Test that PyTorch was built with ROCm support.""" + hip_version = getattr(torch.version, "hip", None) + assert hip_version is not None, "PyTorch not built with ROCm/HIP support"