Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ poetry run pytest

## Output
Training logs report the task id, training/test accuracy, and replay-memory accuracy every five epochs. Accuracy is computed via `test(...)` on both the current task and the accumulated memory set.

## Deployment

Platform-specific deployment guides:

- [OLCF Frontier](./src/deployment/frontier/README.md)
Empty file added src/deployment/__init__.py
Empty file.
60 changes: 60 additions & 0 deletions src/deployment/frontier/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Deployment

## OLCF Frontier

### Setup

Clone the repo into your scratch directory and run the install script:

```bash
cd $MEMBERWORK
git clone https://github.com/AI-ModCon/BaseSim_Framework.git
cd BaseSim_Framework
source ./src/deployment/frontier/install_venv.sh
```

`install_venv.sh` creates a virtual environment, installs Poetry, and uses it to resolve and install project dependencies. The environment is saved to `.venv` in the project root. The script runs the following:

```bash
module load PrgEnv-gnu
module load python/3.13.0
module load gcc/12.2.0
module load rocm/6.4.2

python -m venv .venv # Create a virtual environment
source ./.venv/bin/activate # Activate environment
pip install poetry # Install poetry
poetry lock # Resolve dependencies and refresh the lock file
poetry install --no-cache # Install project dependencies

poetry run pip install --force-reinstall \
torch==2.9.1+rocm6.4 \
torchvision==0.24.1+rocm6.4 \
--index-url https://download.pytorch.org/whl/rocm6.4
```

Prior to running experiments, test ROCm support from the project root:
```bash
poetry run pytest tests/test_rocm.py
```

### Submitting a Job

> **Note:** The MNIST example requires the MNIST dataset, which is downloaded on first run. Download it before submitting a batch job:
>
> ```bash
> poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()"
> ```

The virtual environment can be sourced directly at the top of your SLURM script (`source .venv/bin/activate`), so Poetry is not needed at runtime — jobs run against the installed environment.

From the project root:

```bash
sbatch -A xxx src/deployment/frontier/mnist_example.sbatch
```

### Troubleshooting

- **`poetry install` fails to connect to PyPI** — Run `poetry lock` first, then retry. The lock file caches package download specs and may be stale on a new host.
- **`poetry install` fails with disk quota errors** — Poetry's default cache is in the home directory, which has limited space. Retry with `poetry install --no-cache` or free up space in `$HOME`.
17 changes: 17 additions & 0 deletions src/deployment/frontier/install_venv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Set up the Python environment for this project on OLCF Frontier.
# Intended to be *sourced* from the project root so the activated venv
# persists in the caller's shell:
#   source ./src/deployment/frontier/install_venv.sh

# Load the Frontier toolchain: GNU programming environment, Python,
# GCC, and the ROCm stack the PyTorch wheels below are built against.
module load PrgEnv-gnu
module load python/3.13.0
module load gcc/12.2.0
module load rocm/6.4.2

python -m venv .venv # Create a virtual environment in the project root
source ./.venv/bin/activate # Activate environment
pip install poetry # Install Poetry into the venv
poetry lock # Resolve dependencies and refresh the lock file
poetry install --no-cache # Install project dependencies (no cache: avoids $HOME quota issues)

# Replace the default torch/torchvision wheels with ROCm builds from the
# official PyTorch ROCm index (versions pinned to the rocm/6.4.2 module above).
poetry run pip install --force-reinstall \
    torch==2.9.1+rocm6.4 \
    torchvision==0.24.1+rocm6.4 \
    --index-url https://download.pytorch.org/whl/rocm6.4
37 changes: 37 additions & 0 deletions src/deployment/frontier/mnist_example.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash -l
#SBATCH -J modcon_basesim
#SBATCH -t 0:20:00
#SBATCH -N 1
#SBATCH -p batch
#SBATCH --exclusive
#SBATCH --ntasks-per-node=8
#SBATCH -o output/mnist_example.o%j
#SBATCH -e output/mnist_example.e%j

# Load required modules (must match install_venv.sh so the ROCm wheels
# installed in the venv resolve against the same toolchain)
module load PrgEnv-gnu
module load python/3.13.0
module load gcc/12.2.0
module load rocm/6.4.2

# Activate the pre-built virtual environment (created by install_venv.sh)
source ./.venv/bin/activate

# ROCm/MIOpen flags
# Recover the project account from the job record, then point MIOpen's
# kernel cache at scratch space — presumably to avoid $HOME quota limits.
ACCOUNT=$(sacct -j $SLURM_JOB_ID --format=Account --noheader | head -1 | tr -d ' ')
mkdir -p $MEMBERWORK/$ACCOUNT/miopen
export MIOPEN_USER_DB_PATH=$MEMBERWORK/$ACCOUNT/miopen
export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$ACCOUNT/miopen
# Log W&B locally; NOTE(review): assumes compute nodes lack outbound network — confirm
export WANDB_MODE=offline

# Print environment info
echo "=============================================="
echo "MNIST Example"
echo "=============================================="
echo "Date: $(date)"
echo "Hostname: $(hostname)"
echo "ROCM_PATH: ${ROCM_PATH}"
echo "=============================================="

# Run example
python -m src.main --config ./examples/mnist/mnist.toml
138 changes: 138 additions & 0 deletions tests/test_rocm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Tests to verify ROCm/CUDA installation and PyTorch GPU support."""

import pytest
import torch

# Skip marker for tests that need a visible CUDA/ROCm device at runtime.
requires_gpu = pytest.mark.skipif(
    not torch.cuda.is_available(), reason="No CUDA/ROCm GPU available"
)
# Skip marker for tests that need a PyTorch build compiled against ROCm/HIP
# (torch.version.hip is None on CUDA/CPU-only builds).
requires_rocm = pytest.mark.skipif(
    getattr(torch.version, "hip", None) is None, reason="ROCm/HIP not available"
)


@pytest.fixture
def gpu_device():
    """Device string used by the GPU tests in this module."""
    device = "cuda"
    return device


def test_torch_import():
    """Verify the torch module object is importable and bound."""
    imported = torch is not None
    assert imported, "PyTorch import failed"


def test_torchvision_import():
    """Verify the torchvision package is importable."""
    import torchvision

    imported = torchvision is not None
    assert imported, "torchvision import failed"


@requires_gpu
def test_rocm_available():
    """Verify PyTorch reports an available CUDA/ROCm device."""
    available = torch.cuda.is_available()
    assert available, "CUDA/ROCm is not available"


@requires_gpu
def test_gpu_count():
    """Verify PyTorch detects at least one GPU device."""
    gpu_count = torch.cuda.device_count()
    assert gpu_count > 0, f"No GPUs detected, found {gpu_count}"


@requires_gpu
def test_gpu_properties():
    """Query name and memory size of every detected GPU."""
    device_ids = range(torch.cuda.device_count())
    for device_id in device_ids:
        properties = torch.cuda.get_device_properties(device_id)
        assert properties.name is not None
        assert properties.total_memory > 0


@requires_gpu
def test_tensor_on_gpu():
    """Create a tensor, move it to the GPU, and compute a matmul there."""
    matrix = torch.randn(100, 100).cuda()
    assert matrix.is_cuda, "Tensor not on GPU"
    product = matrix @ matrix.T
    assert product.is_cuda, "Result tensor not on GPU"


@requires_gpu
def test_tensor_to_device(gpu_device):
    """Verify tensor.to(device) transfers to GPU without changing shape."""
    cpu_tensor = torch.randn(32, 10)
    moved = cpu_tensor.to(gpu_device)
    assert moved.is_cuda, "tensor.to(device) failed"
    assert moved.shape == cpu_tensor.shape, "Shape changed after .to()"


@requires_gpu
def test_autograd_on_gpu(gpu_device):
    """Run a backward pass on the GPU and inspect the resulting gradients."""
    inputs = torch.randn(16, 4, device=gpu_device, requires_grad=True)
    weights = torch.randn(4, 2, device=gpu_device, requires_grad=True)
    (inputs @ weights).sum().backward()
    grad = weights.grad
    assert grad is not None, "Gradients not computed"
    assert grad.is_cuda, "Gradients not on GPU"
    assert grad.shape == weights.shape, "Gradient shape mismatch"


@requires_gpu
def test_optimizer_step_on_gpu(gpu_device):
    """Take one SGD step on a GPU parameter and confirm it changed."""
    param = torch.nn.Parameter(torch.randn(4, 4, device=gpu_device))
    sgd = torch.optim.SGD([param], lr=0.1)

    # Snapshot the weights before stepping so we can detect the update.
    snapshot = param.clone().detach()
    sgd.zero_grad()
    param.sum().backward()
    sgd.step()

    assert not torch.equal(param, snapshot), "Weights not updated after step"


@requires_gpu
def test_no_grad_context(gpu_device):
    """Verify torch.no_grad() disables autograd tracking for GPU ops."""
    w = torch.randn(4, 4, device=gpu_device, requires_grad=True)
    with torch.no_grad():
        result = w @ w.T
    assert result.is_cuda, "Output not on GPU"
    assert not result.requires_grad, "Output should not require grad inside no_grad"


@requires_gpu
def test_tensor_clone_detach(gpu_device):
    """Snapshot a GPU tensor via clone().detach() and compare values."""
    source = torch.randn(4, 4, device=gpu_device, requires_grad=True)
    snapshot = source.clone().detach()
    assert snapshot.is_cuda, "Cloned tensor not on GPU"
    assert not snapshot.requires_grad, "Detached tensor should not require grad"
    assert torch.equal(source, snapshot), "Cloned tensor values differ"


@requires_gpu
def test_torch_comparison_ops(gpu_device):
    """Exercise torch.equal, torch.allclose, and torch.isnan on GPU tensors."""
    first = torch.tensor([1.0, 2.0, 3.0], device=gpu_device)
    second = first.clone()

    assert torch.equal(first, second), "torch.equal failed on identical GPU tensors"
    assert torch.allclose(first, second, atol=1e-6), "torch.allclose failed"

    # NaN detection must flag only the NaN element.
    with_nan = torch.tensor([1.0, float("nan"), 3.0], device=gpu_device)
    mask = torch.isnan(with_nan)
    assert mask[1].item(), "torch.isnan failed to detect NaN on GPU"
    assert not mask[0].item(), "torch.isnan false positive on GPU"


@requires_rocm
def test_torch_rocm_build():
    """Confirm this PyTorch build was compiled with ROCm/HIP support."""
    hip = getattr(torch.version, "hip", None)
    assert hip is not None, "PyTorch not built with ROCm/HIP support"
Loading