Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ poetry run pytest

## Output
Training logs report the task id, training/test accuracy, and replay-memory accuracy every five epochs. Accuracy is computed via `test(...)` on both the current task and the accumulated memory set.

## Deployment

Platform-specific deployment guides:

- [OLCF Frontier](./src/deployment/frontier/README.md)
Empty file added src/deployment/__init__.py
Empty file.
60 changes: 60 additions & 0 deletions src/deployment/frontier/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Deployment

## OLCF Frontier

### Setup

Clone the repo into your scratch directory and run the install script:

```bash
cd $MEMBERWORK
git clone https://github.com/AI-ModCon/BaseSim_Framework.git
cd BaseSim_Framework
source ./src/deployment/frontier/install_venv.sh
```

`install_venv.sh` creates a virtual environment, installs Poetry, and uses it to resolve and install project dependencies. The environment is saved to `.venv` in the project root. The script runs the following:

```bash
module load PrgEnv-gnu
module load python/3.13.0
module load gcc/12.2.0
module load rocm/6.4.2

python -m venv .venv # Create a virtual environment
source ./.venv/bin/activate # Activate environment
pip install poetry # Install poetry
poetry lock # Resolve dependencies and refresh the lock file
poetry install --no-cache # Install project dependencies

poetry run pip install --force-reinstall \
torch==2.9.1+rocm6.4 \
torchvision==0.24.1+rocm6.4 \
--index-url https://download.pytorch.org/whl/rocm6.4
```

Prior to running experiments, test ROCm support from the project root:
```bash
poetry run pytest tests/test_rocm.py
```

### Submitting a Job

> **Note:** The MNIST example requires the MNIST dataset, which is downloaded on first run. Download it before submitting a batch job:
>
> ```bash
> poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()"
> ```

The virtual environment can be sourced directly at the top of your SLURM script (`source .venv/bin/activate`), so Poetry is not needed at runtime — jobs run against the installed environment.

From the project root:

```bash
sbatch -A xxx src/deployment/frontier/mnist_example.sbatch
```

### Troubleshooting

- **`poetry install` fails to connect to PyPI** — Run `poetry lock` first, then retry. The lock file caches package download specs and may be stale on a new host.
- **`poetry install` fails with disk quota errors** — Poetry's default cache is in the home directory, which has limited space. Retry with `poetry install --no-cache` or free up space in `$HOME`.
17 changes: 17 additions & 0 deletions src/deployment/frontier/install_venv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Set up the Python environment for this project on OLCF Frontier.
# Intended to be *sourced* from the project root so the activated venv
# persists in the caller's shell:
#   source ./src/deployment/frontier/install_venv.sh

# Load the Frontier toolchain: GNU programming environment, Python,
# GCC, and the ROCm stack the PyTorch wheels below are built against.
module load PrgEnv-gnu
module load python/3.13.0
module load gcc/12.2.0
module load rocm/6.4.2

python -m venv .venv # Create a virtual environment in the project root
source ./.venv/bin/activate # Activate environment
pip install poetry # Install Poetry into the venv
poetry lock # Resolve dependencies and refresh the lock file
poetry install --no-cache # Install project dependencies (no cache: avoids $HOME quota issues)

# Replace the default torch/torchvision wheels with ROCm builds from the
# official PyTorch ROCm index (versions pinned to the rocm/6.4.2 module above).
poetry run pip install --force-reinstall \
    torch==2.9.1+rocm6.4 \
    torchvision==0.24.1+rocm6.4 \
    --index-url https://download.pytorch.org/whl/rocm6.4
37 changes: 37 additions & 0 deletions src/deployment/frontier/mnist_example.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash -l
#SBATCH -J modcon_basesim
#SBATCH -t 0:20:00
#SBATCH -N 1
#SBATCH -p batch
#SBATCH --exclusive
#SBATCH --ntasks-per-node=8
#SBATCH -o output/mnist_example.o%j
#SBATCH -e output/mnist_example.e%j

# Load required modules (must match install_venv.sh so the ROCm wheels
# installed in the venv resolve against the same toolchain)
module load PrgEnv-gnu
module load python/3.13.0
module load gcc/12.2.0
module load rocm/6.4.2

# Activate the pre-built virtual environment (created by install_venv.sh)
source ./.venv/bin/activate

# ROCm/MIOpen flags
# Recover the project account from the job record, then point MIOpen's
# kernel cache at scratch space — presumably to avoid $HOME quota limits.
ACCOUNT=$(sacct -j $SLURM_JOB_ID --format=Account --noheader | head -1 | tr -d ' ')
mkdir -p $MEMBERWORK/$ACCOUNT/miopen
export MIOPEN_USER_DB_PATH=$MEMBERWORK/$ACCOUNT/miopen
export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$ACCOUNT/miopen
# Log W&B locally; NOTE(review): assumes compute nodes lack outbound network — confirm
export WANDB_MODE=offline

# Print environment info
echo "=============================================="
echo "MNIST Example"
echo "=============================================="
echo "Date: $(date)"
echo "Hostname: $(hostname)"
echo "ROCM_PATH: ${ROCM_PATH}"
echo "=============================================="

# Run example
python -m src.main --config ./examples/mnist/mnist.toml
138 changes: 138 additions & 0 deletions tests/test_rocm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Tests to verify ROCm/CUDA installation and PyTorch GPU support."""

import pytest
import torch

# Skip marker for tests that need a visible CUDA/ROCm device at runtime.
requires_gpu = pytest.mark.skipif(
    not torch.cuda.is_available(), reason="No CUDA/ROCm GPU available"
)
# Skip marker for tests that need a PyTorch build compiled against ROCm/HIP
# (torch.version.hip is None on CUDA/CPU-only builds).
requires_rocm = pytest.mark.skipif(
    getattr(torch.version, "hip", None) is None, reason="ROCm/HIP not available"
)


@pytest.fixture
def gpu_device():
    """Device string used by the GPU tests in this module."""
    device = "cuda"
    return device


def test_torch_import():
    """Verify the torch module object is importable and bound."""
    imported = torch is not None
    assert imported, "PyTorch import failed"


def test_torchvision_import():
    """Verify the torchvision package is importable."""
    import torchvision

    imported = torchvision is not None
    assert imported, "torchvision import failed"


@requires_gpu
def test_rocm_available():
    """Verify PyTorch reports an available CUDA/ROCm device."""
    available = torch.cuda.is_available()
    assert available, "CUDA/ROCm is not available"


@requires_gpu
def test_gpu_count():
    """Verify PyTorch detects at least one GPU device."""
    gpu_count = torch.cuda.device_count()
    assert gpu_count > 0, f"No GPUs detected, found {gpu_count}"


@requires_gpu
def test_gpu_properties():
    """Query name and memory size of every detected GPU."""
    device_ids = range(torch.cuda.device_count())
    for device_id in device_ids:
        properties = torch.cuda.get_device_properties(device_id)
        assert properties.name is not None
        assert properties.total_memory > 0


@requires_gpu
def test_tensor_on_gpu():
    """Create a tensor, move it to the GPU, and compute a matmul there."""
    matrix = torch.randn(100, 100).cuda()
    assert matrix.is_cuda, "Tensor not on GPU"
    product = matrix @ matrix.T
    assert product.is_cuda, "Result tensor not on GPU"


@requires_gpu
def test_tensor_to_device(gpu_device):
    """Verify tensor.to(device) transfers to GPU without changing shape."""
    cpu_tensor = torch.randn(32, 10)
    moved = cpu_tensor.to(gpu_device)
    assert moved.is_cuda, "tensor.to(device) failed"
    assert moved.shape == cpu_tensor.shape, "Shape changed after .to()"


@requires_gpu
def test_autograd_on_gpu(gpu_device):
    """Run a backward pass on the GPU and inspect the resulting gradients."""
    inputs = torch.randn(16, 4, device=gpu_device, requires_grad=True)
    weights = torch.randn(4, 2, device=gpu_device, requires_grad=True)
    (inputs @ weights).sum().backward()
    grad = weights.grad
    assert grad is not None, "Gradients not computed"
    assert grad.is_cuda, "Gradients not on GPU"
    assert grad.shape == weights.shape, "Gradient shape mismatch"


@requires_gpu
def test_optimizer_step_on_gpu(gpu_device):
    """Take one SGD step on a GPU parameter and confirm it changed."""
    param = torch.nn.Parameter(torch.randn(4, 4, device=gpu_device))
    sgd = torch.optim.SGD([param], lr=0.1)

    # Snapshot the weights before stepping so we can detect the update.
    snapshot = param.clone().detach()
    sgd.zero_grad()
    param.sum().backward()
    sgd.step()

    assert not torch.equal(param, snapshot), "Weights not updated after step"


@requires_gpu
def test_no_grad_context(gpu_device):
    """Verify torch.no_grad() disables autograd tracking for GPU ops."""
    w = torch.randn(4, 4, device=gpu_device, requires_grad=True)
    with torch.no_grad():
        result = w @ w.T
    assert result.is_cuda, "Output not on GPU"
    assert not result.requires_grad, "Output should not require grad inside no_grad"


@requires_gpu
def test_tensor_clone_detach(gpu_device):
    """Snapshot a GPU tensor via clone().detach() and compare values."""
    source = torch.randn(4, 4, device=gpu_device, requires_grad=True)
    snapshot = source.clone().detach()
    assert snapshot.is_cuda, "Cloned tensor not on GPU"
    assert not snapshot.requires_grad, "Detached tensor should not require grad"
    assert torch.equal(source, snapshot), "Cloned tensor values differ"


@requires_gpu
def test_torch_comparison_ops(gpu_device):
    """Exercise torch.equal, torch.allclose, and torch.isnan on GPU tensors."""
    first = torch.tensor([1.0, 2.0, 3.0], device=gpu_device)
    second = first.clone()

    assert torch.equal(first, second), "torch.equal failed on identical GPU tensors"
    assert torch.allclose(first, second, atol=1e-6), "torch.allclose failed"

    # NaN detection must flag only the NaN element.
    with_nan = torch.tensor([1.0, float("nan"), 3.0], device=gpu_device)
    mask = torch.isnan(with_nan)
    assert mask[1].item(), "torch.isnan failed to detect NaN on GPU"
    assert not mask[0].item(), "torch.isnan false positive on GPU"


@requires_rocm
def test_torch_rocm_build():
    """Confirm this PyTorch build was compiled with ROCm/HIP support."""
    hip = getattr(torch.version, "hip", None)
    assert hip is not None, "PyTorch not built with ROCm/HIP support"
Loading