From f3fa2247107103338e1d65ba7f535ea01afb1732 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 11:48:50 -0500 Subject: [PATCH 01/12] Rocm installation script. --- src/deployment/__init__.py | 0 src/deployment/frontier/install_rocm.sh | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 src/deployment/__init__.py create mode 100644 src/deployment/frontier/install_rocm.sh diff --git a/src/deployment/__init__.py b/src/deployment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/deployment/frontier/install_rocm.sh b/src/deployment/frontier/install_rocm.sh new file mode 100644 index 0000000..7a9ec78 --- /dev/null +++ b/src/deployment/frontier/install_rocm.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +module load PrgEnv-gnu +module load gcc/12.2.0 +module load rocm/6.4.2 + +poetry lock +poetry install +poetry run pip install --force-reinstall \ + torch==2.9.1+rocm6.4 \ + torchvision==0.24.1+rocm6.4 \ + --index-url https://download.pytorch.org/whl/rocm6.4 From a183f945c24086aa5d5320ea4c56669de9f75ebd Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:22:19 -0500 Subject: [PATCH 02/12] Frontier slurm scripts. 
--- src/deployment/frontier/mnist_example.sbatch | 32 +++++++++++++++ .../frontier/test_rocm_install.sbatch | 40 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 src/deployment/frontier/mnist_example.sbatch create mode 100644 src/deployment/frontier/test_rocm_install.sbatch diff --git a/src/deployment/frontier/mnist_example.sbatch b/src/deployment/frontier/mnist_example.sbatch new file mode 100644 index 0000000..bf175b4 --- /dev/null +++ b/src/deployment/frontier/mnist_example.sbatch @@ -0,0 +1,32 @@ +#!/bin/bash -l +#SBATCH -J modcon_basesim +#SBATCH -t 0:20:00 +#SBATCH -N 1 +#SBATCH -p batch +#SBATCH --exclusive +#SBATCH --ntasks-per-node=8 +#SBATCH -o output/mnist_example.o%j +#SBATCH -e output/mnist_example.e%j + +# Load required modules +module load PrgEnv-gnu +module load gcc/12.2.0 +module load rocm/6.4.2 + +# ROCm/MIOpen flags +mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +export WANDB_MODE=offline + +# Print environment info +echo "==============================================" +echo "MNIST Example" +echo "==============================================" +echo "Date: $(date)" +echo "Hostname: $(hostname)" +echo "ROCM_PATH: ${ROCM_PATH}" +echo "==============================================" + +# Run example +poetry run python -m src.main --config ./examples/mnist/mnist.toml diff --git a/src/deployment/frontier/test_rocm_install.sbatch b/src/deployment/frontier/test_rocm_install.sbatch new file mode 100644 index 0000000..50c8f0c --- /dev/null +++ b/src/deployment/frontier/test_rocm_install.sbatch @@ -0,0 +1,40 @@ +#!/bin/bash -l +#SBATCH -J test_rocm_install +#SBATCH -t 0:10:00 +#SBATCH -N 1 +#SBATCH --exclusive +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node=1 +#SBATCH -o output/test_rocm_install.o%j +#SBATCH -e output/test_rocm_install.e%j + +# Load required modules +module load PrgEnv-gnu +module 
load gcc/12.2.0 +module load rocm/6.4.2 + +# ROCm/MIOpen flags +mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen + +# WANDB set to offline; need to set up proxy connections. +export WANDB_MODE=offline + +# Print environment info +echo "==============================================" +echo "ROCm Installation & Harness Test" +echo "==============================================" +echo "Date: $(date)" +echo "Hostname: $(hostname)" +echo "ROCM_PATH: ${ROCM_PATH}" +echo "==============================================" + +# Test torch rocm compatibility +poetry run pytest tests/deployment/frontier/test_rocm_install.py -v + +# Test model harness (loaders, model, etc) +poetry run pytest tests/deployment/frontier/test_model_harness_rocm.py -v + +# Test jvp update +poetry run pytest tests/deployment/frontier/test_jvp_update_rocm.py -v From fa68f1eb0923bcd12bdcb75dd69dfd49beb1fda3 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:23:52 -0500 Subject: [PATCH 03/12] Frontier Rocm tests. 
--- tests/conftest.py | 5 + .../frontier/test_jvp_update_rocm.py | 258 ++++++++++++++++++ .../frontier/test_model_harness_rocm.py | 228 ++++++++++++++++ .../deployment/frontier/test_rocm_install.py | 60 ++++ 4 files changed, 551 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/deployment/frontier/test_jvp_update_rocm.py create mode 100644 tests/deployment/frontier/test_model_harness_rocm.py create mode 100644 tests/deployment/frontier/test_rocm_install.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..1b1ba82 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,5 @@ +import sys +from pathlib import Path + +repo_root = Path(__file__).parent.parent +sys.path.insert(0, str(repo_root)) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py new file mode 100644 index 0000000..a39e8c2 --- /dev/null +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -0,0 +1,258 @@ +"""Tests to verify JVP regularized update works correctly with ROCm.""" + +import pytest +import torch + +from config.configuration import ( + Config, + ModelCfg, + DataCfg, + TrainCfg, + ContinualLearningCfg, + DriftDetectionCfg, +) + +from examples.mnist.model import MNIST_CNN + +from training.updater.jvp_reg import JVPRegUpdater +from profilers import FLOPSProfiler + + +@pytest.fixture +def rocm_config(): + """Create a config for ROCm/GPU testing.""" + return Config( + model=ModelCfg(name="mnist_cnn", pretrained_path=""), + data=DataCfg(name="mnist", path="./data"), + train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), + continual_learning=ContinualLearningCfg( + jvp_reg=0.001, deltax_norm=1.0, max_iter=5 + ), + drift_detection=DriftDetectionCfg(), + seed=42, + device="cuda", + multi_gpu=False, + ) + + +@pytest.fixture +def harness_with_history(rocm_config): + """Create MNIST harness with historical data.""" + harness = MNIST_CNN(rocm_config) + 
harness.update_data_stream() # First stream + harness.update_data_stream() # Second stream (creates history) + return harness + + +class TestJVPRegularizedLoss: + """Tests for JVPRegularizedLoss module.""" + + def test_jvp_loss_creation(self, harness_with_history): + """Test that JVPRegularizedLoss can be created.""" + criterion = harness_with_history.get_criterion() + jvp_loss = JVPRegUpdater( + model=harness_with_history.model, + criterion=criterion, + jvp_reg=0.001, + deltax_norm=1.0, + ) + assert jvp_loss is not None + + def test_jvp_loss_forward(self, rocm_config, harness_with_history): + """Test that JVPRegularizedLoss forward pass works on GPU.""" + criterion = harness_with_history.get_criterion() + jvp_loss = JVPRegularizedLoss( + model=harness_with_history.model, + criterion=criterion, + jvp_reg=0.001, + deltax_norm=1.0, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) + hist_batch = next(iter(hist_train_loader)) + + # Move to device + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Forward pass + grad_dict, loss_curr, loss_mem = jvp_loss(train_batch, hist_batch) + + assert grad_dict is not None, "Gradient dict is None" + assert loss_curr is not None, "Current loss is None" + assert loss_mem is not None, "Memory loss is None" + + def test_jvp_loss_gradients_on_gpu(self, rocm_config, harness_with_history): + """Test that JVP gradients are computed on GPU.""" + criterion = harness_with_history.get_criterion() + jvp_loss = JVPRegularizedLoss( + model=harness_with_history.model, + criterion=criterion, + jvp_reg=0.001, + deltax_norm=1.0, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) 
+ hist_batch = next(iter(hist_train_loader)) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Compute gradients + grad_dict, _, _ = jvp_loss(train_batch, hist_batch) + + # Check gradients exist for all parameters + for name, param in harness_with_history.model.named_parameters(): + assert name in grad_dict, f"No gradient for {name}" + assert grad_dict[name].is_cuda, f"Gradient for {name} not on GPU" + assert not torch.isnan(grad_dict[name]).any(), f"NaN in gradient for {name}" + + +class TestJVPUpdateStep: + """Tests for step_method_jvp_reg function.""" + + def test_jvp_step_runs(self, rocm_config, harness_with_history): + """Test that JVP update step executes without error.""" + criterion = harness_with_history.get_criterion() + optimizer = harness_with_history.get_optmizer() + model = harness_with_history.model + profiler = FLOPSProfiler() + + jvp_loss = JVPRegularizedLoss( + model=model, + criterion=criterion, + jvp_reg=rocm_config.continuous_learning.jvp_reg, + deltax_norm=rocm_config.continuous_learning.deltax_norm, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) + hist_batch = next(iter(hist_train_loader)) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Run update step + loss_curr, loss_mem, loss_total = step_method_jvp_reg( + model=model, + criterion=criterion, + optimizer=optimizer, + cfg=rocm_config, + iter=0, + train_batch=train_batch, + hist_batch=hist_batch, + profiler=profiler, + jvp_loss=jvp_loss, + ) + + assert loss_curr > 0, "Current loss should be positive" + assert loss_mem > 0, "Memory loss should be positive" + assert loss_total > 0, "Total loss should be positive" + + def test_jvp_step_updates_weights(self, rocm_config, 
harness_with_history): + """Test that JVP update step modifies model weights.""" + criterion = harness_with_history.get_criterion() + optimizer = harness_with_history.get_optmizer() + model = harness_with_history.model + profiler = FLOPSProfiler() + + # Get initial weights + initial_weights = { + name: param.clone().detach() for name, param in model.named_parameters() + } + + jvp_loss = JVPRegularizedLoss( + model=model, + criterion=criterion, + jvp_reg=rocm_config.continuous_learning.jvp_reg, + deltax_norm=rocm_config.continuous_learning.deltax_norm, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) + hist_batch = next(iter(hist_train_loader)) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Run update step + step_method_jvp_reg( + model=model, + criterion=criterion, + optimizer=optimizer, + cfg=rocm_config, + iter=0, + train_batch=train_batch, + hist_batch=hist_batch, + profiler=profiler, + jvp_loss=jvp_loss, + ) + + # Check weights changed + weights_changed = False + for name, param in model.named_parameters(): + if not torch.allclose(param, initial_weights[name], atol=1e-6): + weights_changed = True + break + + assert weights_changed, "No weights updated after JVP step" + + def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): + """Test that multiple JVP update steps work correctly.""" + criterion = harness_with_history.get_criterion() + optimizer = harness_with_history.get_optmizer() + model = harness_with_history.model + profiler = FLOPSProfiler() + + jvp_loss = JVPRegularizedLoss( + model=model, + criterion=criterion, + jvp_reg=rocm_config.continuous_learning.jvp_reg, + deltax_norm=rocm_config.continuous_learning.deltax_norm, + ) + + # Get loaders + train_loader, _ = 
harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_iter = iter(train_loader) + hist_iter = iter(hist_train_loader) + + losses = [] + for i in range(5): + train_batch = next(train_iter) + hist_batch = next(hist_iter) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + loss_curr, loss_mem, loss_total = step_method_jvp_reg( + model=model, + criterion=criterion, + optimizer=optimizer, + cfg=rocm_config, + iter=i, + train_batch=train_batch, + hist_batch=hist_batch, + profiler=profiler, + jvp_loss=jvp_loss, + ) + + losses.append(loss_total) + + # All losses should be positive + assert all(loss > 0 for loss in losses), "Some losses are not positive" diff --git a/tests/deployment/frontier/test_model_harness_rocm.py b/tests/deployment/frontier/test_model_harness_rocm.py new file mode 100644 index 0000000..c8df404 --- /dev/null +++ b/tests/deployment/frontier/test_model_harness_rocm.py @@ -0,0 +1,228 @@ +"""Tests to verify MNIST model harness works correctly with ROCm.""" + +import pytest +import torch + +from config.configuration import ( + Config, + ModelCfg, + DataCfg, + TrainCfg, + ContinualLearningCfg, + DriftDetectionCfg, +) +from examples.mnist.model import MNIST_CNN + + +@pytest.fixture +def rocm_config(): + """Create a config for ROCm/GPU testing.""" + return Config( + model=ModelCfg(name="mnist_cnn", pretrained_path=""), + data=DataCfg(name="mnist", path="./data"), + train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), + continual_learning=ContinualLearningCfg(), + drift_detection=DriftDetectionCfg(), + seed=42, + device="cuda", + multi_gpu=False, + ) + + +@pytest.fixture +def harness(rocm_config): + """Create MNIST harness and initialize data stream.""" + harness = MNIST_CNN(rocm_config) + harness.update_data_stream() + return harness + + +class TestModelLoading: + """Tests for model loading and GPU 
placement.""" + + def test_model_on_gpu(self, harness): + """Test that model is moved to GPU.""" + device = next(harness.model.parameters()).device + assert device.type == "cuda", f"Model not on GPU, found {device}" + + def test_model_device_matches_config(self, harness): + """Test that model device matches config device.""" + device = next(harness.model.parameters()).device + assert str(device).startswith("cuda") + + +class TestDataLoader: + """Tests for data loader functionality.""" + + def test_data_loaders_created(self, harness): + """Test that data loaders are created after update_data_stream.""" + train_loader, val_loader = harness.get_cur_data_loaders() + assert train_loader is not None, "Train loader is None" + assert val_loader is not None, "Val loader is None" + + def test_data_loader_batch_shape(self, harness): + """Test that data loader produces correct batch shapes.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + assert x.dim() == 3, f"Expected 3D input (B, H, W), got {x.dim()}D" + assert y.dim() == 1, f"Expected 1D labels, got {y.dim()}D" + assert x.shape[0] == y.shape[0], "Batch size mismatch between x and y" + + def test_data_moves_to_gpu(self, harness): + """Test that data can be moved to GPU.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x_gpu = x.to(harness.cfg.device) + y_gpu = y.to(harness.cfg.device) + assert x_gpu.is_cuda, "Input tensor not on GPU" + assert y_gpu.is_cuda, "Label tensor not on GPU" + + +class TestForwardPass: + """Tests for model forward pass.""" + + def test_forward_pass_runs(self, harness): + """Test that forward pass executes without error.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + + harness.model.eval() + with torch.no_grad(): + output = harness.model(x) + + assert output is not None, "Forward pass returned None" 
+ + def test_forward_pass_output_shape(self, harness): + """Test that forward pass produces correct output shape.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + + harness.model.eval() + with torch.no_grad(): + output = harness.model(x) + + assert output.shape[0] == x.shape[0], "Batch size mismatch" + assert output.shape[1] == 10, f"Expected 10 classes, got {output.shape[1]}" + + def test_forward_pass_output_on_gpu(self, harness): + """Test that forward pass output is on GPU.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, _ = batch + x = x.to(harness.cfg.device) + + harness.model.eval() + with torch.no_grad(): + output = harness.model(x) + + assert output.is_cuda, "Output tensor not on GPU" + + +class TestEval: + """Tests for harness eval method.""" + + def test_eval_runs(self, harness): + """Test that eval method executes without error.""" + metrics = harness.eval() + assert metrics is not None, "Eval returned None" + + def test_eval_returns_metrics(self, harness): + """Test that eval returns expected number of metrics.""" + metrics = harness.eval() + assert len(metrics) == len(harness.eval_metrics), ( + f"Expected {len(harness.eval_metrics)} metrics, got {len(metrics)}" + ) + + def test_eval_metrics_are_valid(self, harness): + """Test that eval metrics are valid floats.""" + metrics = harness.eval() + for i, metric in enumerate(metrics): + assert isinstance(metric, float), f"Metric {i} is not a float" + assert not torch.isnan(torch.tensor(metric)), f"Metric {i} is NaN" + + +class TestTrainingStep: + """Tests for a single training step.""" + + def test_training_step(self, harness): + """Test that a single training step executes without error.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + y = y.to(harness.cfg.device) + + harness.model.train() 
+ optimizer = harness.get_optmizer() + criterion = harness.get_criterion() + + optimizer.zero_grad() + output = harness.model(x) + loss = criterion(output, y) + loss.backward() + optimizer.step() + + assert loss.item() > 0, "Loss should be positive" + + def test_gradients_computed(self, harness): + """Test that gradients are computed during backward pass.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + y = y.to(harness.cfg.device) + + harness.model.train() + optimizer = harness.get_optmizer() + criterion = harness.get_criterion() + + optimizer.zero_grad() + output = harness.model(x) + loss = criterion(output, y) + loss.backward() + + has_grad = False + for param in harness.model.parameters(): + if param.grad is not None and param.grad.abs().sum() > 0: + has_grad = True + break + + assert has_grad, "No gradients computed" + + def test_weights_updated(self, harness): + """Test that weights are updated after optimizer step.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + y = y.to(harness.cfg.device) + + harness.model.train() + optimizer = harness.get_optmizer() + criterion = harness.get_criterion() + + # Get initial weights + initial_weights = { + name: param.clone() for name, param in harness.model.named_parameters() + } + + optimizer.zero_grad() + output = harness.model(x) + loss = criterion(output, y) + loss.backward() + optimizer.step() + + # Check weights changed + weights_changed = False + for name, param in harness.model.named_parameters(): + if not torch.equal(param, initial_weights[name]): + weights_changed = True + break + + assert weights_changed, "Weights not updated after optimizer step" diff --git a/tests/deployment/frontier/test_rocm_install.py b/tests/deployment/frontier/test_rocm_install.py new file mode 100644 index 0000000..3e7c7b8 --- /dev/null +++ 
b/tests/deployment/frontier/test_rocm_install.py @@ -0,0 +1,60 @@ +"""Tests to verify ROCm installation and PyTorch GPU support.""" + + +def test_torch_import(): + """Test that PyTorch can be imported.""" + import torch + + assert torch is not None, "PyTorch import failed" + + +def test_torchvision_import(): + """Test that torchvision can be imported.""" + import torchvision + + assert torchvision is not None, "torchvision import failed" + + +def test_rocm_available(): + """Test that ROCm/HIP is available through PyTorch.""" + import torch + + assert torch.cuda.is_available(), "CUDA/ROCm is not available" + + +def test_gpu_count(): + """Test that at least one GPU is detected.""" + import torch + + gpu_count = torch.cuda.device_count() + assert gpu_count > 0, f"No GPUs detected, found {gpu_count}" + + +def test_gpu_properties(): + """Test that GPU properties can be queried.""" + import torch + + assert torch.cuda.is_available(), "CUDA/ROCm not available" + for i in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(i) + assert props.name is not None + assert props.total_memory > 0 + + +def test_tensor_on_gpu(): + """Test that tensors can be created and moved to GPU.""" + import torch + + assert torch.cuda.is_available(), "CUDA/ROCm not available" + x = torch.randn(100, 100).cuda() + assert x.is_cuda, "Tensor not on GPU" + y = x @ x.T + assert y.is_cuda, "Result tensor not on GPU" + + +def test_torch_rocm_build(): + """Test that PyTorch was built with ROCm support.""" + import torch + + hip_version = getattr(torch.version, "hip", None) + assert hip_version is not None, "PyTorch not built with ROCm/HIP support" From e06402e6152196332ff532c1226bce45914931d7 Mon Sep 17 00:00:00 2001 From: "Rafael Zamora-Resendiz (AMCRD)" Date: Mon, 9 Feb 2026 12:28:20 -0500 Subject: [PATCH 04/12] Updated interfaces with JVP updater. 
--- .../frontier/test_jvp_update_rocm.py | 174 +++++++----------- 1 file changed, 68 insertions(+), 106 deletions(-) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py index a39e8c2..25880a6 100644 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -15,7 +15,6 @@ from examples.mnist.model import MNIST_CNN from training.updater.jvp_reg import JVPRegUpdater -from profilers import FLOPSProfiler @pytest.fixture @@ -26,7 +25,7 @@ def rocm_config(): data=DataCfg(name="mnist", path="./data"), train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), continual_learning=ContinualLearningCfg( - jvp_reg=0.001, deltax_norm=1.0, max_iter=5 + jvp_lambda=0.001, jvp_deltax_norm=1.0, max_iter=5 ), drift_detection=DriftDetectionCfg(), seed=42, @@ -44,28 +43,22 @@ def harness_with_history(rocm_config): return harness -class TestJVPRegularizedLoss: - """Tests for JVPRegularizedLoss module.""" +class TestJVPRegUpdater: + """Tests for JVPRegUpdater class.""" - def test_jvp_loss_creation(self, harness_with_history): - """Test that JVPRegularizedLoss can be created.""" - criterion = harness_with_history.get_criterion() - jvp_loss = JVPRegUpdater( - model=harness_with_history.model, - criterion=criterion, - jvp_reg=0.001, - deltax_norm=1.0, + def test_jvp_updater_creation(self, rocm_config, harness_with_history): + """Test that JVPRegUpdater can be created.""" + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) - assert jvp_loss is not None - - def test_jvp_loss_forward(self, rocm_config, harness_with_history): - """Test that JVPRegularizedLoss forward pass works on GPU.""" - criterion = harness_with_history.get_criterion() - jvp_loss = JVPRegularizedLoss( - model=harness_with_history.model, - criterion=criterion, - jvp_reg=0.001, - deltax_norm=1.0, + assert jvp_updater is not None + + def 
test_jvp_updater_forward_backward(self, rocm_config, harness_with_history): + """Test that JVPRegUpdater forward-backward pass works on GPU.""" + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -76,24 +69,23 @@ def test_jvp_loss_forward(self, rocm_config, harness_with_history): hist_batch = next(iter(hist_train_loader)) # Move to device - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - # Forward pass - grad_dict, loss_curr, loss_mem = jvp_loss(train_batch, hist_batch) + # Run forward-backward pass + jvp_updater.update_pre_fwd_bwd() + loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) + loss_mem = jvp_updater.update_post_fwd_bwd() - assert grad_dict is not None, "Gradient dict is None" assert loss_curr is not None, "Current loss is None" assert loss_mem is not None, "Memory loss is None" + assert loss_curr > 0, "Current loss should be positive" - def test_jvp_loss_gradients_on_gpu(self, rocm_config, harness_with_history): + def test_jvp_gradients_on_gpu(self, rocm_config, harness_with_history): """Test that JVP gradients are computed on GPU.""" - criterion = harness_with_history.get_criterion() - jvp_loss = JVPRegularizedLoss( - model=harness_with_history.model, - criterion=criterion, - jvp_reg=0.001, - deltax_norm=1.0, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -103,34 +95,31 @@ def test_jvp_loss_gradients_on_gpu(self, rocm_config, harness_with_history): train_batch = next(iter(train_loader)) hist_batch = next(iter(hist_train_loader)) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + 
hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) # Compute gradients - grad_dict, _, _ = jvp_loss(train_batch, hist_batch) + jvp_updater.update_pre_fwd_bwd() + jvp_updater.fwd_bwd(train_batch, hist_batch) + jvp_updater.update_post_fwd_bwd() # Check gradients exist for all parameters for name, param in harness_with_history.model.named_parameters(): - assert name in grad_dict, f"No gradient for {name}" - assert grad_dict[name].is_cuda, f"Gradient for {name} not on GPU" - assert not torch.isnan(grad_dict[name]).any(), f"NaN in gradient for {name}" + assert param.grad is not None, f"No gradient for {name}" + assert param.grad.is_cuda, f"Gradient for {name} not on GPU" + assert not torch.isnan(param.grad).any(), f"NaN in gradient for {name}" class TestJVPUpdateStep: - """Tests for step_method_jvp_reg function.""" + """Tests for JVP update step with optimizer.""" def test_jvp_step_runs(self, rocm_config, harness_with_history): """Test that JVP update step executes without error.""" - criterion = harness_with_history.get_criterion() optimizer = harness_with_history.get_optmizer() - model = harness_with_history.model - profiler = FLOPSProfiler() - jvp_loss = JVPRegularizedLoss( - model=model, - criterion=criterion, - jvp_reg=rocm_config.continuous_learning.jvp_reg, - deltax_norm=rocm_config.continuous_learning.deltax_norm, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -140,43 +129,32 @@ def test_jvp_step_runs(self, rocm_config, harness_with_history): train_batch = next(iter(train_loader)) hist_batch = next(iter(hist_train_loader)) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) # Run update step - loss_curr, loss_mem, loss_total = step_method_jvp_reg( - model=model, - criterion=criterion, 
- optimizer=optimizer, - cfg=rocm_config, - iter=0, - train_batch=train_batch, - hist_batch=hist_batch, - profiler=profiler, - jvp_loss=jvp_loss, - ) + optimizer.zero_grad() + jvp_updater.update_pre_fwd_bwd() + loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) + loss_mem = jvp_updater.update_post_fwd_bwd() + optimizer.step() assert loss_curr > 0, "Current loss should be positive" - assert loss_mem > 0, "Memory loss should be positive" - assert loss_total > 0, "Total loss should be positive" + assert loss_mem >= 0, "Memory loss should be non-negative" def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): """Test that JVP update step modifies model weights.""" - criterion = harness_with_history.get_criterion() optimizer = harness_with_history.get_optmizer() model = harness_with_history.model - profiler = FLOPSProfiler() # Get initial weights initial_weights = { name: param.clone().detach() for name, param in model.named_parameters() } - jvp_loss = JVPRegularizedLoss( - model=model, - criterion=criterion, - jvp_reg=rocm_config.continuous_learning.jvp_reg, - deltax_norm=rocm_config.continuous_learning.deltax_norm, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -186,21 +164,15 @@ def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): train_batch = next(iter(train_loader)) hist_batch = next(iter(hist_train_loader)) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) # Run update step - step_method_jvp_reg( - model=model, - criterion=criterion, - optimizer=optimizer, - cfg=rocm_config, - iter=0, - train_batch=train_batch, - hist_batch=hist_batch, - profiler=profiler, - jvp_loss=jvp_loss, - ) + optimizer.zero_grad() + jvp_updater.update_pre_fwd_bwd() + 
jvp_updater.fwd_bwd(train_batch, hist_batch) + jvp_updater.update_post_fwd_bwd() + optimizer.step() # Check weights changed weights_changed = False @@ -213,16 +185,11 @@ def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): """Test that multiple JVP update steps work correctly.""" - criterion = harness_with_history.get_criterion() optimizer = harness_with_history.get_optmizer() - model = harness_with_history.model - profiler = FLOPSProfiler() - jvp_loss = JVPRegularizedLoss( - model=model, - criterion=criterion, - jvp_reg=rocm_config.continuous_learning.jvp_reg, - deltax_norm=rocm_config.continuous_learning.deltax_norm, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get loaders @@ -237,21 +204,16 @@ def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): train_batch = next(train_iter) hist_batch = next(hist_iter) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] - - loss_curr, loss_mem, loss_total = step_method_jvp_reg( - model=model, - criterion=criterion, - optimizer=optimizer, - cfg=rocm_config, - iter=i, - train_batch=train_batch, - hist_batch=hist_batch, - profiler=profiler, - jvp_loss=jvp_loss, - ) + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) + + optimizer.zero_grad() + jvp_updater.update_pre_fwd_bwd() + loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) + loss_mem = jvp_updater.update_post_fwd_bwd() + optimizer.step() + loss_total = loss_curr + loss_mem losses.append(loss_total) # All losses should be positive From 4e07151c5e79af639ccf6b07b0eb009f942e62aa Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:39:55 -0500 Subject: [PATCH 05/12] Fixed config parameters. 
--- tests/deployment/frontier/test_jvp_update_rocm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py index 25880a6..245c95a 100644 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -25,7 +25,7 @@ def rocm_config(): data=DataCfg(name="mnist", path="./data"), train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), continual_learning=ContinualLearningCfg( - jvp_lambda=0.001, jvp_deltax_norm=1.0, max_iter=5 + jvp_lambda=0.001, jvp_deltax_norm=1.0, ), drift_detection=DriftDetectionCfg(), seed=42, From e6d140a2db9bf8827e6a3dd1b321f789b568d432 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:50:52 -0500 Subject: [PATCH 06/12] Added deployment readme. --- src/deployment/frontier/README.md | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 src/deployment/frontier/README.md diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md new file mode 100644 index 0000000..9c9b12a --- /dev/null +++ b/src/deployment/frontier/README.md @@ -0,0 +1,44 @@ +# Deployment + +## OLCF's Frontier + +### Setup +First, create a local virtual environment in your scratch directory and clone the repo: + +```bash +cd $MEMBERWORK # User scratch space +module load python # Load stable python +python -m venv my_env # Create a virtual environment +source ./my_env/bin/activate +pip install poetry +git clone https://github.com/AI-ModCon/BaseSim_Framework.git +``` + +To install dependencies and torch libraries with ROCM support (6.4.2), run from the project root: + +```bash +cd ./BaseSim_Framework +source ./src/deployment/frontier/install_rocm.sh +``` + +### Testing Setup + +> Note: Testing model harness and jvp update requires MNIST dataset download on first run. 
+> Download the dataset before submitting the run using: + +```bash +poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" +``` + +Prior to running experiments, test ROCM support from the project root: +> Pass project account via PROJECT_ACCOUNT +```bash +SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/test_rocm_install.sbatch +``` + +### Submit Job +If test pass, its safe to submit run from project root: + +```bash +SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/mnist_example.sbatch +``` From dbff5fc3667bbe2d5eae02613236f5d1eb6fae54 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 13:03:29 -0500 Subject: [PATCH 07/12] Passes ruff and mypy. --- tests/deployment/frontier/test_jvp_update_rocm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py index 245c95a..f45da2f 100644 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -25,7 +25,8 @@ def rocm_config(): data=DataCfg(name="mnist", path="./data"), train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), continual_learning=ContinualLearningCfg( - jvp_lambda=0.001, jvp_deltax_norm=1.0, + jvp_lambda=0.001, + jvp_deltax_norm=1.0, ), drift_detection=DriftDetectionCfg(), seed=42, From 2450b0a89f0c17b1f28cd755560bf53a921882c2 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Wed, 18 Feb 2026 14:30:00 -0500 Subject: [PATCH 08/12] Reduced tests to minimal Rocm support checks. 
--- src/deployment/frontier/README.md | 20 +- src/deployment/frontier/mnist_example.sbatch | 7 +- .../frontier/test_rocm_install.sbatch | 40 --- .../frontier/test_jvp_update_rocm.py | 221 ----------------- .../frontier/test_model_harness_rocm.py | 228 ------------------ .../deployment/frontier/test_rocm_install.py | 60 ----- tests/test_rocm.py | 138 +++++++++++ 7 files changed, 152 insertions(+), 562 deletions(-) delete mode 100644 src/deployment/frontier/test_rocm_install.sbatch delete mode 100644 tests/deployment/frontier/test_jvp_update_rocm.py delete mode 100644 tests/deployment/frontier/test_model_harness_rocm.py delete mode 100644 tests/deployment/frontier/test_rocm_install.py create mode 100644 tests/test_rocm.py diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md index 9c9b12a..9f8147d 100644 --- a/src/deployment/frontier/README.md +++ b/src/deployment/frontier/README.md @@ -21,23 +21,23 @@ cd ./BaseSim_Framework source ./src/deployment/frontier/install_rocm.sh ``` -### Testing Setup +Prior to running experiments, test ROCM support from the project root: +> Pass project account via PROJECT_ACCOUNT +```bash +poetry run pytest tests/test_rocm.py +``` + + +### Submit Job -> Note: Testing model harness and jvp update requires MNIST dataset download on first run. +> Note: Requires MNIST dataset download on first run. 
> Download the dataset before submitting the run using: ```bash poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" ``` -Prior to running experiments, test ROCM support from the project root: -> Pass project account via PROJECT_ACCOUNT -```bash -SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/test_rocm_install.sbatch -``` - -### Submit Job -If test pass, its safe to submit run from project root: +Submit run from project root: ```bash SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/mnist_example.sbatch diff --git a/src/deployment/frontier/mnist_example.sbatch b/src/deployment/frontier/mnist_example.sbatch index bf175b4..358d00f 100644 --- a/src/deployment/frontier/mnist_example.sbatch +++ b/src/deployment/frontier/mnist_example.sbatch @@ -14,9 +14,10 @@ module load gcc/12.2.0 module load rocm/6.4.2 # ROCm/MIOpen flags -mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen -export MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen -export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +ACCOUNT=$(sacct -j $SLURM_JOB_ID --format=Account --noheader | head -1 | tr -d ' ') +mkdir -p $MEMBERWORK/$ACCOUNT/miopen +export MIOPEN_USER_DB_PATH=$MEMBERWORK/$ACCOUNT/miopen +export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$ACCOUNT/miopen export WANDB_MODE=offline # Print environment info diff --git a/src/deployment/frontier/test_rocm_install.sbatch b/src/deployment/frontier/test_rocm_install.sbatch deleted file mode 100644 index 50c8f0c..0000000 --- a/src/deployment/frontier/test_rocm_install.sbatch +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -l -#SBATCH -J test_rocm_install -#SBATCH -t 0:10:00 -#SBATCH -N 1 -#SBATCH --exclusive -#SBATCH --ntasks-per-node=1 -#SBATCH --gpus-per-node=1 -#SBATCH -o output/test_rocm_install.o%j -#SBATCH -e output/test_rocm_install.e%j - -# Load required modules -module load PrgEnv-gnu -module load gcc/12.2.0 -module load rocm/6.4.2 - -# ROCm/MIOpen flags -mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen -export 
MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen -export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen - -# WANDBD set to offline; Need to set up proxy connections. -export WANDB_MODE=offline - -# Print environment info -echo "==============================================" -echo "ROCm Installation & Harness Test" -echo "==============================================" -echo "Date: $(date)" -echo "Hostname: $(hostname)" -echo "ROCM_PATH: ${ROCM_PATH}" -echo "==============================================" - -# Test torch rocm compatability -poetry run pytest tests/deployment/frontier/test_rocm_install.py -v - -# Test model harness (loaders, model, etc) -poetry run pytest tests/deployment/frontier/test_model_harness_rocm.py -v - -# Test jvp update -poetry run pytest tests/deployment/frontier/test_jvp_update_rocm.py -v diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py deleted file mode 100644 index f45da2f..0000000 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ /dev/null @@ -1,221 +0,0 @@ -"""Tests to verify JVP regularized update works correctly with ROCm.""" - -import pytest -import torch - -from config.configuration import ( - Config, - ModelCfg, - DataCfg, - TrainCfg, - ContinualLearningCfg, - DriftDetectionCfg, -) - -from examples.mnist.model import MNIST_CNN - -from training.updater.jvp_reg import JVPRegUpdater - - -@pytest.fixture -def rocm_config(): - """Create a config for ROCm/GPU testing.""" - return Config( - model=ModelCfg(name="mnist_cnn", pretrained_path=""), - data=DataCfg(name="mnist", path="./data"), - train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), - continual_learning=ContinualLearningCfg( - jvp_lambda=0.001, - jvp_deltax_norm=1.0, - ), - drift_detection=DriftDetectionCfg(), - seed=42, - device="cuda", - multi_gpu=False, - ) - - -@pytest.fixture -def harness_with_history(rocm_config): - """Create MNIST harness with historical data.""" - 
harness = MNIST_CNN(rocm_config) - harness.update_data_stream() # First stream - harness.update_data_stream() # Second stream (creates history) - return harness - - -class TestJVPRegUpdater: - """Tests for JVPRegUpdater class.""" - - def test_jvp_updater_creation(self, rocm_config, harness_with_history): - """Test that JVPRegUpdater can be created.""" - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - assert jvp_updater is not None - - def test_jvp_updater_forward_backward(self, rocm_config, harness_with_history): - """Test that JVPRegUpdater forward-backward pass works on GPU.""" - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - # Move to device - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Run forward-backward pass - jvp_updater.update_pre_fwd_bwd() - loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) - loss_mem = jvp_updater.update_post_fwd_bwd() - - assert loss_curr is not None, "Current loss is None" - assert loss_mem is not None, "Memory loss is None" - assert loss_curr > 0, "Current loss should be positive" - - def test_jvp_gradients_on_gpu(self, rocm_config, harness_with_history): - """Test that JVP gradients are computed on GPU.""" - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - 
hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Compute gradients - jvp_updater.update_pre_fwd_bwd() - jvp_updater.fwd_bwd(train_batch, hist_batch) - jvp_updater.update_post_fwd_bwd() - - # Check gradients exist for all parameters - for name, param in harness_with_history.model.named_parameters(): - assert param.grad is not None, f"No gradient for {name}" - assert param.grad.is_cuda, f"Gradient for {name} not on GPU" - assert not torch.isnan(param.grad).any(), f"NaN in gradient for {name}" - - -class TestJVPUpdateStep: - """Tests for JVP update step with optimizer.""" - - def test_jvp_step_runs(self, rocm_config, harness_with_history): - """Test that JVP update step executes without error.""" - optimizer = harness_with_history.get_optmizer() - - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Run update step - optimizer.zero_grad() - jvp_updater.update_pre_fwd_bwd() - loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) - loss_mem = jvp_updater.update_post_fwd_bwd() - optimizer.step() - - assert loss_curr > 0, "Current loss should be positive" - assert loss_mem >= 0, "Memory loss should be non-negative" - - def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): - """Test that JVP update step modifies model weights.""" - optimizer = harness_with_history.get_optmizer() - model = harness_with_history.model - - # Get initial weights - initial_weights = { - name: param.clone().detach() for name, param in model.named_parameters() - } - - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - 
modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Run update step - optimizer.zero_grad() - jvp_updater.update_pre_fwd_bwd() - jvp_updater.fwd_bwd(train_batch, hist_batch) - jvp_updater.update_post_fwd_bwd() - optimizer.step() - - # Check weights changed - weights_changed = False - for name, param in model.named_parameters(): - if not torch.allclose(param, initial_weights[name], atol=1e-6): - weights_changed = True - break - - assert weights_changed, "No weights updated after JVP step" - - def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): - """Test that multiple JVP update steps work correctly.""" - optimizer = harness_with_history.get_optmizer() - - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get loaders - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_iter = iter(train_loader) - hist_iter = iter(hist_train_loader) - - losses = [] - for i in range(5): - train_batch = next(train_iter) - hist_batch = next(hist_iter) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - optimizer.zero_grad() - jvp_updater.update_pre_fwd_bwd() - loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) - loss_mem = jvp_updater.update_post_fwd_bwd() - optimizer.step() - - loss_total = loss_curr + loss_mem - losses.append(loss_total) - - # All losses should be positive - assert all(loss > 0 for loss in losses), "Some losses are not positive" diff --git 
a/tests/deployment/frontier/test_model_harness_rocm.py b/tests/deployment/frontier/test_model_harness_rocm.py deleted file mode 100644 index c8df404..0000000 --- a/tests/deployment/frontier/test_model_harness_rocm.py +++ /dev/null @@ -1,228 +0,0 @@ -"""Tests to verify MNIST model harness works correctly with ROCm.""" - -import pytest -import torch - -from config.configuration import ( - Config, - ModelCfg, - DataCfg, - TrainCfg, - ContinualLearningCfg, - DriftDetectionCfg, -) -from examples.mnist.model import MNIST_CNN - - -@pytest.fixture -def rocm_config(): - """Create a config for ROCm/GPU testing.""" - return Config( - model=ModelCfg(name="mnist_cnn", pretrained_path=""), - data=DataCfg(name="mnist", path="./data"), - train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), - continual_learning=ContinualLearningCfg(), - drift_detection=DriftDetectionCfg(), - seed=42, - device="cuda", - multi_gpu=False, - ) - - -@pytest.fixture -def harness(rocm_config): - """Create MNIST harness and initialize data stream.""" - harness = MNIST_CNN(rocm_config) - harness.update_data_stream() - return harness - - -class TestModelLoading: - """Tests for model loading and GPU placement.""" - - def test_model_on_gpu(self, harness): - """Test that model is moved to GPU.""" - device = next(harness.model.parameters()).device - assert device.type == "cuda", f"Model not on GPU, found {device}" - - def test_model_device_matches_config(self, harness): - """Test that model device matches config device.""" - device = next(harness.model.parameters()).device - assert str(device).startswith("cuda") - - -class TestDataLoader: - """Tests for data loader functionality.""" - - def test_data_loaders_created(self, harness): - """Test that data loaders are created after update_data_stream.""" - train_loader, val_loader = harness.get_cur_data_loaders() - assert train_loader is not None, "Train loader is None" - assert val_loader is not None, "Val loader is None" - - def 
test_data_loader_batch_shape(self, harness): - """Test that data loader produces correct batch shapes.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - assert x.dim() == 3, f"Expected 3D input (B, H, W), got {x.dim()}D" - assert y.dim() == 1, f"Expected 1D labels, got {y.dim()}D" - assert x.shape[0] == y.shape[0], "Batch size mismatch between x and y" - - def test_data_moves_to_gpu(self, harness): - """Test that data can be moved to GPU.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x_gpu = x.to(harness.cfg.device) - y_gpu = y.to(harness.cfg.device) - assert x_gpu.is_cuda, "Input tensor not on GPU" - assert y_gpu.is_cuda, "Label tensor not on GPU" - - -class TestForwardPass: - """Tests for model forward pass.""" - - def test_forward_pass_runs(self, harness): - """Test that forward pass executes without error.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - - harness.model.eval() - with torch.no_grad(): - output = harness.model(x) - - assert output is not None, "Forward pass returned None" - - def test_forward_pass_output_shape(self, harness): - """Test that forward pass produces correct output shape.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - - harness.model.eval() - with torch.no_grad(): - output = harness.model(x) - - assert output.shape[0] == x.shape[0], "Batch size mismatch" - assert output.shape[1] == 10, f"Expected 10 classes, got {output.shape[1]}" - - def test_forward_pass_output_on_gpu(self, harness): - """Test that forward pass output is on GPU.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, _ = batch - x = x.to(harness.cfg.device) - - harness.model.eval() - with torch.no_grad(): - output = harness.model(x) - - 
assert output.is_cuda, "Output tensor not on GPU" - - -class TestEval: - """Tests for harness eval method.""" - - def test_eval_runs(self, harness): - """Test that eval method executes without error.""" - metrics = harness.eval() - assert metrics is not None, "Eval returned None" - - def test_eval_returns_metrics(self, harness): - """Test that eval returns expected number of metrics.""" - metrics = harness.eval() - assert len(metrics) == len(harness.eval_metrics), ( - f"Expected {len(harness.eval_metrics)} metrics, got {len(metrics)}" - ) - - def test_eval_metrics_are_valid(self, harness): - """Test that eval metrics are valid floats.""" - metrics = harness.eval() - for i, metric in enumerate(metrics): - assert isinstance(metric, float), f"Metric {i} is not a float" - assert not torch.isnan(torch.tensor(metric)), f"Metric {i} is NaN" - - -class TestTrainingStep: - """Tests for a single training step.""" - - def test_training_step(self, harness): - """Test that a single training step executes without error.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - y = y.to(harness.cfg.device) - - harness.model.train() - optimizer = harness.get_optmizer() - criterion = harness.get_criterion() - - optimizer.zero_grad() - output = harness.model(x) - loss = criterion(output, y) - loss.backward() - optimizer.step() - - assert loss.item() > 0, "Loss should be positive" - - def test_gradients_computed(self, harness): - """Test that gradients are computed during backward pass.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - y = y.to(harness.cfg.device) - - harness.model.train() - optimizer = harness.get_optmizer() - criterion = harness.get_criterion() - - optimizer.zero_grad() - output = harness.model(x) - loss = criterion(output, y) - loss.backward() - - has_grad = False - for param in 
harness.model.parameters(): - if param.grad is not None and param.grad.abs().sum() > 0: - has_grad = True - break - - assert has_grad, "No gradients computed" - - def test_weights_updated(self, harness): - """Test that weights are updated after optimizer step.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - y = y.to(harness.cfg.device) - - harness.model.train() - optimizer = harness.get_optmizer() - criterion = harness.get_criterion() - - # Get initial weights - initial_weights = { - name: param.clone() for name, param in harness.model.named_parameters() - } - - optimizer.zero_grad() - output = harness.model(x) - loss = criterion(output, y) - loss.backward() - optimizer.step() - - # Check weights changed - weights_changed = False - for name, param in harness.model.named_parameters(): - if not torch.equal(param, initial_weights[name]): - weights_changed = True - break - - assert weights_changed, "Weights not updated after optimizer step" diff --git a/tests/deployment/frontier/test_rocm_install.py b/tests/deployment/frontier/test_rocm_install.py deleted file mode 100644 index 3e7c7b8..0000000 --- a/tests/deployment/frontier/test_rocm_install.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Tests to verify ROCm installation and PyTorch GPU support.""" - - -def test_torch_import(): - """Test that PyTorch can be imported.""" - import torch - - assert torch is not None, "PyTorch import failed" - - -def test_torchvision_import(): - """Test that torchvision can be imported.""" - import torchvision - - assert torchvision is not None, "torchvision import failed" - - -def test_rocm_available(): - """Test that ROCm/HIP is available through PyTorch.""" - import torch - - assert torch.cuda.is_available(), "CUDA/ROCm is not available" - - -def test_gpu_count(): - """Test that at least one GPU is detected.""" - import torch - - gpu_count = torch.cuda.device_count() - assert gpu_count > 0, f"No GPUs 
detected, found {gpu_count}" - - -def test_gpu_properties(): - """Test that GPU properties can be queried.""" - import torch - - assert torch.cuda.is_available(), "CUDA/ROCm not available" - for i in range(torch.cuda.device_count()): - props = torch.cuda.get_device_properties(i) - assert props.name is not None - assert props.total_memory > 0 - - -def test_tensor_on_gpu(): - """Test that tensors can be created and moved to GPU.""" - import torch - - assert torch.cuda.is_available(), "CUDA/ROCm not available" - x = torch.randn(100, 100).cuda() - assert x.is_cuda, "Tensor not on GPU" - y = x @ x.T - assert y.is_cuda, "Result tensor not on GPU" - - -def test_torch_rocm_build(): - """Test that PyTorch was built with ROCm support.""" - import torch - - hip_version = getattr(torch.version, "hip", None) - assert hip_version is not None, "PyTorch not built with ROCm/HIP support" diff --git a/tests/test_rocm.py b/tests/test_rocm.py new file mode 100644 index 0000000..c15f931 --- /dev/null +++ b/tests/test_rocm.py @@ -0,0 +1,138 @@ +"""Tests to verify ROCm/CUDA installation and PyTorch GPU support.""" + +import pytest +import torch + +requires_gpu = pytest.mark.skipif( + not torch.cuda.is_available(), reason="No CUDA/ROCm GPU available" +) +requires_rocm = pytest.mark.skipif( + getattr(torch.version, "hip", None) is None, reason="ROCm/HIP not available" +) + + +@pytest.fixture +def gpu_device(): + """Return the GPU device string.""" + return "cuda" + + +def test_torch_import(): + """Test that PyTorch can be imported.""" + assert torch is not None, "PyTorch import failed" + + +def test_torchvision_import(): + """Test that torchvision can be imported.""" + import torchvision + + assert torchvision is not None, "torchvision import failed" + + +@requires_gpu +def test_rocm_available(): + """Test that CUDA/ROCm is available through PyTorch.""" + assert torch.cuda.is_available(), "CUDA/ROCm is not available" + + +@requires_gpu +def test_gpu_count(): + """Test that at least one GPU 
is detected.""" + gpu_count = torch.cuda.device_count() + assert gpu_count > 0, f"No GPUs detected, found {gpu_count}" + + +@requires_gpu +def test_gpu_properties(): + """Test that GPU properties can be queried.""" + for i in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(i) + assert props.name is not None + assert props.total_memory > 0 + + +@requires_gpu +def test_tensor_on_gpu(): + """Test that tensors can be created and moved to GPU.""" + x = torch.randn(100, 100).cuda() + assert x.is_cuda, "Tensor not on GPU" + y = x @ x.T + assert y.is_cuda, "Result tensor not on GPU" + + +@requires_gpu +def test_tensor_to_device(gpu_device): + """Test that tensor.to(device) works (used by harness and JVP tests).""" + x = torch.randn(32, 10) + x_gpu = x.to(gpu_device) + assert x_gpu.is_cuda, "tensor.to(device) failed" + assert x_gpu.shape == x.shape, "Shape changed after .to()" + + +@requires_gpu +def test_autograd_on_gpu(gpu_device): + """Test that backward pass and gradient computation work on GPU.""" + x = torch.randn(16, 4, device=gpu_device, requires_grad=True) + w = torch.randn(4, 2, device=gpu_device, requires_grad=True) + loss = (x @ w).sum() + loss.backward() + assert w.grad is not None, "Gradients not computed" + assert w.grad.is_cuda, "Gradients not on GPU" + assert w.grad.shape == w.shape, "Gradient shape mismatch" + + +@requires_gpu +def test_optimizer_step_on_gpu(gpu_device): + """Test that optimizer zero_grad/step work on GPU parameters.""" + param = torch.nn.Parameter(torch.randn(4, 4, device=gpu_device)) + optimizer = torch.optim.SGD([param], lr=0.1) + + initial = param.clone().detach() + loss = param.sum() + optimizer.zero_grad() + loss.backward() + optimizer.step() + + assert not torch.equal(param, initial), "Weights not updated after step" + + +@requires_gpu +def test_no_grad_context(gpu_device): + """Test that torch.no_grad() inference mode works on GPU.""" + w = torch.randn(4, 4, device=gpu_device, requires_grad=True) + with 
torch.no_grad(): + y = w @ w.T + assert y.is_cuda, "Output not on GPU" + assert not y.requires_grad, "Output should not require grad inside no_grad" + + +@requires_gpu +def test_tensor_clone_detach(gpu_device): + """Test that clone/detach work on GPU tensors (used for weight snapshots).""" + x = torch.randn(4, 4, device=gpu_device, requires_grad=True) + y = x.clone().detach() + assert y.is_cuda, "Cloned tensor not on GPU" + assert not y.requires_grad, "Detached tensor should not require grad" + assert torch.equal(x, y), "Cloned tensor values differ" + + +@requires_gpu +def test_torch_comparison_ops(gpu_device): + """Test torch.equal, torch.allclose, and torch.isnan on GPU tensors.""" + a = torch.tensor([1.0, 2.0, 3.0], device=gpu_device) + b = a.clone() + + assert torch.equal(a, b), "torch.equal failed on identical GPU tensors" + assert torch.allclose(a, b, atol=1e-6), "torch.allclose failed" + + c = torch.tensor([1.0, float("nan"), 3.0], device=gpu_device) + nan_mask = torch.isnan(c) + assert nan_mask[1].item(), "torch.isnan failed to detect NaN on GPU" + assert not nan_mask[0].item(), "torch.isnan false positive on GPU" + + +@requires_rocm +def test_torch_rocm_build(): + """Test that PyTorch was built with ROCm support.""" + hip_version = getattr(torch.version, "hip", None) + assert hip_version is not None, "PyTorch not built with ROCm/HIP support" From 5287f9723e39ef4abcd61a6a4610088afafafa8f Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz <15003285+rz4@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:41:27 -0500 Subject: [PATCH 09/12] Update README.md --- src/deployment/frontier/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md index 9f8147d..cffa812 100644 --- a/src/deployment/frontier/README.md +++ b/src/deployment/frontier/README.md @@ -22,7 +22,6 @@ source ./src/deployment/frontier/install_rocm.sh ``` Prior to running experiments, test ROCM support 
from the project root: -> Pass project account via PROJECT_ACCOUNT ```bash poetry run pytest tests/test_rocm.py ``` @@ -40,5 +39,5 @@ poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist Submit run from project root: ```bash -SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/mnist_example.sbatch +sbatch -A lrnxxx src/deployment/frontier/mnist_example.sbatch ``` From 459781beb608e5aa7da6ec41bec956da106ac62b Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Tue, 17 Mar 2026 13:17:40 -0400 Subject: [PATCH 10/12] Standardized virtual environment install. Follows same deployment as Perlmutter w/ additional ROCM dependencies. --- .../frontier/{install_rocm.sh => install_venv.sh} | 9 +++++++-- src/deployment/frontier/mnist_example.sbatch | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) rename src/deployment/frontier/{install_rocm.sh => install_venv.sh} (50%) diff --git a/src/deployment/frontier/install_rocm.sh b/src/deployment/frontier/install_venv.sh similarity index 50% rename from src/deployment/frontier/install_rocm.sh rename to src/deployment/frontier/install_venv.sh index 7a9ec78..b6b3a87 100644 --- a/src/deployment/frontier/install_rocm.sh +++ b/src/deployment/frontier/install_venv.sh @@ -1,11 +1,16 @@ #!/bin/bash module load PrgEnv-gnu +module load python/3.13.0 module load gcc/12.2.0 module load rocm/6.4.2 -poetry lock -poetry install +python -m venv .venv # Create a virtual environment +source ./.venv/bin/activate # Activate environment +pip install poetry # Install poetry +poetry lock # Sync poetry +poetry install --no-cache # Install project dependencies + poetry run pip install --force-reinstall \ torch==2.9.1+rocm6.4 \ torchvision==0.24.1+rocm6.4 \ diff --git a/src/deployment/frontier/mnist_example.sbatch b/src/deployment/frontier/mnist_example.sbatch index 358d00f..b50c9d0 100644 --- a/src/deployment/frontier/mnist_example.sbatch +++ b/src/deployment/frontier/mnist_example.sbatch @@ -10,9 +10,13 @@ # Load required
modules module load PrgEnv-gnu +module load python/3.13.0 module load gcc/12.2.0 module load rocm/6.4.2 +# Activate project virtual environment +source ./.venv/bin/activate + # ROCm/MIOpen flags ACCOUNT=$(sacct -j $SLURM_JOB_ID --format=Account --noheader | head -1 | tr -d ' ') mkdir -p $MEMBERWORK/$ACCOUNT/miopen @@ -30,4 +34,4 @@ echo "ROCM_PATH: ${ROCM_PATH}" echo "==============================================" # Run example -poetry run python -m src.main --config ./examples/mnist/mnist.toml +python -m src.main --config ./examples/mnist/mnist.toml From f6ba24f41480745806c9fabb2cead0d464f562dc Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Tue, 17 Mar 2026 13:21:02 -0400 Subject: [PATCH 11/12] Added link to deployment doc. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 9b28e0b..712cdd7 100644 --- a/README.md +++ b/README.md @@ -70,3 +70,9 @@ poetry run pytest ## Output Training logs report the task id, training/test accuracy, and replay-memory accuracy every five epochs. Accuracy is computed via `test(...)` on both the current task and the accumulated memory set. + +## Deployment + +Platform-specific deployment guides: + +- [OLCF Frontier](./src/deployment/frontier/README.md) From f2300de08880d7530dea8b442542a81299b74fa3 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Tue, 17 Mar 2026 13:28:41 -0400 Subject: [PATCH 12/12] Updated deployment README.
--- src/deployment/frontier/README.md | 53 ++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md index cffa812..c95a1ad 100644 --- a/src/deployment/frontier/README.md +++ b/src/deployment/frontier/README.md @@ -1,24 +1,36 @@ # Deployment -## OLCF's Frontier +## OLCF Frontier ### Setup -First, create a local virtual enviroment in scratch directory and clone repo: + +Clone the repo into your scratch directory and run the install script: ```bash -cd $MEMBERWORK # User scratch space -module load python # Load stable python -python -m venv my_env # Create a virtual environment -source ./my_env/bin/activate -pip install poetry +cd $MEMBERWORK git clone https://github.com/AI-ModCon/BaseSim_Framework.git +cd BaseSim_Framework +source ./src/deployment/frontier/install_venv.sh ``` -To install dependencies and torch libraries with ROCM support (6.4.2), run from the project root: +`install_venv.sh` creates a virtual environment, installs Poetry, and uses it to resolve and install project dependencies. The environment is saved to `.venv` in the project root.
The script runs the following: ```bash -cd ./BaseSim_Framework -source ./src/deployment/frontier/install_rocm.sh +module load PrgEnv-gnu +module load python/3.13.0 +module load gcc/12.2.0 +module load rocm/6.4.2 + +python -m venv .venv # Create a virtual environment +source ./.venv/bin/activate # Activate environment +pip install poetry # Install poetry +poetry lock # Sync poetry +poetry install --no-cache # Install project dependencies + +poetry run pip install --force-reinstall \ torch==2.9.1+rocm6.4 \ torchvision==0.24.1+rocm6.4 \ --index-url https://download.pytorch.org/whl/rocm6.4 ``` Prior to running experiments, test ROCM support from the project root: @@ -26,18 +38,23 @@ Prior to running experiments, test ROCM support from the project root: poetry run pytest tests/test_rocm.py ``` +### Submitting a Job + +> **Note:** The MNIST example requires the dataset, which is downloaded on first run. Download it before submitting a batch job: +> +> ```bash +> poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" +> ``` -### Submit Job +The virtual environment can be sourced directly at the top of your SLURM script (`source .venv/bin/activate`), so Poetry is not needed at runtime — jobs run against the installed environment. -> Note: Requires MNIST dataset download on first run. -> Download the dataset before submitting the run using: +From the project root: ```bash -poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" +sbatch -A xxx src/deployment/frontier/mnist_example.sbatch ``` -Submit run from project root: +### Troubleshooting -```bash -sbatch -A lrnxxx src/deployment/frontier/mnist_example.sbatch -``` +- **`poetry install` fails to connect to PyPI** — Run `poetry lock` first, then retry. The lock file caches package download specs and may be stale on a new host. +- **`poetry install` fails with disk quota errors** — Poetry's default cache is in the home directory, which has limited space.
Retry with `poetry install --no-cache` or free up space in `$HOME`.