From f3fa2247107103338e1d65ba7f535ea01afb1732 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 11:48:50 -0500 Subject: [PATCH 01/12] Rocm installation script. --- src/deployment/__init__.py | 0 src/deployment/frontier/install_rocm.sh | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 src/deployment/__init__.py create mode 100644 src/deployment/frontier/install_rocm.sh diff --git a/src/deployment/__init__.py b/src/deployment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/deployment/frontier/install_rocm.sh b/src/deployment/frontier/install_rocm.sh new file mode 100644 index 0000000..7a9ec78 --- /dev/null +++ b/src/deployment/frontier/install_rocm.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +module load PrgEnv-gnu +module load gcc/12.2.0 +module load rocm/6.4.2 + +poetry lock +poetry install +poetry run pip install --force-reinstall \ + torch==2.9.1+rocm6.4 \ + torchvision==0.24.1+rocm6.4 \ + --index-url https://download.pytorch.org/whl/rocm6.4 From a183f945c24086aa5d5320ea4c56669de9f75ebd Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:22:19 -0500 Subject: [PATCH 02/12] Frontier slurm scripts. 
--- src/deployment/frontier/mnist_example.sbatch | 32 +++++++++++++++ .../frontier/test_rocm_install.sbatch | 40 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 src/deployment/frontier/mnist_example.sbatch create mode 100644 src/deployment/frontier/test_rocm_install.sbatch diff --git a/src/deployment/frontier/mnist_example.sbatch b/src/deployment/frontier/mnist_example.sbatch new file mode 100644 index 0000000..bf175b4 --- /dev/null +++ b/src/deployment/frontier/mnist_example.sbatch @@ -0,0 +1,32 @@ +#!/bin/bash -l +#SBATCH -J modcon_basesim +#SBATCH -t 0:20:00 +#SBATCH -N 1 +#SBATCH -p batch +#SBATCH --exclusive +#SBATCH --ntasks-per-node=8 +#SBATCH -o output/mnist_example.o%j +#SBATCH -e output/mnist_example.e%j + +# Load required modules +module load PrgEnv-gnu +module load gcc/12.2.0 +module load rocm/6.4.2 + +# ROCm/MIOpen flags +mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +export WANDB_MODE=offline + +# Print environment info +echo "==============================================" +echo "MNIST Example" +echo "==============================================" +echo "Date: $(date)" +echo "Hostname: $(hostname)" +echo "ROCM_PATH: ${ROCM_PATH}" +echo "==============================================" + +# Run example +poetry run python -m src.main --config ./examples/mnist/mnist.toml diff --git a/src/deployment/frontier/test_rocm_install.sbatch b/src/deployment/frontier/test_rocm_install.sbatch new file mode 100644 index 0000000..50c8f0c --- /dev/null +++ b/src/deployment/frontier/test_rocm_install.sbatch @@ -0,0 +1,40 @@ +#!/bin/bash -l +#SBATCH -J test_rocm_install +#SBATCH -t 0:10:00 +#SBATCH -N 1 +#SBATCH --exclusive +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node=1 +#SBATCH -o output/test_rocm_install.o%j +#SBATCH -e output/test_rocm_install.e%j + +# Load required modules +module load PrgEnv-gnu +module 
load gcc/12.2.0 +module load rocm/6.4.2 + +# ROCm/MIOpen flags +mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen + +# WANDB set to offline; need to set up proxy connections. +export WANDB_MODE=offline + +# Print environment info +echo "==============================================" +echo "ROCm Installation & Harness Test" +echo "==============================================" +echo "Date: $(date)" +echo "Hostname: $(hostname)" +echo "ROCM_PATH: ${ROCM_PATH}" +echo "==============================================" + +# Test torch rocm compatibility +poetry run pytest tests/deployment/frontier/test_rocm_install.py -v + +# Test model harness (loaders, model, etc) +poetry run pytest tests/deployment/frontier/test_model_harness_rocm.py -v + +# Test jvp update +poetry run pytest tests/deployment/frontier/test_jvp_update_rocm.py -v From fa68f1eb0923bcd12bdcb75dd69dfd49beb1fda3 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:23:52 -0500 Subject: [PATCH 03/12] Frontier Rocm tests. 
--- tests/conftest.py | 5 + .../frontier/test_jvp_update_rocm.py | 258 ++++++++++++++++++ .../frontier/test_model_harness_rocm.py | 228 ++++++++++++++++ .../deployment/frontier/test_rocm_install.py | 60 ++++ 4 files changed, 551 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/deployment/frontier/test_jvp_update_rocm.py create mode 100644 tests/deployment/frontier/test_model_harness_rocm.py create mode 100644 tests/deployment/frontier/test_rocm_install.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..1b1ba82 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,5 @@ +import sys +from pathlib import Path + +repo_root = Path(__file__).parent.parent +sys.path.insert(0, str(repo_root)) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py new file mode 100644 index 0000000..a39e8c2 --- /dev/null +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -0,0 +1,258 @@ +"""Tests to verify JVP regularized update works correctly with ROCm.""" + +import pytest +import torch + +from config.configuration import ( + Config, + ModelCfg, + DataCfg, + TrainCfg, + ContinualLearningCfg, + DriftDetectionCfg, +) + +from examples.mnist.model import MNIST_CNN + +from training.updater.jvp_reg import JVPRegUpdater +from profilers import FLOPSProfiler + + +@pytest.fixture +def rocm_config(): + """Create a config for ROCm/GPU testing.""" + return Config( + model=ModelCfg(name="mnist_cnn", pretrained_path=""), + data=DataCfg(name="mnist", path="./data"), + train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), + continual_learning=ContinualLearningCfg( + jvp_reg=0.001, deltax_norm=1.0, max_iter=5 + ), + drift_detection=DriftDetectionCfg(), + seed=42, + device="cuda", + multi_gpu=False, + ) + + +@pytest.fixture +def harness_with_history(rocm_config): + """Create MNIST harness with historical data.""" + harness = MNIST_CNN(rocm_config) + 
harness.update_data_stream() # First stream + harness.update_data_stream() # Second stream (creates history) + return harness + + +class TestJVPRegularizedLoss: + """Tests for JVPRegularizedLoss module.""" + + def test_jvp_loss_creation(self, harness_with_history): + """Test that JVPRegularizedLoss can be created.""" + criterion = harness_with_history.get_criterion() + jvp_loss = JVPRegUpdater( + model=harness_with_history.model, + criterion=criterion, + jvp_reg=0.001, + deltax_norm=1.0, + ) + assert jvp_loss is not None + + def test_jvp_loss_forward(self, rocm_config, harness_with_history): + """Test that JVPRegularizedLoss forward pass works on GPU.""" + criterion = harness_with_history.get_criterion() + jvp_loss = JVPRegularizedLoss( + model=harness_with_history.model, + criterion=criterion, + jvp_reg=0.001, + deltax_norm=1.0, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) + hist_batch = next(iter(hist_train_loader)) + + # Move to device + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Forward pass + grad_dict, loss_curr, loss_mem = jvp_loss(train_batch, hist_batch) + + assert grad_dict is not None, "Gradient dict is None" + assert loss_curr is not None, "Current loss is None" + assert loss_mem is not None, "Memory loss is None" + + def test_jvp_loss_gradients_on_gpu(self, rocm_config, harness_with_history): + """Test that JVP gradients are computed on GPU.""" + criterion = harness_with_history.get_criterion() + jvp_loss = JVPRegularizedLoss( + model=harness_with_history.model, + criterion=criterion, + jvp_reg=0.001, + deltax_norm=1.0, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) 
+ hist_batch = next(iter(hist_train_loader)) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Compute gradients + grad_dict, _, _ = jvp_loss(train_batch, hist_batch) + + # Check gradients exist for all parameters + for name, param in harness_with_history.model.named_parameters(): + assert name in grad_dict, f"No gradient for {name}" + assert grad_dict[name].is_cuda, f"Gradient for {name} not on GPU" + assert not torch.isnan(grad_dict[name]).any(), f"NaN in gradient for {name}" + + +class TestJVPUpdateStep: + """Tests for step_method_jvp_reg function.""" + + def test_jvp_step_runs(self, rocm_config, harness_with_history): + """Test that JVP update step executes without error.""" + criterion = harness_with_history.get_criterion() + optimizer = harness_with_history.get_optmizer() + model = harness_with_history.model + profiler = FLOPSProfiler() + + jvp_loss = JVPRegularizedLoss( + model=model, + criterion=criterion, + jvp_reg=rocm_config.continuous_learning.jvp_reg, + deltax_norm=rocm_config.continuous_learning.deltax_norm, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) + hist_batch = next(iter(hist_train_loader)) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Run update step + loss_curr, loss_mem, loss_total = step_method_jvp_reg( + model=model, + criterion=criterion, + optimizer=optimizer, + cfg=rocm_config, + iter=0, + train_batch=train_batch, + hist_batch=hist_batch, + profiler=profiler, + jvp_loss=jvp_loss, + ) + + assert loss_curr > 0, "Current loss should be positive" + assert loss_mem > 0, "Memory loss should be positive" + assert loss_total > 0, "Total loss should be positive" + + def test_jvp_step_updates_weights(self, rocm_config, 
harness_with_history): + """Test that JVP update step modifies model weights.""" + criterion = harness_with_history.get_criterion() + optimizer = harness_with_history.get_optmizer() + model = harness_with_history.model + profiler = FLOPSProfiler() + + # Get initial weights + initial_weights = { + name: param.clone().detach() for name, param in model.named_parameters() + } + + jvp_loss = JVPRegularizedLoss( + model=model, + criterion=criterion, + jvp_reg=rocm_config.continuous_learning.jvp_reg, + deltax_norm=rocm_config.continuous_learning.deltax_norm, + ) + + # Get batches + train_loader, _ = harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_batch = next(iter(train_loader)) + hist_batch = next(iter(hist_train_loader)) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + # Run update step + step_method_jvp_reg( + model=model, + criterion=criterion, + optimizer=optimizer, + cfg=rocm_config, + iter=0, + train_batch=train_batch, + hist_batch=hist_batch, + profiler=profiler, + jvp_loss=jvp_loss, + ) + + # Check weights changed + weights_changed = False + for name, param in model.named_parameters(): + if not torch.allclose(param, initial_weights[name], atol=1e-6): + weights_changed = True + break + + assert weights_changed, "No weights updated after JVP step" + + def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): + """Test that multiple JVP update steps work correctly.""" + criterion = harness_with_history.get_criterion() + optimizer = harness_with_history.get_optmizer() + model = harness_with_history.model + profiler = FLOPSProfiler() + + jvp_loss = JVPRegularizedLoss( + model=model, + criterion=criterion, + jvp_reg=rocm_config.continuous_learning.jvp_reg, + deltax_norm=rocm_config.continuous_learning.deltax_norm, + ) + + # Get loaders + train_loader, _ = 
harness_with_history.get_cur_data_loaders() + hist_train_loader, _ = harness_with_history.get_hist_data_loaders() + + train_iter = iter(train_loader) + hist_iter = iter(hist_train_loader) + + losses = [] + for i in range(5): + train_batch = next(train_iter) + hist_batch = next(hist_iter) + + train_batch = [b.to(rocm_config.device) for b in train_batch] + hist_batch = [b.to(rocm_config.device) for b in hist_batch] + + loss_curr, loss_mem, loss_total = step_method_jvp_reg( + model=model, + criterion=criterion, + optimizer=optimizer, + cfg=rocm_config, + iter=i, + train_batch=train_batch, + hist_batch=hist_batch, + profiler=profiler, + jvp_loss=jvp_loss, + ) + + losses.append(loss_total) + + # All losses should be positive + assert all(loss > 0 for loss in losses), "Some losses are not positive" diff --git a/tests/deployment/frontier/test_model_harness_rocm.py b/tests/deployment/frontier/test_model_harness_rocm.py new file mode 100644 index 0000000..c8df404 --- /dev/null +++ b/tests/deployment/frontier/test_model_harness_rocm.py @@ -0,0 +1,228 @@ +"""Tests to verify MNIST model harness works correctly with ROCm.""" + +import pytest +import torch + +from config.configuration import ( + Config, + ModelCfg, + DataCfg, + TrainCfg, + ContinualLearningCfg, + DriftDetectionCfg, +) +from examples.mnist.model import MNIST_CNN + + +@pytest.fixture +def rocm_config(): + """Create a config for ROCm/GPU testing.""" + return Config( + model=ModelCfg(name="mnist_cnn", pretrained_path=""), + data=DataCfg(name="mnist", path="./data"), + train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), + continual_learning=ContinualLearningCfg(), + drift_detection=DriftDetectionCfg(), + seed=42, + device="cuda", + multi_gpu=False, + ) + + +@pytest.fixture +def harness(rocm_config): + """Create MNIST harness and initialize data stream.""" + harness = MNIST_CNN(rocm_config) + harness.update_data_stream() + return harness + + +class TestModelLoading: + """Tests for model loading and GPU 
placement.""" + + def test_model_on_gpu(self, harness): + """Test that model is moved to GPU.""" + device = next(harness.model.parameters()).device + assert device.type == "cuda", f"Model not on GPU, found {device}" + + def test_model_device_matches_config(self, harness): + """Test that model device matches config device.""" + device = next(harness.model.parameters()).device + assert str(device).startswith("cuda") + + +class TestDataLoader: + """Tests for data loader functionality.""" + + def test_data_loaders_created(self, harness): + """Test that data loaders are created after update_data_stream.""" + train_loader, val_loader = harness.get_cur_data_loaders() + assert train_loader is not None, "Train loader is None" + assert val_loader is not None, "Val loader is None" + + def test_data_loader_batch_shape(self, harness): + """Test that data loader produces correct batch shapes.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + assert x.dim() == 3, f"Expected 3D input (B, H, W), got {x.dim()}D" + assert y.dim() == 1, f"Expected 1D labels, got {y.dim()}D" + assert x.shape[0] == y.shape[0], "Batch size mismatch between x and y" + + def test_data_moves_to_gpu(self, harness): + """Test that data can be moved to GPU.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x_gpu = x.to(harness.cfg.device) + y_gpu = y.to(harness.cfg.device) + assert x_gpu.is_cuda, "Input tensor not on GPU" + assert y_gpu.is_cuda, "Label tensor not on GPU" + + +class TestForwardPass: + """Tests for model forward pass.""" + + def test_forward_pass_runs(self, harness): + """Test that forward pass executes without error.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + + harness.model.eval() + with torch.no_grad(): + output = harness.model(x) + + assert output is not None, "Forward pass returned None" 
+ + def test_forward_pass_output_shape(self, harness): + """Test that forward pass produces correct output shape.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + + harness.model.eval() + with torch.no_grad(): + output = harness.model(x) + + assert output.shape[0] == x.shape[0], "Batch size mismatch" + assert output.shape[1] == 10, f"Expected 10 classes, got {output.shape[1]}" + + def test_forward_pass_output_on_gpu(self, harness): + """Test that forward pass output is on GPU.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, _ = batch + x = x.to(harness.cfg.device) + + harness.model.eval() + with torch.no_grad(): + output = harness.model(x) + + assert output.is_cuda, "Output tensor not on GPU" + + +class TestEval: + """Tests for harness eval method.""" + + def test_eval_runs(self, harness): + """Test that eval method executes without error.""" + metrics = harness.eval() + assert metrics is not None, "Eval returned None" + + def test_eval_returns_metrics(self, harness): + """Test that eval returns expected number of metrics.""" + metrics = harness.eval() + assert len(metrics) == len(harness.eval_metrics), ( + f"Expected {len(harness.eval_metrics)} metrics, got {len(metrics)}" + ) + + def test_eval_metrics_are_valid(self, harness): + """Test that eval metrics are valid floats.""" + metrics = harness.eval() + for i, metric in enumerate(metrics): + assert isinstance(metric, float), f"Metric {i} is not a float" + assert not torch.isnan(torch.tensor(metric)), f"Metric {i} is NaN" + + +class TestTrainingStep: + """Tests for a single training step.""" + + def test_training_step(self, harness): + """Test that a single training step executes without error.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + y = y.to(harness.cfg.device) + + harness.model.train() 
+ optimizer = harness.get_optmizer() + criterion = harness.get_criterion() + + optimizer.zero_grad() + output = harness.model(x) + loss = criterion(output, y) + loss.backward() + optimizer.step() + + assert loss.item() > 0, "Loss should be positive" + + def test_gradients_computed(self, harness): + """Test that gradients are computed during backward pass.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + y = y.to(harness.cfg.device) + + harness.model.train() + optimizer = harness.get_optmizer() + criterion = harness.get_criterion() + + optimizer.zero_grad() + output = harness.model(x) + loss = criterion(output, y) + loss.backward() + + has_grad = False + for param in harness.model.parameters(): + if param.grad is not None and param.grad.abs().sum() > 0: + has_grad = True + break + + assert has_grad, "No gradients computed" + + def test_weights_updated(self, harness): + """Test that weights are updated after optimizer step.""" + train_loader, _ = harness.get_cur_data_loaders() + batch = next(iter(train_loader)) + x, y = batch + x = x.to(harness.cfg.device) + y = y.to(harness.cfg.device) + + harness.model.train() + optimizer = harness.get_optmizer() + criterion = harness.get_criterion() + + # Get initial weights + initial_weights = { + name: param.clone() for name, param in harness.model.named_parameters() + } + + optimizer.zero_grad() + output = harness.model(x) + loss = criterion(output, y) + loss.backward() + optimizer.step() + + # Check weights changed + weights_changed = False + for name, param in harness.model.named_parameters(): + if not torch.equal(param, initial_weights[name]): + weights_changed = True + break + + assert weights_changed, "Weights not updated after optimizer step" diff --git a/tests/deployment/frontier/test_rocm_install.py b/tests/deployment/frontier/test_rocm_install.py new file mode 100644 index 0000000..3e7c7b8 --- /dev/null +++ 
b/tests/deployment/frontier/test_rocm_install.py @@ -0,0 +1,60 @@ +"""Tests to verify ROCm installation and PyTorch GPU support.""" + + +def test_torch_import(): + """Test that PyTorch can be imported.""" + import torch + + assert torch is not None, "PyTorch import failed" + + +def test_torchvision_import(): + """Test that torchvision can be imported.""" + import torchvision + + assert torchvision is not None, "torchvision import failed" + + +def test_rocm_available(): + """Test that ROCm/HIP is available through PyTorch.""" + import torch + + assert torch.cuda.is_available(), "CUDA/ROCm is not available" + + +def test_gpu_count(): + """Test that at least one GPU is detected.""" + import torch + + gpu_count = torch.cuda.device_count() + assert gpu_count > 0, f"No GPUs detected, found {gpu_count}" + + +def test_gpu_properties(): + """Test that GPU properties can be queried.""" + import torch + + assert torch.cuda.is_available(), "CUDA/ROCm not available" + for i in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(i) + assert props.name is not None + assert props.total_memory > 0 + + +def test_tensor_on_gpu(): + """Test that tensors can be created and moved to GPU.""" + import torch + + assert torch.cuda.is_available(), "CUDA/ROCm not available" + x = torch.randn(100, 100).cuda() + assert x.is_cuda, "Tensor not on GPU" + y = x @ x.T + assert y.is_cuda, "Result tensor not on GPU" + + +def test_torch_rocm_build(): + """Test that PyTorch was built with ROCm support.""" + import torch + + hip_version = getattr(torch.version, "hip", None) + assert hip_version is not None, "PyTorch not built with ROCm/HIP support" From e06402e6152196332ff532c1226bce45914931d7 Mon Sep 17 00:00:00 2001 From: "Rafael Zamora-Resendiz (AMCRD)" Date: Mon, 9 Feb 2026 12:28:20 -0500 Subject: [PATCH 04/12] Updated interfaces with JVP updater. 
--- .../frontier/test_jvp_update_rocm.py | 174 +++++++----------- 1 file changed, 68 insertions(+), 106 deletions(-) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py index a39e8c2..25880a6 100644 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -15,7 +15,6 @@ from examples.mnist.model import MNIST_CNN from training.updater.jvp_reg import JVPRegUpdater -from profilers import FLOPSProfiler @pytest.fixture @@ -26,7 +25,7 @@ def rocm_config(): data=DataCfg(name="mnist", path="./data"), train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), continual_learning=ContinualLearningCfg( - jvp_reg=0.001, deltax_norm=1.0, max_iter=5 + jvp_lambda=0.001, jvp_deltax_norm=1.0, max_iter=5 ), drift_detection=DriftDetectionCfg(), seed=42, @@ -44,28 +43,22 @@ def harness_with_history(rocm_config): return harness -class TestJVPRegularizedLoss: - """Tests for JVPRegularizedLoss module.""" +class TestJVPRegUpdater: + """Tests for JVPRegUpdater class.""" - def test_jvp_loss_creation(self, harness_with_history): - """Test that JVPRegularizedLoss can be created.""" - criterion = harness_with_history.get_criterion() - jvp_loss = JVPRegUpdater( - model=harness_with_history.model, - criterion=criterion, - jvp_reg=0.001, - deltax_norm=1.0, + def test_jvp_updater_creation(self, rocm_config, harness_with_history): + """Test that JVPRegUpdater can be created.""" + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) - assert jvp_loss is not None - - def test_jvp_loss_forward(self, rocm_config, harness_with_history): - """Test that JVPRegularizedLoss forward pass works on GPU.""" - criterion = harness_with_history.get_criterion() - jvp_loss = JVPRegularizedLoss( - model=harness_with_history.model, - criterion=criterion, - jvp_reg=0.001, - deltax_norm=1.0, + assert jvp_updater is not None + + def 
test_jvp_updater_forward_backward(self, rocm_config, harness_with_history): + """Test that JVPRegUpdater forward-backward pass works on GPU.""" + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -76,24 +69,23 @@ def test_jvp_loss_forward(self, rocm_config, harness_with_history): hist_batch = next(iter(hist_train_loader)) # Move to device - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - # Forward pass - grad_dict, loss_curr, loss_mem = jvp_loss(train_batch, hist_batch) + # Run forward-backward pass + jvp_updater.update_pre_fwd_bwd() + loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) + loss_mem = jvp_updater.update_post_fwd_bwd() - assert grad_dict is not None, "Gradient dict is None" assert loss_curr is not None, "Current loss is None" assert loss_mem is not None, "Memory loss is None" + assert loss_curr > 0, "Current loss should be positive" - def test_jvp_loss_gradients_on_gpu(self, rocm_config, harness_with_history): + def test_jvp_gradients_on_gpu(self, rocm_config, harness_with_history): """Test that JVP gradients are computed on GPU.""" - criterion = harness_with_history.get_criterion() - jvp_loss = JVPRegularizedLoss( - model=harness_with_history.model, - criterion=criterion, - jvp_reg=0.001, - deltax_norm=1.0, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -103,34 +95,31 @@ def test_jvp_loss_gradients_on_gpu(self, rocm_config, harness_with_history): train_batch = next(iter(train_loader)) hist_batch = next(iter(hist_train_loader)) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + 
hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) # Compute gradients - grad_dict, _, _ = jvp_loss(train_batch, hist_batch) + jvp_updater.update_pre_fwd_bwd() + jvp_updater.fwd_bwd(train_batch, hist_batch) + jvp_updater.update_post_fwd_bwd() # Check gradients exist for all parameters for name, param in harness_with_history.model.named_parameters(): - assert name in grad_dict, f"No gradient for {name}" - assert grad_dict[name].is_cuda, f"Gradient for {name} not on GPU" - assert not torch.isnan(grad_dict[name]).any(), f"NaN in gradient for {name}" + assert param.grad is not None, f"No gradient for {name}" + assert param.grad.is_cuda, f"Gradient for {name} not on GPU" + assert not torch.isnan(param.grad).any(), f"NaN in gradient for {name}" class TestJVPUpdateStep: - """Tests for step_method_jvp_reg function.""" + """Tests for JVP update step with optimizer.""" def test_jvp_step_runs(self, rocm_config, harness_with_history): """Test that JVP update step executes without error.""" - criterion = harness_with_history.get_criterion() optimizer = harness_with_history.get_optmizer() - model = harness_with_history.model - profiler = FLOPSProfiler() - jvp_loss = JVPRegularizedLoss( - model=model, - criterion=criterion, - jvp_reg=rocm_config.continuous_learning.jvp_reg, - deltax_norm=rocm_config.continuous_learning.deltax_norm, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -140,43 +129,32 @@ def test_jvp_step_runs(self, rocm_config, harness_with_history): train_batch = next(iter(train_loader)) hist_batch = next(iter(hist_train_loader)) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) # Run update step - loss_curr, loss_mem, loss_total = step_method_jvp_reg( - model=model, - criterion=criterion, 
- optimizer=optimizer, - cfg=rocm_config, - iter=0, - train_batch=train_batch, - hist_batch=hist_batch, - profiler=profiler, - jvp_loss=jvp_loss, - ) + optimizer.zero_grad() + jvp_updater.update_pre_fwd_bwd() + loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) + loss_mem = jvp_updater.update_post_fwd_bwd() + optimizer.step() assert loss_curr > 0, "Current loss should be positive" - assert loss_mem > 0, "Memory loss should be positive" - assert loss_total > 0, "Total loss should be positive" + assert loss_mem >= 0, "Memory loss should be non-negative" def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): """Test that JVP update step modifies model weights.""" - criterion = harness_with_history.get_criterion() optimizer = harness_with_history.get_optmizer() model = harness_with_history.model - profiler = FLOPSProfiler() # Get initial weights initial_weights = { name: param.clone().detach() for name, param in model.named_parameters() } - jvp_loss = JVPRegularizedLoss( - model=model, - criterion=criterion, - jvp_reg=rocm_config.continuous_learning.jvp_reg, - deltax_norm=rocm_config.continuous_learning.deltax_norm, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get batches @@ -186,21 +164,15 @@ def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): train_batch = next(iter(train_loader)) hist_batch = next(iter(hist_train_loader)) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) # Run update step - step_method_jvp_reg( - model=model, - criterion=criterion, - optimizer=optimizer, - cfg=rocm_config, - iter=0, - train_batch=train_batch, - hist_batch=hist_batch, - profiler=profiler, - jvp_loss=jvp_loss, - ) + optimizer.zero_grad() + jvp_updater.update_pre_fwd_bwd() + 
jvp_updater.fwd_bwd(train_batch, hist_batch) + jvp_updater.update_post_fwd_bwd() + optimizer.step() # Check weights changed weights_changed = False @@ -213,16 +185,11 @@ def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): """Test that multiple JVP update steps work correctly.""" - criterion = harness_with_history.get_criterion() optimizer = harness_with_history.get_optmizer() - model = harness_with_history.model - profiler = FLOPSProfiler() - jvp_loss = JVPRegularizedLoss( - model=model, - criterion=criterion, - jvp_reg=rocm_config.continuous_learning.jvp_reg, - deltax_norm=rocm_config.continuous_learning.deltax_norm, + jvp_updater = JVPRegUpdater( + cfg=rocm_config, + modelHarness=harness_with_history, ) # Get loaders @@ -237,21 +204,16 @@ def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): train_batch = next(train_iter) hist_batch = next(hist_iter) - train_batch = [b.to(rocm_config.device) for b in train_batch] - hist_batch = [b.to(rocm_config.device) for b in hist_batch] - - loss_curr, loss_mem, loss_total = step_method_jvp_reg( - model=model, - criterion=criterion, - optimizer=optimizer, - cfg=rocm_config, - iter=i, - train_batch=train_batch, - hist_batch=hist_batch, - profiler=profiler, - jvp_loss=jvp_loss, - ) + train_batch = tuple(b.to(rocm_config.device) for b in train_batch) + hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) + + optimizer.zero_grad() + jvp_updater.update_pre_fwd_bwd() + loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) + loss_mem = jvp_updater.update_post_fwd_bwd() + optimizer.step() + loss_total = loss_curr + loss_mem losses.append(loss_total) # All losses should be positive From 4e07151c5e79af639ccf6b07b0eb009f942e62aa Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:39:55 -0500 Subject: [PATCH 05/12] Fixed config parameters. 
--- tests/deployment/frontier/test_jvp_update_rocm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py index 25880a6..245c95a 100644 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -25,7 +25,7 @@ def rocm_config(): data=DataCfg(name="mnist", path="./data"), train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), continual_learning=ContinualLearningCfg( - jvp_lambda=0.001, jvp_deltax_norm=1.0, max_iter=5 + jvp_lambda=0.001, jvp_deltax_norm=1.0, ), drift_detection=DriftDetectionCfg(), seed=42, From e6d140a2db9bf8827e6a3dd1b321f789b568d432 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 12:50:52 -0500 Subject: [PATCH 06/12] Added deployment readme. --- src/deployment/frontier/README.md | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 src/deployment/frontier/README.md diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md new file mode 100644 index 0000000..9c9b12a --- /dev/null +++ b/src/deployment/frontier/README.md @@ -0,0 +1,44 @@ +# Deployment + +## OLCF's Frontier + +### Setup +First, create a local virtual environment in your scratch directory and clone the repo: + +```bash +cd $MEMBERWORK # User scratch space +module load python # Load stable python +python -m venv my_env # Create a virtual environment +source ./my_env/bin/activate +pip install poetry +git clone https://github.com/AI-ModCon/BaseSim_Framework.git +``` + +To install dependencies and torch libraries with ROCM support (6.4.2), run from the project root: + +```bash +cd ./BaseSim_Framework +source ./src/deployment/frontier/install_rocm.sh +``` + +### Testing Setup + +> Note: Testing model harness and jvp update requires MNIST dataset download on first run. 
+> Download the dataset before submitting the run using: + +```bash +poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" +``` + +Prior to running experiments, test ROCM support from the project root: +> Pass project account via PROJECT_ACCOUNT +```bash +SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/test_rocm_install.sbatch +``` + +### Submit Job +If test pass, its safe to submit run from project root: + +```bash +SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/mnist_example.sbatch +``` From dbff5fc3667bbe2d5eae02613236f5d1eb6fae54 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Mon, 9 Feb 2026 13:03:29 -0500 Subject: [PATCH 07/12] Passes ruff and mypy. --- tests/deployment/frontier/test_jvp_update_rocm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py index 245c95a..f45da2f 100644 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ b/tests/deployment/frontier/test_jvp_update_rocm.py @@ -25,7 +25,8 @@ def rocm_config(): data=DataCfg(name="mnist", path="./data"), train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), continual_learning=ContinualLearningCfg( - jvp_lambda=0.001, jvp_deltax_norm=1.0, + jvp_lambda=0.001, + jvp_deltax_norm=1.0, ), drift_detection=DriftDetectionCfg(), seed=42, From 2450b0a89f0c17b1f28cd755560bf53a921882c2 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Wed, 18 Feb 2026 14:30:00 -0500 Subject: [PATCH 08/12] Reduced tests to minimal Rocm support checks. 
--- src/deployment/frontier/README.md | 20 +- src/deployment/frontier/mnist_example.sbatch | 7 +- .../frontier/test_rocm_install.sbatch | 40 --- .../frontier/test_jvp_update_rocm.py | 221 ----------------- .../frontier/test_model_harness_rocm.py | 228 ------------------ .../deployment/frontier/test_rocm_install.py | 60 ----- tests/test_rocm.py | 138 +++++++++++ 7 files changed, 152 insertions(+), 562 deletions(-) delete mode 100644 src/deployment/frontier/test_rocm_install.sbatch delete mode 100644 tests/deployment/frontier/test_jvp_update_rocm.py delete mode 100644 tests/deployment/frontier/test_model_harness_rocm.py delete mode 100644 tests/deployment/frontier/test_rocm_install.py create mode 100644 tests/test_rocm.py diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md index 9c9b12a..9f8147d 100644 --- a/src/deployment/frontier/README.md +++ b/src/deployment/frontier/README.md @@ -21,23 +21,23 @@ cd ./BaseSim_Framework source ./src/deployment/frontier/install_rocm.sh ``` -### Testing Setup +Prior to running experiments, test ROCM support from the project root: +> Pass project account via PROJECT_ACCOUNT +```bash +poetry run pytest tests/test_rocm.py +``` + + +### Submit Job -> Note: Testing model harness and jvp update requires MNIST dataset download on first run. +> Note: Requires MNIST dataset download on first run. 
> Download the dataset before submitting the run using: ```bash poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" ``` -Prior to running experiments, test ROCM support from the project root: -> Pass project account via PROJECT_ACCOUNT -```bash -SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/test_rocm_install.sbatch -``` - -### Submit Job -If test pass, its safe to submit run from project root: +Submit run from project root: ```bash SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/mnist_example.sbatch diff --git a/src/deployment/frontier/mnist_example.sbatch b/src/deployment/frontier/mnist_example.sbatch index bf175b4..358d00f 100644 --- a/src/deployment/frontier/mnist_example.sbatch +++ b/src/deployment/frontier/mnist_example.sbatch @@ -14,9 +14,10 @@ module load gcc/12.2.0 module load rocm/6.4.2 # ROCm/MIOpen flags -mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen -export MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen -export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen +ACCOUNT=$(sacct -j $SLURM_JOB_ID --format=Account --noheader | head -1 | tr -d ' ') +mkdir -p $MEMBERWORK/$ACCOUNT/miopen +export MIOPEN_USER_DB_PATH=$MEMBERWORK/$ACCOUNT/miopen +export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$ACCOUNT/miopen export WANDB_MODE=offline # Print environment info diff --git a/src/deployment/frontier/test_rocm_install.sbatch b/src/deployment/frontier/test_rocm_install.sbatch deleted file mode 100644 index 50c8f0c..0000000 --- a/src/deployment/frontier/test_rocm_install.sbatch +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -l -#SBATCH -J test_rocm_install -#SBATCH -t 0:10:00 -#SBATCH -N 1 -#SBATCH --exclusive -#SBATCH --ntasks-per-node=1 -#SBATCH --gpus-per-node=1 -#SBATCH -o output/test_rocm_install.o%j -#SBATCH -e output/test_rocm_install.e%j - -# Load required modules -module load PrgEnv-gnu -module load gcc/12.2.0 -module load rocm/6.4.2 - -# ROCm/MIOpen flags -mkdir -p $MEMBERWORK/$SBATCH_ACCOUNT/miopen -export 
MIOPEN_USER_DB_PATH=$MEMBERWORK/$SBATCH_ACCOUNT/miopen -export MIOPEN_CUSTOM_CACHE_DIR=$MEMBERWORK/$SBATCH_ACCOUNT/miopen - -# WANDBD set to offline; Need to set up proxy connections. -export WANDB_MODE=offline - -# Print environment info -echo "==============================================" -echo "ROCm Installation & Harness Test" -echo "==============================================" -echo "Date: $(date)" -echo "Hostname: $(hostname)" -echo "ROCM_PATH: ${ROCM_PATH}" -echo "==============================================" - -# Test torch rocm compatability -poetry run pytest tests/deployment/frontier/test_rocm_install.py -v - -# Test model harness (loaders, model, etc) -poetry run pytest tests/deployment/frontier/test_model_harness_rocm.py -v - -# Test jvp update -poetry run pytest tests/deployment/frontier/test_jvp_update_rocm.py -v diff --git a/tests/deployment/frontier/test_jvp_update_rocm.py b/tests/deployment/frontier/test_jvp_update_rocm.py deleted file mode 100644 index f45da2f..0000000 --- a/tests/deployment/frontier/test_jvp_update_rocm.py +++ /dev/null @@ -1,221 +0,0 @@ -"""Tests to verify JVP regularized update works correctly with ROCm.""" - -import pytest -import torch - -from config.configuration import ( - Config, - ModelCfg, - DataCfg, - TrainCfg, - ContinualLearningCfg, - DriftDetectionCfg, -) - -from examples.mnist.model import MNIST_CNN - -from training.updater.jvp_reg import JVPRegUpdater - - -@pytest.fixture -def rocm_config(): - """Create a config for ROCm/GPU testing.""" - return Config( - model=ModelCfg(name="mnist_cnn", pretrained_path=""), - data=DataCfg(name="mnist", path="./data"), - train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), - continual_learning=ContinualLearningCfg( - jvp_lambda=0.001, - jvp_deltax_norm=1.0, - ), - drift_detection=DriftDetectionCfg(), - seed=42, - device="cuda", - multi_gpu=False, - ) - - -@pytest.fixture -def harness_with_history(rocm_config): - """Create MNIST harness with historical data.""" - 
harness = MNIST_CNN(rocm_config) - harness.update_data_stream() # First stream - harness.update_data_stream() # Second stream (creates history) - return harness - - -class TestJVPRegUpdater: - """Tests for JVPRegUpdater class.""" - - def test_jvp_updater_creation(self, rocm_config, harness_with_history): - """Test that JVPRegUpdater can be created.""" - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - assert jvp_updater is not None - - def test_jvp_updater_forward_backward(self, rocm_config, harness_with_history): - """Test that JVPRegUpdater forward-backward pass works on GPU.""" - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - # Move to device - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Run forward-backward pass - jvp_updater.update_pre_fwd_bwd() - loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) - loss_mem = jvp_updater.update_post_fwd_bwd() - - assert loss_curr is not None, "Current loss is None" - assert loss_mem is not None, "Memory loss is None" - assert loss_curr > 0, "Current loss should be positive" - - def test_jvp_gradients_on_gpu(self, rocm_config, harness_with_history): - """Test that JVP gradients are computed on GPU.""" - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - 
hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Compute gradients - jvp_updater.update_pre_fwd_bwd() - jvp_updater.fwd_bwd(train_batch, hist_batch) - jvp_updater.update_post_fwd_bwd() - - # Check gradients exist for all parameters - for name, param in harness_with_history.model.named_parameters(): - assert param.grad is not None, f"No gradient for {name}" - assert param.grad.is_cuda, f"Gradient for {name} not on GPU" - assert not torch.isnan(param.grad).any(), f"NaN in gradient for {name}" - - -class TestJVPUpdateStep: - """Tests for JVP update step with optimizer.""" - - def test_jvp_step_runs(self, rocm_config, harness_with_history): - """Test that JVP update step executes without error.""" - optimizer = harness_with_history.get_optmizer() - - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Run update step - optimizer.zero_grad() - jvp_updater.update_pre_fwd_bwd() - loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) - loss_mem = jvp_updater.update_post_fwd_bwd() - optimizer.step() - - assert loss_curr > 0, "Current loss should be positive" - assert loss_mem >= 0, "Memory loss should be non-negative" - - def test_jvp_step_updates_weights(self, rocm_config, harness_with_history): - """Test that JVP update step modifies model weights.""" - optimizer = harness_with_history.get_optmizer() - model = harness_with_history.model - - # Get initial weights - initial_weights = { - name: param.clone().detach() for name, param in model.named_parameters() - } - - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - 
modelHarness=harness_with_history, - ) - - # Get batches - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_batch = next(iter(train_loader)) - hist_batch = next(iter(hist_train_loader)) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - # Run update step - optimizer.zero_grad() - jvp_updater.update_pre_fwd_bwd() - jvp_updater.fwd_bwd(train_batch, hist_batch) - jvp_updater.update_post_fwd_bwd() - optimizer.step() - - # Check weights changed - weights_changed = False - for name, param in model.named_parameters(): - if not torch.allclose(param, initial_weights[name], atol=1e-6): - weights_changed = True - break - - assert weights_changed, "No weights updated after JVP step" - - def test_jvp_step_multiple_iterations(self, rocm_config, harness_with_history): - """Test that multiple JVP update steps work correctly.""" - optimizer = harness_with_history.get_optmizer() - - jvp_updater = JVPRegUpdater( - cfg=rocm_config, - modelHarness=harness_with_history, - ) - - # Get loaders - train_loader, _ = harness_with_history.get_cur_data_loaders() - hist_train_loader, _ = harness_with_history.get_hist_data_loaders() - - train_iter = iter(train_loader) - hist_iter = iter(hist_train_loader) - - losses = [] - for i in range(5): - train_batch = next(train_iter) - hist_batch = next(hist_iter) - - train_batch = tuple(b.to(rocm_config.device) for b in train_batch) - hist_batch = tuple(b.to(rocm_config.device) for b in hist_batch) - - optimizer.zero_grad() - jvp_updater.update_pre_fwd_bwd() - loss_curr = jvp_updater.fwd_bwd(train_batch, hist_batch) - loss_mem = jvp_updater.update_post_fwd_bwd() - optimizer.step() - - loss_total = loss_curr + loss_mem - losses.append(loss_total) - - # All losses should be positive - assert all(loss > 0 for loss in losses), "Some losses are not positive" diff --git 
a/tests/deployment/frontier/test_model_harness_rocm.py b/tests/deployment/frontier/test_model_harness_rocm.py deleted file mode 100644 index c8df404..0000000 --- a/tests/deployment/frontier/test_model_harness_rocm.py +++ /dev/null @@ -1,228 +0,0 @@ -"""Tests to verify MNIST model harness works correctly with ROCm.""" - -import pytest -import torch - -from config.configuration import ( - Config, - ModelCfg, - DataCfg, - TrainCfg, - ContinualLearningCfg, - DriftDetectionCfg, -) -from examples.mnist.model import MNIST_CNN - - -@pytest.fixture -def rocm_config(): - """Create a config for ROCm/GPU testing.""" - return Config( - model=ModelCfg(name="mnist_cnn", pretrained_path=""), - data=DataCfg(name="mnist", path="./data"), - train=TrainCfg(batch_size=32, num_workers=0, init_lr=0.001), - continual_learning=ContinualLearningCfg(), - drift_detection=DriftDetectionCfg(), - seed=42, - device="cuda", - multi_gpu=False, - ) - - -@pytest.fixture -def harness(rocm_config): - """Create MNIST harness and initialize data stream.""" - harness = MNIST_CNN(rocm_config) - harness.update_data_stream() - return harness - - -class TestModelLoading: - """Tests for model loading and GPU placement.""" - - def test_model_on_gpu(self, harness): - """Test that model is moved to GPU.""" - device = next(harness.model.parameters()).device - assert device.type == "cuda", f"Model not on GPU, found {device}" - - def test_model_device_matches_config(self, harness): - """Test that model device matches config device.""" - device = next(harness.model.parameters()).device - assert str(device).startswith("cuda") - - -class TestDataLoader: - """Tests for data loader functionality.""" - - def test_data_loaders_created(self, harness): - """Test that data loaders are created after update_data_stream.""" - train_loader, val_loader = harness.get_cur_data_loaders() - assert train_loader is not None, "Train loader is None" - assert val_loader is not None, "Val loader is None" - - def 
test_data_loader_batch_shape(self, harness): - """Test that data loader produces correct batch shapes.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - assert x.dim() == 3, f"Expected 3D input (B, H, W), got {x.dim()}D" - assert y.dim() == 1, f"Expected 1D labels, got {y.dim()}D" - assert x.shape[0] == y.shape[0], "Batch size mismatch between x and y" - - def test_data_moves_to_gpu(self, harness): - """Test that data can be moved to GPU.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x_gpu = x.to(harness.cfg.device) - y_gpu = y.to(harness.cfg.device) - assert x_gpu.is_cuda, "Input tensor not on GPU" - assert y_gpu.is_cuda, "Label tensor not on GPU" - - -class TestForwardPass: - """Tests for model forward pass.""" - - def test_forward_pass_runs(self, harness): - """Test that forward pass executes without error.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - - harness.model.eval() - with torch.no_grad(): - output = harness.model(x) - - assert output is not None, "Forward pass returned None" - - def test_forward_pass_output_shape(self, harness): - """Test that forward pass produces correct output shape.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - - harness.model.eval() - with torch.no_grad(): - output = harness.model(x) - - assert output.shape[0] == x.shape[0], "Batch size mismatch" - assert output.shape[1] == 10, f"Expected 10 classes, got {output.shape[1]}" - - def test_forward_pass_output_on_gpu(self, harness): - """Test that forward pass output is on GPU.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, _ = batch - x = x.to(harness.cfg.device) - - harness.model.eval() - with torch.no_grad(): - output = harness.model(x) - - 
assert output.is_cuda, "Output tensor not on GPU" - - -class TestEval: - """Tests for harness eval method.""" - - def test_eval_runs(self, harness): - """Test that eval method executes without error.""" - metrics = harness.eval() - assert metrics is not None, "Eval returned None" - - def test_eval_returns_metrics(self, harness): - """Test that eval returns expected number of metrics.""" - metrics = harness.eval() - assert len(metrics) == len(harness.eval_metrics), ( - f"Expected {len(harness.eval_metrics)} metrics, got {len(metrics)}" - ) - - def test_eval_metrics_are_valid(self, harness): - """Test that eval metrics are valid floats.""" - metrics = harness.eval() - for i, metric in enumerate(metrics): - assert isinstance(metric, float), f"Metric {i} is not a float" - assert not torch.isnan(torch.tensor(metric)), f"Metric {i} is NaN" - - -class TestTrainingStep: - """Tests for a single training step.""" - - def test_training_step(self, harness): - """Test that a single training step executes without error.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - y = y.to(harness.cfg.device) - - harness.model.train() - optimizer = harness.get_optmizer() - criterion = harness.get_criterion() - - optimizer.zero_grad() - output = harness.model(x) - loss = criterion(output, y) - loss.backward() - optimizer.step() - - assert loss.item() > 0, "Loss should be positive" - - def test_gradients_computed(self, harness): - """Test that gradients are computed during backward pass.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - y = y.to(harness.cfg.device) - - harness.model.train() - optimizer = harness.get_optmizer() - criterion = harness.get_criterion() - - optimizer.zero_grad() - output = harness.model(x) - loss = criterion(output, y) - loss.backward() - - has_grad = False - for param in 
harness.model.parameters(): - if param.grad is not None and param.grad.abs().sum() > 0: - has_grad = True - break - - assert has_grad, "No gradients computed" - - def test_weights_updated(self, harness): - """Test that weights are updated after optimizer step.""" - train_loader, _ = harness.get_cur_data_loaders() - batch = next(iter(train_loader)) - x, y = batch - x = x.to(harness.cfg.device) - y = y.to(harness.cfg.device) - - harness.model.train() - optimizer = harness.get_optmizer() - criterion = harness.get_criterion() - - # Get initial weights - initial_weights = { - name: param.clone() for name, param in harness.model.named_parameters() - } - - optimizer.zero_grad() - output = harness.model(x) - loss = criterion(output, y) - loss.backward() - optimizer.step() - - # Check weights changed - weights_changed = False - for name, param in harness.model.named_parameters(): - if not torch.equal(param, initial_weights[name]): - weights_changed = True - break - - assert weights_changed, "Weights not updated after optimizer step" diff --git a/tests/deployment/frontier/test_rocm_install.py b/tests/deployment/frontier/test_rocm_install.py deleted file mode 100644 index 3e7c7b8..0000000 --- a/tests/deployment/frontier/test_rocm_install.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Tests to verify ROCm installation and PyTorch GPU support.""" - - -def test_torch_import(): - """Test that PyTorch can be imported.""" - import torch - - assert torch is not None, "PyTorch import failed" - - -def test_torchvision_import(): - """Test that torchvision can be imported.""" - import torchvision - - assert torchvision is not None, "torchvision import failed" - - -def test_rocm_available(): - """Test that ROCm/HIP is available through PyTorch.""" - import torch - - assert torch.cuda.is_available(), "CUDA/ROCm is not available" - - -def test_gpu_count(): - """Test that at least one GPU is detected.""" - import torch - - gpu_count = torch.cuda.device_count() - assert gpu_count > 0, f"No GPUs 
detected, found {gpu_count}" - - -def test_gpu_properties(): - """Test that GPU properties can be queried.""" - import torch - - assert torch.cuda.is_available(), "CUDA/ROCm not available" - for i in range(torch.cuda.device_count()): - props = torch.cuda.get_device_properties(i) - assert props.name is not None - assert props.total_memory > 0 - - -def test_tensor_on_gpu(): - """Test that tensors can be created and moved to GPU.""" - import torch - - assert torch.cuda.is_available(), "CUDA/ROCm not available" - x = torch.randn(100, 100).cuda() - assert x.is_cuda, "Tensor not on GPU" - y = x @ x.T - assert y.is_cuda, "Result tensor not on GPU" - - -def test_torch_rocm_build(): - """Test that PyTorch was built with ROCm support.""" - import torch - - hip_version = getattr(torch.version, "hip", None) - assert hip_version is not None, "PyTorch not built with ROCm/HIP support" diff --git a/tests/test_rocm.py b/tests/test_rocm.py new file mode 100644 index 0000000..c15f931 --- /dev/null +++ b/tests/test_rocm.py @@ -0,0 +1,138 @@ +"""Tests to verify ROCm/CUDA installation and PyTorch GPU support.""" + +import pytest +import torch + +requires_gpu = pytest.mark.skipif( + not torch.cuda.is_available(), reason="No CUDA/ROCm GPU available" +) +requires_rocm = pytest.mark.skipif( + getattr(torch.version, "hip", None) is None, reason="ROCm/HIP not available" +) + + +@pytest.fixture +def gpu_device(): + """Return the GPU device string.""" + return "cuda" + + +def test_torch_import(): + """Test that PyTorch can be imported.""" + assert torch is not None, "PyTorch import failed" + + +def test_torchvision_import(): + """Test that torchvision can be imported.""" + import torchvision + + assert torchvision is not None, "torchvision import failed" + + +@requires_gpu +def test_rocm_available(): + """Test that CUDA/ROCm is available through PyTorch.""" + assert torch.cuda.is_available(), "CUDA/ROCm is not available" + + +@requires_gpu +def test_gpu_count(): + """Test that at least one GPU 
is detected.""" + gpu_count = torch.cuda.device_count() + assert gpu_count > 0, f"No GPUs detected, found {gpu_count}" + + +@requires_gpu +def test_gpu_properties(): + """Test that GPU properties can be queried.""" + for i in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(i) + assert props.name is not None + assert props.total_memory > 0 + + +@requires_gpu +def test_tensor_on_gpu(): + """Test that tensors can be created and moved to GPU.""" + x = torch.randn(100, 100).cuda() + assert x.is_cuda, "Tensor not on GPU" + y = x @ x.T + assert y.is_cuda, "Result tensor not on GPU" + + +@requires_gpu +def test_tensor_to_device(gpu_device): + """Test that tensor.to(device) works (used by harness and JVP tests).""" + x = torch.randn(32, 10) + x_gpu = x.to(gpu_device) + assert x_gpu.is_cuda, "tensor.to(device) failed" + assert x_gpu.shape == x.shape, "Shape changed after .to()" + + +@requires_gpu +def test_autograd_on_gpu(gpu_device): + """Test that backward pass and gradient computation work on GPU.""" + x = torch.randn(16, 4, device=gpu_device, requires_grad=True) + w = torch.randn(4, 2, device=gpu_device, requires_grad=True) + loss = (x @ w).sum() + loss.backward() + assert w.grad is not None, "Gradients not computed" + assert w.grad.is_cuda, "Gradients not on GPU" + assert w.grad.shape == w.shape, "Gradient shape mismatch" + + +@requires_gpu +def test_optimizer_step_on_gpu(gpu_device): + """Test that optimizer zero_grad/step work on GPU parameters.""" + param = torch.nn.Parameter(torch.randn(4, 4, device=gpu_device)) + optimizer = torch.optim.SGD([param], lr=0.1) + + initial = param.clone().detach() + loss = param.sum() + optimizer.zero_grad() + loss.backward() + optimizer.step() + + assert not torch.equal(param, initial), "Weights not updated after step" + + +@requires_gpu +def test_no_grad_context(gpu_device): + """Test that torch.no_grad() inference mode works on GPU.""" + w = torch.randn(4, 4, device=gpu_device, requires_grad=True) + with 
torch.no_grad(): + y = w @ w.T + assert y.is_cuda, "Output not on GPU" + assert not y.requires_grad, "Output should not require grad inside no_grad" + + +@requires_gpu +def test_tensor_clone_detach(gpu_device): + """Test that clone/detach work on GPU tensors (used for weight snapshots).""" + x = torch.randn(4, 4, device=gpu_device, requires_grad=True) + y = x.clone().detach() + assert y.is_cuda, "Cloned tensor not on GPU" + assert not y.requires_grad, "Detached tensor should not require grad" + assert torch.equal(x, y), "Cloned tensor values differ" + + +@requires_gpu +def test_torch_comparison_ops(gpu_device): + """Test torch.equal, torch.allclose, and torch.isnan on GPU tensors.""" + a = torch.tensor([1.0, 2.0, 3.0], device=gpu_device) + b = a.clone() + + assert torch.equal(a, b), "torch.equal failed on identical GPU tensors" + assert torch.allclose(a, b, atol=1e-6), "torch.allclose failed" + + c = torch.tensor([1.0, float("nan"), 3.0], device=gpu_device) + nan_mask = torch.isnan(c) + assert nan_mask[1].item(), "torch.isnan failed to detect NaN on GPU" + assert not nan_mask[0].item(), "torch.isnan false positive on GPU" + + +@requires_rocm +def test_torch_rocm_build(): + """Test that PyTorch was built with ROCm support.""" + hip_version = getattr(torch.version, "hip", None) + assert hip_version is not None, "PyTorch not built with ROCm/HIP support" From 5287f9723e39ef4abcd61a6a4610088afafafa8f Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz <15003285+rz4@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:41:27 -0500 Subject: [PATCH 09/12] Update README.md --- src/deployment/frontier/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md index 9f8147d..cffa812 100644 --- a/src/deployment/frontier/README.md +++ b/src/deployment/frontier/README.md @@ -22,7 +22,6 @@ source ./src/deployment/frontier/install_rocm.sh ``` Prior to running experiments, test ROCM support 
from the project root: -> Pass project account via PROJECT_ACCOUNT ```bash poetry run pytest tests/test_rocm.py ``` @@ -40,5 +39,5 @@ poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist Submit run from project root: ```bash -SLURM_ACCOUNT=lrnxxx sbatch src/deployment/frontier/mnist_example.sbatch +sbatch -A lrnxxx src/deployment/frontier/mnist_example.sbatch ``` From 459781beb608e5aa7da6ec41bec956da106ac62b Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Tue, 17 Mar 2026 13:17:40 -0400 Subject: [PATCH 10/12] Standardized virtual environment install. Follows same deployment as Perlmutter w/ additional ROCM dependencies. --- .../frontier/{install_rocm.sh => install_venv.sh} | 9 +++++++-- src/deployment/frontier/mnist_example.sbatch | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) rename src/deployment/frontier/{install_rocm.sh => install_venv.sh} (50%) diff --git a/src/deployment/frontier/install_rocm.sh b/src/deployment/frontier/install_venv.sh similarity index 50% rename from src/deployment/frontier/install_rocm.sh rename to src/deployment/frontier/install_venv.sh index 7a9ec78..b6b3a87 100644 --- a/src/deployment/frontier/install_rocm.sh +++ b/src/deployment/frontier/install_venv.sh @@ -1,11 +1,16 @@ #!/bin/bash module load PrgEnv-gnu +module load python/3.13.0 module load gcc/12.2.0 module load rocm/6.4.2 -poetry lock -poetry install +python -m venv .venv # Create a virtual environment +source ./.venv/bin/activate # Activate environment +pip install poetry # Install poetry +poetry lock # Sync poetry +poetry install --no-cache # Install project dependencies + poetry run pip install --force-reinstall \ torch==2.9.1+rocm6.4 \ torchvision==0.24.1+rocm6.4 \ diff --git a/src/deployment/frontier/mnist_example.sbatch b/src/deployment/frontier/mnist_example.sbatch index 358d00f..b50c9d0 100644 --- a/src/deployment/frontier/mnist_example.sbatch +++ b/src/deployment/frontier/mnist_example.sbatch @@ -10,9 +10,13 @@ # Load required
modules module load PrgEnv-gnu +module load python/3.13.0 module load gcc/12.2.0 module load rocm/6.4.2 +# Activate project virtual environment +source ./.venv/bin/activate + # ROCm/MIOpen flags ACCOUNT=$(sacct -j $SLURM_JOB_ID --format=Account --noheader | head -1 | tr -d ' ') mkdir -p $MEMBERWORK/$ACCOUNT/miopen @@ -30,4 +34,4 @@ echo "ROCM_PATH: ${ROCM_PATH}" echo "==============================================" # Run example -poetry run python -m src.main --config ./examples/mnist/mnist.toml +python -m src.main --config ./examples/mnist/mnist.toml From f6ba24f41480745806c9fabb2cead0d464f562dc Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Tue, 17 Mar 2026 13:21:02 -0400 Subject: [PATCH 11/12] Added link to deployment doc. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 9b28e0b..712cdd7 100644 --- a/README.md +++ b/README.md @@ -70,3 +70,9 @@ poetry run pytest ## Output Training logs report the task id, training/test accuracy, and replay-memory accuracy every five epochs. Accuracy is computed via `test(...)` on both the current task and the accumulated memory set. + +## Deployment + +Platform-specific deployment guides: + +- [OLCF Frontier](./src/deployment/frontier/README.md) From f2300de08880d7530dea8b442542a81299b74fa3 Mon Sep 17 00:00:00 2001 From: Rafael Zamora-Resendiz Date: Tue, 17 Mar 2026 13:28:41 -0400 Subject: [PATCH 12/12] Updated deployment README.
--- src/deployment/frontier/README.md | 53 ++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/deployment/frontier/README.md b/src/deployment/frontier/README.md index cffa812..c95a1ad 100644 --- a/src/deployment/frontier/README.md +++ b/src/deployment/frontier/README.md @@ -1,24 +1,36 @@ # Deployment -## OLCF's Frontier +## OLCF Frontier ### Setup -First, create a local virtual enviroment in scratch directory and clone repo: + +Clone the repo into your scratch directory and run the install script: ```bash -cd $MEMBERWORK # User scratch space -module load python # Load stable python -python -m venv my_env # Create a virtual environment -source ./my_env/bin/activate -pip install poetry +cd $MEMBERWORK git clone https://github.com/AI-ModCon/BaseSim_Framework.git +cd BaseSim_Framework +source ./src/deployment/frontier/install_venv.sh ``` -To install dependencies and torch libraries with ROCM support (6.4.2), run from the project root: +`install_venv.sh` creates a virtual environment, installs Poetry, and uses it to resolve and install project dependencies. The environment is saved to `.venv` in the project root.
The script runs the following: ```bash -cd ./BaseSim_Framework -source ./src/deployment/frontier/install_rocm.sh +module load PrgEnv-gnu +module load python/3.13.0 +module load gcc/12.2.0 +module load rocm/6.4.2 + +python -m venv .venv # Create a virtual environment +source ./.venv/bin/activate # Activate environment +pip install poetry # Install poetry +poetry lock # Sync poetry +poetry install --no-cache # Install project dependencies + +poetry run pip install --force-reinstall \ torch==2.9.1+rocm6.4 \ torchvision==0.24.1+rocm6.4 \ --index-url https://download.pytorch.org/whl/rocm6.4 ``` Prior to running experiments, test ROCM support from the project root: @@ -26,18 +38,23 @@ Prior to running experiments, test ROCM support from the project root: poetry run pytest tests/test_rocm.py ``` +### Submitting a Job + +> **Note:** The MNIST example requires the dataset, which is downloaded on first run. Download it before submitting a batch job: +> +> ```bash +> poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" +> ``` -### Submit Job +The virtual environment can be sourced directly at the top of your SLURM script (`source .venv/bin/activate`), so Poetry is not needed at runtime — jobs run against the installed environment. -> Note: Requires MNIST dataset download on first run. -> Download the dataset before submitting the run using: +From the project root: ```bash -poetry run python -c "from examples.mnist.utils import get_mnist_data; get_mnist_data()" +sbatch -A xxx src/deployment/frontier/mnist_example.sbatch ``` -Submit run from project root: +### Troubleshooting -```bash -sbatch -A lrnxxx src/deployment/frontier/mnist_example.sbatch -``` +- **`poetry install` fails to connect to PyPI** — Run `poetry lock` first, then retry. The lock file caches package download specs and may be stale on a new host. +- **`poetry install` fails with disk quota errors** — Poetry's default cache is in the home directory, which has limited space.
Retry with `poetry install --no-cache` or free up space in `$HOME`.