Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ __pycache__/
*.py[codz]
*$py.class
data/*
# Optional third-party example assets
examples/matey/MATEY/
examples/matey/data/
# C extensions
*.so

Expand Down
75 changes: 75 additions & 0 deletions examples/matey/Demo_SOLPS_vit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
basic_config: &basic_config
# Run settings
log_to_wandb: !!bool False #True # Use wandb integration
log_to_screen: !!bool True # Log progress to screen.
save_checkpoint: !!bool True # Save checkpoints
checkpoint_save_interval: 10 # Save every # epochs - also saves "best" according to val loss
debug_grad: !!bool True # Compute gradient/step_sizes/etc. for debugging
true_time: !!bool False # Debugging setting - sets num workers to zero and activates syncs
num_data_workers: 2 #6 # Generally pulling 8 cpu per process, so using 6 for DL - not sure if best ratio
enable_amp: !!bool False # Use automatic mixed precision - blows up with low variance fields right now
compile: !!bool False # Compile model - Does not currently work
gradient_checkpointing: !!bool False # Whether to use gradient checkpointing - Slow, but lower memory
exp_dir: './Dev_SOLPS' # Output path
log_interval: 1 # How often to log - Don't think this is actually implemented
pretrained: !!bool False # Whether to load a pretrained model
# Training settings
drop_path: 0.1
batch_size: 64
max_epochs: 10
scheduler_epochs: -1
epoch_size: 20
rescale_gradients: !!bool False # Activate hook that scales block gradients to norm 1
optimizer: 'AdamW' # DAdaptAdam 'AdamW' 'SGD'
scheduler: 'none' # Only cosine implemented
warmup_steps: 0 # Warmup when not using DAdapt
learning_rate: 1e-3 #
weight_decay: 1e-3
n_states: 29 # Must be >= max field label + 1 in the dataset
state_names: ['Pressure', 'Vx', 'Vy', 'Density', 'Vx', 'Vy', 'Density', 'Pressure'] # These are not used now!
dt: 1 # Striding of data - Not currently implemented > 1
leadtime_max: 10 #prediction lead time range [1, leadtime_max]
autoregressive: !!bool True # autoregressive training or one-step prediction
supportdata: # Whether to use support data (e.g. input control actuator) as input
- input_control_act: !!bool True
n_steps: 3 #16 # Length of history to include in input
enforce_max_steps: !!bool False # If false and n_steps > dataset steps, use dataset steps. Otherwise, raise Exception.
accum_grad: 1
# Model settings
model_type: 'vit_all2all' # no need for time_type and space_type inputs
#model_type: 'svit' #currently only support time_type=="all2all_time" and space_type=="all2all"
#time_type: 'all2all_time' #
#space_type: 'all2all' #
#model_type: 'avit' #currently only support space_type=="axial_attention" and time_type=="attention"
#time_type: 'attention' #
#space_type: 'axial_attention' #
tie_fields: !!bool False # Whether to use 1 embedding per field per data
embed_dim: 192 # Dimension of internal representation - 192/384/768/1024 for Ti/S/B/L
num_heads: 3 # Number of heads for attention - 3/6/12/16 for Ti/S/B/L
processor_blocks: 12 # Number of transformer blocks in the backbone - 12/12/12/24 for Ti/S/B/L
##patch_size: [[1, 2, 2]] #[[1, 40, 40]] #, [32, 32], [64, 64]] #
tokenizer_heads:
- head_name: "tk-2D"
patch_size: [[1, 2, 2]]
sts_model: !!bool False
sts_train: !!bool False #when True, we use loss function with two parts: l_coarse/base + l_total, so that the coarse ViT approximates true solutions directly
#gammaref: 0.2 #pick all tokens that with variances larger than gammaref*max_variance to refine
#refine_ratio: 0.2 #ratio of coarse tokens picked to be refined
bias_type: 'PositionAreaBias' # Options rel, continuous, none, PositionAreaBias
bias_MLP: !!bool True
# Data settings
#train_val_test: [.6, .2, .2]
augmentation: !!bool False # Augmentation not implemented
use_all_fields: !!bool True # Prepopulate the field metadata dictionary from dictionary in datasets
tie_batches: !!bool False # Force everything in batch to come from one dset
extended_names: !!bool False # Whether to use extended names - not currently implemented
embedding_offset: 0 # Use when adding extra finetuning fields
train_data_paths: [
['examples/matey/data/fusionMT-data/solps/train', 'SOLPS2D', '','tk-2D'],
]
valid_data_paths: [
['examples/matey/data/fusionMT-data/solps/valid', 'SOLPS2D', '','tk-2D'],
]
append_datasets: [] # List of datasets to append to the input/output projections for finetuning


91 changes: 91 additions & 0 deletions examples/matey/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# MATEY Example Harness

BaseSim harness for the [MATEY](https://github.com/FusionFM/MATEY) multiscale transformer codebase.

## Setup

1. Install BaseSim (from repo root):
```bash
poetry install
```

2. Install the optional MATEY example dependency (pinned commit):
```bash
poetry install --extras matey
```
This uses GitHub SSH auth for the private MATEY repo, so your SSH key must
have access to `FusionFM/MATEY`.

This extra is pinned to:
`4e615bb5c86024632e386153bfbed028b38a8262`

Equivalent pip command:
```bash
pip install "matey @ git+ssh://git@github.com/FusionFM/MATEY.git@4e615bb5c86024632e386153bfbed028b38a8262"
```

3. (Optional) Install heavy/system-dependent packages as needed for your environment:
```bash
MAX_JOBS=4 NINJA_STATUS="[%f/%t] " pip install -vv --progress-bar on --no-build-isolation flash-attn
pip install dadaptation==3.1 # for DAdaptAdam optimizer
pip install mpi4py # requires MPI C library
pip install netCDF4 # requires HDF5/netCDF C libs
pip install git+https://github.com/sandialabs/exodusii.git # not on PyPI
```

Alternative: install flash attention without screen output

```bash
MAX_JOBS=4 pip install flash-attn --no-build-isolation # requires CUDA toolkit + nvcc
```
## Running

```bash
poetry run python -m src.main --config examples/matey/matey.toml
```

Outer-loop drift demo (L2 placeholder model + input-noise stream updates):

```bash
poetry run python -m src.main --config examples/matey/matey_outer_loop.toml
# or
./examples/matey/run_outer_loop.sh
```

## Configuration

Edit [matey.toml](matey.toml) to adjust training parameters, drift detection, and data paths.

The `[data].path` should point to your local SOLPS dataset root
(must contain `train/` and `valid/` directories).
This data is expected to be user-provided and is not tracked in git.

For the SOLPS example, the harness builds a deterministic file-level split of
`[0.7, 0.15, 0.15]` and materializes staged views under:

```text
output/matey_split_cache/<fingerprint>/{train,val,test}
```

The cache is reused when source files, split ratios, and seed are unchanged.

The example TOML is tuned for short smoke runs to make drift-triggered continual
learning dispatch easier to observe (`detection_interval=5`, `aggregation="last"`,
`adwin_delta=0.05`, `max_stream_updates=10`).

For the outer-loop harness, use [matey_outer_loop.toml](matey_outer_loop.toml):
- `data.name = "matey_outer_loop"` selects `model_outer_loop.py`.
- `data.path` points at `examples/matey/dump/SOLPS2DwION`.
- `continual_learning.update_mode = "none"` disables parameter updates.
- `drift_detection.metric_index = 0` monitors the `input_l2` metric.

## Files

| File | Description |
|---|---|
| `model.py` | `MATEYHarness` -- adapts MATEY models/data to BaseSim's `BaseModelHarness` interface |
| `matey.toml` | Experiment config |
| `model_outer_loop.py` | Outer-loop drift harness with L2 placeholder model and noisy input stream |
| `matey_outer_loop.toml` | Outer-loop experiment config |
| `run_outer_loop.sh` | Convenience run script for `matey_outer_loop.toml` |
| `pyproject.toml` | Optional example dependency manifest |
1 change: 1 addition & 0 deletions examples/matey/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

46 changes: 46 additions & 0 deletions examples/matey/matey.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
seed = 1337
device = "auto"
multi_gpu = false
verbosity = "INFO"

[model]
name = "matey_vit"
pretrained_path = ""

[data]
name = "matey"
# User-provided SOLPS dataset root containing train/ and valid/ folders.
# This path is local-only and should not be tracked in git.
path = "/path/to/fusionMT-data/solps"

[train]
batch_size = 16
num_workers = 2
init_lr = 0.001
max_iter = 200
grad_accumulation_steps = 1

[continual_learning]
update_mode = "base"

[drift_detection]
detector_name = "ADWINDetector"
detection_interval = 5
aggregation = "last"
metric_index = 0 # MATEY metrics: 0=nrmse, 1=rmse, 2=loss
reset_after_learning = false
max_stream_updates = 10

# ADWIN hyperparameters
adwin_delta = 0.05
adwin_minor_threshold = 0.3
adwin_moderate_threshold = 0.6

[logging]
backend = "wandb"
experiment_name = "matey-continual-learning" # Optional: project/experiment name

[visualization]
baseline = 0.0
input = "output/matey.csv"
output = "output/matey_dashboard.png"
13 changes: 13 additions & 0 deletions examples/matey/matey_batches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Compatibility shim re-exporting the MATEY batch adapters.

Keeps ``examples.matey.matey_batches`` importable while the actual
implementations live in ``examples.matey.src.matey_batches``.
"""

from examples.matey.src.matey_batches import (
    MateyInputBatch,
    MateyLoaderAdapter,
    MateyModelAdapter,
    MateyTargetBatch,
)

# Public API of this shim module; mirrors the names imported above.
__all__ = [
    "MateyInputBatch",
    "MateyTargetBatch",
    "MateyLoaderAdapter",
    "MateyModelAdapter",
]
44 changes: 44 additions & 0 deletions examples/matey/matey_outer_loop.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
seed = 1337
device = "auto"
multi_gpu = false
verbosity = "INFO"

[model]
name = "matey_outer_loop_l2"
pretrained_path = ""

[data]
name = "matey_outer_loop"
path = "examples/matey/dump/SOLPS2DwION"

[train]
batch_size = 1
num_workers = 0
init_lr = 0.0
max_iter = 50
grad_accumulation_steps = 1

[continual_learning]
update_mode = "none"

[drift_detection]
detector_name = "ADWINDetector"
detection_interval = 5
aggregation = "last"
metric_index = 0 # outer-loop metrics: 0=input_l2, 1=loss
reset_after_learning = false
max_stream_updates = 20

# ADWIN hyperparameters
adwin_delta = 0.05
adwin_minor_threshold = 0.3
adwin_moderate_threshold = 0.6

[logging]
backend = "wandb"
experiment_name = "matey-outer-loop-drift"

[visualization]
baseline = 0.0
input = "output/matey_outer_loop.csv"
output = "output/matey_outer_loop_dashboard.png"
Loading
Loading