From e38e54aa4a7c15e2b6c2d64292fe867652ea1607 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Tue, 14 Apr 2026 14:20:12 +0100 Subject: [PATCH 01/29] writing pseudodata in diagonal basis and saving eigvecs and eigvals as part of vp-setupfit --- doc/sphinx/source/n3fit/runcard_detailed.rst | 6 + n3fit/src/n3fit/scripts/vp_setupfit.py | 4 +- validphys2/src/validphys/n3fit_data.py | 62 +++++++++- validphys2/src/validphys/overfit_metric.py | 44 +++++-- validphys2/src/validphys/pseudodata.py | 115 ++++++++++++++---- .../src/validphys/tests/test_pseudodata.py | 7 ++ 6 files changed, 197 insertions(+), 41 deletions(-) diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst index f5ea1643eb..0db00dfaad 100644 --- a/doc/sphinx/source/n3fit/runcard_detailed.rst +++ b/doc/sphinx/source/n3fit/runcard_detailed.rst @@ -429,6 +429,12 @@ according to their experiment. Additionally, the union of these two is saved in ``/replica_/datacuts_theory_fitting_pseudodata_table.csv`` if one is not interested in the exact nature of the splitting. +When ``diagonal_basis: true`` is used (by default), the saved pseudodata indices are labeled as +``eigenmode `` corresponding to the diagonal basis used in the fit. ``vp-setupfit`` writes one +file with the diagonal-basis elements under: +``/tables/diagonal_basis_rotation.csv``. +This file stores both the eigenvalues and eigenvectors used for the rotation. 
+ Imposing sum rules ^^^^^^^^^^^^^^^^^^ diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 8ed0d8766a..abfda732bb 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -51,6 +51,7 @@ 'validphys.commondata', 'validphys.covmats', 'validphys.filters', + 'validphys.n3fit_data', 'validphys.results', 'validphys.theorycovariance.construction', 'validphys.photon.compute', @@ -169,10 +170,11 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::closuretest::fitting n3fit_checks_action' else: filter_action = 'datacuts::theory::fitting filter' + rotation_action = 'datacuts::theory::fitting save_diagonal_basis_rotation' check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest - SETUPFIT_FIXED_CONFIG['actions_'] += [check_n3fit_action, filter_action] + SETUPFIT_FIXED_CONFIG['actions_'] += [check_n3fit_action, filter_action, rotation_action] # Check theory covariance matrix configuration thconfig = file_content.get('theorycovmatconfig', {}) diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index b5c5e6e239..3588f76446 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -10,6 +10,7 @@ import functools import hashlib import logging +import pathlib import numpy as np import pandas as pd @@ -21,6 +22,8 @@ log = logging.getLogger(__name__) +DIAGONAL_BASIS_ROTATION_FILENAME = "diagonal_basis_rotation.csv" + class Hashrray(TupleComp): """Wrapper class to hash a numpy array so it can be cached.""" @@ -546,6 +549,35 @@ def fitting_data_dict( exps_fitting_data_dict = collect("fitting_data_dict", ("group_dataset_inputs_by_metadata",)) +groups_dataset_inputs_fitting_covmat = collect( + "dataset_inputs_fitting_covmat", ("group_dataset_inputs_by_metadata",) +) + + +def 
save_diagonal_basis_rotation(output_path, _inv_covmat_prepared, diagonal_basis=True): + """Store a single fit-level table with eigenvalues and eigenvectors. + + The rows are globally indexed as ``eigenmode N`` to line up with diagonal-basis pseudodata. + """ + if not diagonal_basis: + return None + + table_path = pathlib.Path(output_path) / "tables" + table_path.mkdir(parents=True, exist_ok=True) + + _, _, diagonal_rotation, eig_vals = _inv_covmat_prepared + if diagonal_rotation is None or eig_vals is None: + log.warning("No diagonal-basis rotation available; skipping table export.") + return None + + df_rotation = pd.DataFrame(diagonal_rotation) + df_rotation.insert(0, "eig_val", eig_vals) + df_rotation.index = pd.Index([f"eigenmode {i}" for i in range(len(eig_vals))]) + + output_file = table_path / DIAGONAL_BASIS_ROTATION_FILENAME + df_rotation.to_csv(output_file, sep="\t", index=True) + log.info("Saved diagonal-basis rotation table to %s", output_file) + return output_file def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nnseed): @@ -575,7 +607,9 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn ) -def replica_pseudodata(experiment_indexed_make_replica, replica): +def replica_pseudodata( + experiment_indexed_make_replica, exps_fitting_data_dict, replica, diagonal_basis=True +): """Creates a pandas DataFrame containing the generated pseudodata. The index is :py:func:`validphys.results.experiments_index` and the columns is the replica numbers. @@ -586,9 +620,29 @@ def replica_pseudodata(experiment_indexed_make_replica, replica): `fitting::savepseudodata` is `true` (as per the default setting) The table can be found in the replica folder i.e. 
/nnfit/replica_*/ """ - df = pd.concat(experiment_indexed_make_replica) - df.columns = [f"replica {replica}"] - return df + replica_column = f"replica {replica}" + + if not diagonal_basis: + df = pd.concat(experiment_indexed_make_replica) + df.columns = [replica_column] + return df + + diagonal_frames = [] + mode_counter = 0 + for indexed_replica, fit_data in zip(experiment_indexed_make_replica, exps_fitting_data_dict): + values = indexed_replica.iloc[:, 0].to_numpy() + diag_rot = fit_data.get("data_transformation") + rotated_values = values if diag_rot is None else diag_rot @ values + next_counter = mode_counter + len(rotated_values) + eigenmode_index = pd.Index( + [f"eigenmode {i}" for i in range(mode_counter, next_counter)], name="eigenmode" + ) + diagonal_frames.append( + pd.DataFrame(rotated_values, index=eigenmode_index, columns=[replica_column]) + ) + mode_counter = next_counter + + return pd.concat(diagonal_frames) @_per_replica diff --git a/validphys2/src/validphys/overfit_metric.py b/validphys2/src/validphys/overfit_metric.py index 2d8c1d1c8f..9101e0a4ff 100644 --- a/validphys2/src/validphys/overfit_metric.py +++ b/validphys2/src/validphys/overfit_metric.py @@ -17,6 +17,7 @@ from reportengine.table import table from validphys import plotutils from validphys.checks import check_at_least_two_replicas +from validphys.pseudodata import load_diagonal_rotation_and_eigenvalues log = logging.getLogger(__name__) @@ -39,6 +40,12 @@ def _create_new_val_pseudodata(pdf_data_index, fit_data_indices_list): return np.array(vl_data_fitrep)[:, :, 0] +def _eigenmode_ids(eigenmode_index): + return np.array( + [int(str(label).replace("eigenmode ", "")) for label in eigenmode_index], dtype=int + ) + + @check_at_least_two_replicas def calculate_chi2s_per_replica( pdf, # for the check @@ -47,6 +54,7 @@ def calculate_chi2s_per_replica( preds, dataset_inputs, groups_covmat_no_table, + fit, ): """Calculates, for each PDF replica, the chi2 of the validation with the pseudodata 
generated for all other replicas in the fit @@ -86,28 +94,46 @@ def calculate_chi2s_per_replica( PDF_predictions = pd.concat(pp) + diagonal_mode = isinstance( + recreate_pdf_pseudodata_no_table[0].val_idx, pd.Index + ) and not isinstance(recreate_pdf_pseudodata_no_table[0].val_idx, pd.MultiIndex) + rotation = None + eigenvalues = None + if diagonal_mode: + rotation, eigenvalues = load_diagonal_rotation_and_eigenvalues(fit) + chi2s_per_replica = [] for enum, pdf_data_index in enumerate(recreate_pdf_pseudodata_no_table): - prediction_filter = pdf_data_index.val_idx.droplevel(level=0) - prediction_filter.rename(["dataset", "data"], inplace=True) - PDF_predictions_val = PDF_predictions.loc[prediction_filter] - PDF_predictions_val = PDF_predictions_val.values[:, enum] + if diagonal_mode and rotation is not None: + # Build full prediction vector in data space for this PDF replica and rotate it. + full_prediction = PDF_predictions.values[:, enum] + rotated_prediction = rotation @ full_prediction + mode_ids = _eigenmode_ids(pdf_data_index.val_idx) + PDF_predictions_val = rotated_prediction[mode_ids] + else: + prediction_filter = pdf_data_index.val_idx.droplevel(level=0) + prediction_filter.rename(["dataset", "data"], inplace=True) + PDF_predictions_val = PDF_predictions.loc[prediction_filter] + PDF_predictions_val = PDF_predictions_val.values[:, enum] new_val_pseudodata_list = _create_new_val_pseudodata( pdf_data_index, recreate_pdf_pseudodata_no_table ) - invcovmat_vl = np.linalg.inv( - groups_covmat_no_table[pdf_data_index.val_idx].T[pdf_data_index.val_idx] - ) + if diagonal_mode and eigenvalues is not None: + mode_ids = _eigenmode_ids(pdf_data_index.val_idx) + invcovmat_vl = np.diag(1.0 / eigenvalues[mode_ids]) + else: + invcovmat_vl = np.linalg.inv( + groups_covmat_no_table[pdf_data_index.val_idx].T[pdf_data_index.val_idx] + ) tmp = PDF_predictions_val - new_val_pseudodata_list chi2 = np.einsum("ij,jk,ik->i", tmp, invcovmat_vl, tmp) / tmp.shape[1] 
chi2s_per_replica.append(chi2) - ret = np.array(chi2s_per_replica) - return ret + return np.array(chi2s_per_replica) def array_expected_overfitting( diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index de7dd36c69..c10db996a3 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -20,6 +20,8 @@ log = logging.getLogger(__name__) +DIAGONAL_BASIS_ROTATION_FILENAME = "diagonal_basis_rotation.csv" + DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx']) context_index = collect("groups_index", ("fitcontext",)) @@ -31,7 +33,7 @@ class ReplicaGenerationError(Exception): pass -def read_replica_pseudodata(fit, context_index, replica): +def read_replica_pseudodata(fit, context_index, replica, diagonal_basis=True): """Function to handle the reading of training and validation splits for a fit that has been produced with the ``savepseudodata`` flag set to ``True``. @@ -70,11 +72,6 @@ def read_replica_pseudodata(fit, context_index, replica): 5 3.117819 6 0.771079 """ - # List of length 1 due to the collect - context_index = context_index[0] - # The [0] is because of how pandas handles sorting a MultiIndex - sorted_index = context_index.sortlevel(level=range(1, 3))[0] - log.debug(f"Reading pseudodata & training/validation splits from {fit.name}.") replica_path = fit.path / "nnfit" / f"replica_{replica}" @@ -88,8 +85,16 @@ def read_replica_pseudodata(fit, context_index, replica): vl_pseudodatafile = "datacuts_theory_fitting_validation_pseudodata.csv" try: - tr = pd.read_csv(replica_path / tr_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0) - val = pd.read_csv(replica_path / vl_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0) + if diagonal_basis: + tr = pd.read_csv(replica_path / tr_pseudodatafile, index_col=[0], sep="\t", header=0) + val = pd.read_csv(replica_path / vl_pseudodatafile, index_col=[0], sep="\t", header=0) + else: + tr = pd.read_csv( + 
replica_path / tr_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0 + ) + val = pd.read_csv( + replica_path / vl_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0 + ) except FileNotFoundError as e: raise FileNotFoundError( "Could not find saved training and validation data files. " @@ -100,19 +105,23 @@ def read_replica_pseudodata(fit, context_index, replica): pseudodata = pd.concat((tr, val)) - # In order for this function to work also with old fit, it is necessary to remap the names - # being read (since the names in the context have already been remapped) - # The following checks whether a given name is in both the context and the fit, and if not - # tries to get it from the old_to_new mapping. - mapping = {} - context_datasets = context_index.get_level_values("dataset").unique() - for dsname in pseudodata.index.get_level_values("dataset").unique(): - if dsname not in context_datasets: - new_name, _ = legacy_to_new_map(dsname) - mapping[dsname] = new_name + if not diagonal_basis: + # List of length 1 due to the collect + context_index = context_index[0] + # The [0] is because of how pandas handles sorting a MultiIndex + sorted_index = context_index.sortlevel(level=range(1, 3))[0] + + # In order for this function to work also with old fit, it is necessary to remap the names + # being read (since the names in the context have already been remapped) + mapping = {} + context_datasets = context_index.get_level_values("dataset").unique() + for dsname in pseudodata.index.get_level_values("dataset").unique(): + if dsname not in context_datasets: + new_name, _ = legacy_to_new_map(dsname) + mapping[dsname] = new_name - pseudodata = pseudodata.rename(mapping, level=1).sort_index(level=range(1, 3)) - pseudodata.index = sorted_index + pseudodata = pseudodata.rename(mapping, level=1).sort_index(level=range(1, 3)) + pseudodata.index = sorted_index tr = pseudodata[pseudodata["type"] == "training"] val = pseudodata[pseudodata["type"] == "validation"] @@ -314,6 
+323,28 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) +def _load_diagonal_rotation(fit): + """Load the fixed diagonal-basis rotation table written during vp-setupfit.""" + rotation_path = fit.path / "tables" / DIAGONAL_BASIS_ROTATION_FILENAME + rotation_table = pd.read_csv(rotation_path, sep="\t", index_col=0) + rotation_table = rotation_table.sort_index( + key=lambda idx: idx.str.replace("eigenmode ", "").astype(int) + ) + eigenvector_cols = [c for c in rotation_table.columns if c.startswith("evec_")] + return rotation_table[eigenvector_cols].to_numpy() + + +def load_diagonal_rotation_and_eigenvalues(fit): + """Return the global rotation matrix and eigenvalues for diagonal-basis fits.""" + rotation_path = fit.path / "tables" / DIAGONAL_BASIS_ROTATION_FILENAME + rotation_table = pd.read_csv(rotation_path, sep="\t", index_col=0) + rotation_table = rotation_table.sort_index( + key=lambda idx: idx.str.replace("eigenmode ", "").astype(int) + ) + eigenvector_cols = [c for c in rotation_table.columns if c.startswith("evec_")] + return rotation_table[eigenvector_cols].to_numpy(), rotation_table["eigenvalue"].to_numpy() + + def level0_commondata_wc(data, fakepdf): """ Given a validphys.core.DataGroupSpec object, load commondata and @@ -432,11 +463,17 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul ) # ================== generation of Level1 data ======================# - central_vals= central_values_array(level0_commondata_wc) + central_vals = central_values_array(level0_commondata_wc) group_mult_errs = group_multiplicative_errors(level0_commondata_wc, sep_mult=sep_mult) group_pos_mask = group_positivity_mask(level0_commondata_wc) level1_data = make_replica( - central_vals, filterseed, covmat, group_multiplicative_errors=group_mult_errs, group_positivity_mask=group_pos_mask, sep_mult=sep_mult, genrep=True, + central_vals, + filterseed, + covmat, + 
group_multiplicative_errors=group_mult_errs, + group_positivity_mask=group_pos_mask, + sep_mult=sep_mult, + genrep=True, ) indexed_level1_data = indexed_make_replica(data_index, level1_data) @@ -468,7 +505,9 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul indexed_make_replicas = collect('indexed_make_replica', ('replicas',)) -def recreate_fit_pseudodata(_recreate_fit_pseudodata, fitreplicas, fit_masks): +def recreate_fit_pseudodata( + _recreate_fit_pseudodata, fitreplicas, fit_masks, fit, diagonal_basis=False +): """Function used to reconstruct the pseudodata seen by each of the Monte Carlo fit replicas. @@ -493,16 +532,28 @@ def recreate_fit_pseudodata(_recreate_fit_pseudodata, fitreplicas, fit_masks): :py:func:`validphys.pseudodata.recreate_pdf_pseudodata` """ res = [] + diagonal_rotation = _load_diagonal_rotation(fit) if diagonal_basis else None + for pseudodata, mask, rep in zip(_recreate_fit_pseudodata, fit_masks, fitreplicas): df = pd.concat(pseudodata) df.columns = [f"replica {rep}"] + + if diagonal_rotation is not None: + rotated_values = diagonal_rotation @ df.iloc[:, 0].to_numpy() + eigenmode_index = pd.Index( + [f"eigenmode {i}" for i in range(len(rotated_values))], name="eigenmode" + ) + df = pd.DataFrame(rotated_values, index=eigenmode_index, columns=[f"replica {rep}"]) + tr_idx = df.loc[mask[0].values].index val_idx = df.loc[mask[1].values].index res.append(DataTrValSpec(df, tr_idx, val_idx)) return res -def recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks): +def recreate_pdf_pseudodata( + _recreate_pdf_pseudodata, pdfreplicas, pdf_masks, fit, diagonal_basis=False +): """Like :py:func:`validphys.pseudodata.recreate_fit_pseudodata` but accounts for the postfit reshuffling of replicas. 
@@ -522,11 +573,21 @@ def recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks): -------- :py:func:`validphys.pseudodata.recreate_fit_pseudodata` """ - return recreate_fit_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks) + return recreate_fit_pseudodata( + _recreate_pdf_pseudodata, pdfreplicas, pdf_masks, fit, diagonal_basis=diagonal_basis + ) pdf_masks_no_table = collect('replica_mask', ('pdfreplicas', 'fitenvironment')) -def recreate_pdf_pseudodata_no_table(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks_no_table): - return recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks_no_table) +def recreate_pdf_pseudodata_no_table( + _recreate_pdf_pseudodata, pdfreplicas, pdf_masks_no_table, fit, diagonal_basis=False +): + return recreate_pdf_pseudodata( + _recreate_pdf_pseudodata, + pdfreplicas, + pdf_masks_no_table, + fit, + diagonal_basis=diagonal_basis, + ) diff --git a/validphys2/src/validphys/tests/test_pseudodata.py b/validphys2/src/validphys/tests/test_pseudodata.py index cbb1cc8b8d..29b9f37aa4 100644 --- a/validphys2/src/validphys/tests/test_pseudodata.py +++ b/validphys2/src/validphys/tests/test_pseudodata.py @@ -8,6 +8,8 @@ recreation. 
""" +from pathlib import Path + import numpy as np from numpy.testing import assert_allclose import pandas as pd @@ -85,6 +87,11 @@ def test_read_matches_recreate(): for fit in [PSEUDODATA_FIT, PSEUDODATA_FIT_DIAG]: diagonal_basis = True if fit == PSEUDODATA_FIT_DIAG else False + if diagonal_basis: + fit_spec = API.fit(fit=fit) + rotation_file = Path(fit_spec.path) / "tables" / "diagonal_basis_rotation.csv" + if not rotation_file.exists(): + pytest.fail(f"Diagonal rotation file not available for {fit}") reads = API.read_fit_pseudodata(fit=fit, diagonal_basis=diagonal_basis) recreates = API.recreate_fit_pseudodata(fit=fit, diagonal_basis=diagonal_basis) for read, recreate in zip(reads, recreates): From a9f621e2e684a684336e5b07e6653a8a9378431e Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Tue, 21 Apr 2026 11:36:44 +0100 Subject: [PATCH 02/29] using table decorator --- n3fit/src/n3fit/scripts/vp_setupfit.py | 2 +- validphys2/src/validphys/n3fit_data.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index abfda732bb..f9e05c707b 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -170,7 +170,7 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::closuretest::fitting n3fit_checks_action' else: filter_action = 'datacuts::theory::fitting filter' - rotation_action = 'datacuts::theory::fitting save_diagonal_basis_rotation' + rotation_action = 'datacuts::theory::fitting diagonal_basis_rotation_table' check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 3588f76446..e2db7e124c 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ 
b/validphys2/src/validphys/n3fit_data.py @@ -554,10 +554,10 @@ def fitting_data_dict( ) -def save_diagonal_basis_rotation(output_path, _inv_covmat_prepared, diagonal_basis=True): - """Store a single fit-level table with eigenvalues and eigenvectors. - - The rows are globally indexed as ``eigenmode N`` to line up with diagonal-basis pseudodata. +@table +def diagonal_basis_rotation_table(output_path, _inv_covmat_prepared, diagonal_basis=True): + """ + Save the diagonal basis rotation. """ if not diagonal_basis: return None @@ -573,11 +573,8 @@ def save_diagonal_basis_rotation(output_path, _inv_covmat_prepared, diagonal_bas df_rotation = pd.DataFrame(diagonal_rotation) df_rotation.insert(0, "eig_val", eig_vals) df_rotation.index = pd.Index([f"eigenmode {i}" for i in range(len(eig_vals))]) - - output_file = table_path / DIAGONAL_BASIS_ROTATION_FILENAME - df_rotation.to_csv(output_file, sep="\t", index=True) - log.info("Saved diagonal-basis rotation table to %s", output_file) - return output_file + log.info("Saving diagonal-basis rotation table") + return df_rotation def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nnseed): From 88620455292d978d05a072eefeb08fa1ec8dcc1c Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Tue, 21 Apr 2026 11:41:24 +0100 Subject: [PATCH 03/29] more cleaning --- validphys2/src/validphys/n3fit_data.py | 39 ++++++++++--------- .../src/validphys/tests/test_pseudodata.py | 7 ---- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index e2db7e124c..8cc149f105 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -549,9 +549,6 @@ def fitting_data_dict( exps_fitting_data_dict = collect("fitting_data_dict", ("group_dataset_inputs_by_metadata",)) -groups_dataset_inputs_fitting_covmat = collect( - "dataset_inputs_fitting_covmat", 
("group_dataset_inputs_by_metadata",) -) @table @@ -604,8 +601,19 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn ) +def diagonal_indexed_make_replica(indexed_make_replica, fitting_data_dict): + """Rotate one group pseudodata block to the diagonal basis.""" + values = indexed_make_replica.iloc[:, 0].to_numpy() + diag_rot = fitting_data_dict.get("data_transformation") + rotated_values = values if diag_rot is None else diag_rot @ values + return pd.DataFrame(rotated_values, columns=["data"]) + + def replica_pseudodata( - experiment_indexed_make_replica, exps_fitting_data_dict, replica, diagonal_basis=True + experiment_indexed_make_replica, + experiment_diagonal_indexed_make_replica, + replica, + diagonal_basis=True, ): """Creates a pandas DataFrame containing the generated pseudodata. The index is :py:func:`validphys.results.experiments_index` and the columns @@ -624,22 +632,15 @@ def replica_pseudodata( df.columns = [replica_column] return df - diagonal_frames = [] - mode_counter = 0 - for indexed_replica, fit_data in zip(experiment_indexed_make_replica, exps_fitting_data_dict): - values = indexed_replica.iloc[:, 0].to_numpy() - diag_rot = fit_data.get("data_transformation") - rotated_values = values if diag_rot is None else diag_rot @ values - next_counter = mode_counter + len(rotated_values) - eigenmode_index = pd.Index( - [f"eigenmode {i}" for i in range(mode_counter, next_counter)], name="eigenmode" - ) - diagonal_frames.append( - pd.DataFrame(rotated_values, index=eigenmode_index, columns=[replica_column]) - ) - mode_counter = next_counter + df = pd.concat(experiment_diagonal_indexed_make_replica, ignore_index=True) + df.columns = [replica_column] + df.index = pd.Index([f"eigenmode {i}" for i in range(len(df))]) + return df + - return pd.concat(diagonal_frames) +experiment_diagonal_indexed_make_replica = collect( + "diagonal_indexed_make_replica", ("group_dataset_inputs_by_metadata",) +) @_per_replica diff --git 
a/validphys2/src/validphys/tests/test_pseudodata.py b/validphys2/src/validphys/tests/test_pseudodata.py index 29b9f37aa4..cbb1cc8b8d 100644 --- a/validphys2/src/validphys/tests/test_pseudodata.py +++ b/validphys2/src/validphys/tests/test_pseudodata.py @@ -8,8 +8,6 @@ recreation. """ -from pathlib import Path - import numpy as np from numpy.testing import assert_allclose import pandas as pd @@ -87,11 +85,6 @@ def test_read_matches_recreate(): for fit in [PSEUDODATA_FIT, PSEUDODATA_FIT_DIAG]: diagonal_basis = True if fit == PSEUDODATA_FIT_DIAG else False - if diagonal_basis: - fit_spec = API.fit(fit=fit) - rotation_file = Path(fit_spec.path) / "tables" / "diagonal_basis_rotation.csv" - if not rotation_file.exists(): - pytest.fail(f"Diagonal rotation file not available for {fit}") reads = API.read_fit_pseudodata(fit=fit, diagonal_basis=diagonal_basis) recreates = API.recreate_fit_pseudodata(fit=fit, diagonal_basis=diagonal_basis) for read, recreate in zip(reads, recreates): From 8fd4d1f7072a505da30cd3b23f35a8185c6a99c4 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Tue, 21 Apr 2026 12:06:59 +0100 Subject: [PATCH 04/29] more cleaning --- validphys2/src/validphys/n3fit_data.py | 6 -- validphys2/src/validphys/overfit_metric.py | 44 ++------- validphys2/src/validphys/pseudodata.py | 105 +++++---------------- 3 files changed, 34 insertions(+), 121 deletions(-) diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 8cc149f105..f54e35ccb9 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -10,7 +10,6 @@ import functools import hashlib import logging -import pathlib import numpy as np import pandas as pd @@ -22,8 +21,6 @@ log = logging.getLogger(__name__) -DIAGONAL_BASIS_ROTATION_FILENAME = "diagonal_basis_rotation.csv" - class Hashrray(TupleComp): """Wrapper class to hash a numpy array so it can be cached.""" @@ -559,9 +556,6 @@ def 
diagonal_basis_rotation_table(output_path, _inv_covmat_prepared, diagonal_ba if not diagonal_basis: return None - table_path = pathlib.Path(output_path) / "tables" - table_path.mkdir(parents=True, exist_ok=True) - _, _, diagonal_rotation, eig_vals = _inv_covmat_prepared if diagonal_rotation is None or eig_vals is None: log.warning("No diagonal-basis rotation available; skipping table export.") diff --git a/validphys2/src/validphys/overfit_metric.py b/validphys2/src/validphys/overfit_metric.py index 9101e0a4ff..2d8c1d1c8f 100644 --- a/validphys2/src/validphys/overfit_metric.py +++ b/validphys2/src/validphys/overfit_metric.py @@ -17,7 +17,6 @@ from reportengine.table import table from validphys import plotutils from validphys.checks import check_at_least_two_replicas -from validphys.pseudodata import load_diagonal_rotation_and_eigenvalues log = logging.getLogger(__name__) @@ -40,12 +39,6 @@ def _create_new_val_pseudodata(pdf_data_index, fit_data_indices_list): return np.array(vl_data_fitrep)[:, :, 0] -def _eigenmode_ids(eigenmode_index): - return np.array( - [int(str(label).replace("eigenmode ", "")) for label in eigenmode_index], dtype=int - ) - - @check_at_least_two_replicas def calculate_chi2s_per_replica( pdf, # for the check @@ -54,7 +47,6 @@ def calculate_chi2s_per_replica( preds, dataset_inputs, groups_covmat_no_table, - fit, ): """Calculates, for each PDF replica, the chi2 of the validation with the pseudodata generated for all other replicas in the fit @@ -94,46 +86,28 @@ def calculate_chi2s_per_replica( PDF_predictions = pd.concat(pp) - diagonal_mode = isinstance( - recreate_pdf_pseudodata_no_table[0].val_idx, pd.Index - ) and not isinstance(recreate_pdf_pseudodata_no_table[0].val_idx, pd.MultiIndex) - rotation = None - eigenvalues = None - if diagonal_mode: - rotation, eigenvalues = load_diagonal_rotation_and_eigenvalues(fit) - chi2s_per_replica = [] for enum, pdf_data_index in enumerate(recreate_pdf_pseudodata_no_table): - if diagonal_mode and rotation is 
not None: - # Build full prediction vector in data space for this PDF replica and rotate it. - full_prediction = PDF_predictions.values[:, enum] - rotated_prediction = rotation @ full_prediction - mode_ids = _eigenmode_ids(pdf_data_index.val_idx) - PDF_predictions_val = rotated_prediction[mode_ids] - else: - prediction_filter = pdf_data_index.val_idx.droplevel(level=0) - prediction_filter.rename(["dataset", "data"], inplace=True) - PDF_predictions_val = PDF_predictions.loc[prediction_filter] - PDF_predictions_val = PDF_predictions_val.values[:, enum] + prediction_filter = pdf_data_index.val_idx.droplevel(level=0) + prediction_filter.rename(["dataset", "data"], inplace=True) + PDF_predictions_val = PDF_predictions.loc[prediction_filter] + PDF_predictions_val = PDF_predictions_val.values[:, enum] new_val_pseudodata_list = _create_new_val_pseudodata( pdf_data_index, recreate_pdf_pseudodata_no_table ) - if diagonal_mode and eigenvalues is not None: - mode_ids = _eigenmode_ids(pdf_data_index.val_idx) - invcovmat_vl = np.diag(1.0 / eigenvalues[mode_ids]) - else: - invcovmat_vl = np.linalg.inv( - groups_covmat_no_table[pdf_data_index.val_idx].T[pdf_data_index.val_idx] - ) + invcovmat_vl = np.linalg.inv( + groups_covmat_no_table[pdf_data_index.val_idx].T[pdf_data_index.val_idx] + ) tmp = PDF_predictions_val - new_val_pseudodata_list chi2 = np.einsum("ij,jk,ik->i", tmp, invcovmat_vl, tmp) / tmp.shape[1] chi2s_per_replica.append(chi2) + ret = np.array(chi2s_per_replica) - return np.array(chi2s_per_replica) + return ret def array_expected_overfitting( diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index c10db996a3..8f1eca10e8 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -20,8 +20,6 @@ log = logging.getLogger(__name__) -DIAGONAL_BASIS_ROTATION_FILENAME = "diagonal_basis_rotation.csv" - DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx']) 
context_index = collect("groups_index", ("fitcontext",)) @@ -33,7 +31,7 @@ class ReplicaGenerationError(Exception): pass -def read_replica_pseudodata(fit, context_index, replica, diagonal_basis=True): +def read_replica_pseudodata(fit, context_index, replica): """Function to handle the reading of training and validation splits for a fit that has been produced with the ``savepseudodata`` flag set to ``True``. @@ -72,6 +70,11 @@ def read_replica_pseudodata(fit, context_index, replica, diagonal_basis=True): 5 3.117819 6 0.771079 """ + # List of length 1 due to the collect + context_index = context_index[0] + # The [0] is because of how pandas handles sorting a MultiIndex + sorted_index = context_index.sortlevel(level=range(1, 3))[0] + log.debug(f"Reading pseudodata & training/validation splits from {fit.name}.") replica_path = fit.path / "nnfit" / f"replica_{replica}" @@ -85,16 +88,8 @@ def read_replica_pseudodata(fit, context_index, replica, diagonal_basis=True): vl_pseudodatafile = "datacuts_theory_fitting_validation_pseudodata.csv" try: - if diagonal_basis: - tr = pd.read_csv(replica_path / tr_pseudodatafile, index_col=[0], sep="\t", header=0) - val = pd.read_csv(replica_path / vl_pseudodatafile, index_col=[0], sep="\t", header=0) - else: - tr = pd.read_csv( - replica_path / tr_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0 - ) - val = pd.read_csv( - replica_path / vl_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0 - ) + tr = pd.read_csv(replica_path / tr_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0) + val = pd.read_csv(replica_path / vl_pseudodatafile, index_col=[0, 1, 2], sep="\t", header=0) except FileNotFoundError as e: raise FileNotFoundError( "Could not find saved training and validation data files. 
" @@ -105,23 +100,19 @@ def read_replica_pseudodata(fit, context_index, replica, diagonal_basis=True): pseudodata = pd.concat((tr, val)) - if not diagonal_basis: - # List of length 1 due to the collect - context_index = context_index[0] - # The [0] is because of how pandas handles sorting a MultiIndex - sorted_index = context_index.sortlevel(level=range(1, 3))[0] - - # In order for this function to work also with old fit, it is necessary to remap the names - # being read (since the names in the context have already been remapped) - mapping = {} - context_datasets = context_index.get_level_values("dataset").unique() - for dsname in pseudodata.index.get_level_values("dataset").unique(): - if dsname not in context_datasets: - new_name, _ = legacy_to_new_map(dsname) - mapping[dsname] = new_name + # In order for this function to work also with old fit, it is necessary to remap the names + # being read (since the names in the context have already been remapped) + # The following checks whether a given name is in both the context and the fit, and if not + # tries to get it from the old_to_new mapping. 
+ mapping = {} + context_datasets = context_index.get_level_values("dataset").unique() + for dsname in pseudodata.index.get_level_values("dataset").unique(): + if dsname not in context_datasets: + new_name, _ = legacy_to_new_map(dsname) + mapping[dsname] = new_name - pseudodata = pseudodata.rename(mapping, level=1).sort_index(level=range(1, 3)) - pseudodata.index = sorted_index + pseudodata = pseudodata.rename(mapping, level=1).sort_index(level=range(1, 3)) + pseudodata.index = sorted_index tr = pseudodata[pseudodata["type"] == "training"] val = pseudodata[pseudodata["type"] == "validation"] @@ -323,28 +314,6 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) -def _load_diagonal_rotation(fit): - """Load the fixed diagonal-basis rotation table written during vp-setupfit.""" - rotation_path = fit.path / "tables" / DIAGONAL_BASIS_ROTATION_FILENAME - rotation_table = pd.read_csv(rotation_path, sep="\t", index_col=0) - rotation_table = rotation_table.sort_index( - key=lambda idx: idx.str.replace("eigenmode ", "").astype(int) - ) - eigenvector_cols = [c for c in rotation_table.columns if c.startswith("evec_")] - return rotation_table[eigenvector_cols].to_numpy() - - -def load_diagonal_rotation_and_eigenvalues(fit): - """Return the global rotation matrix and eigenvalues for diagonal-basis fits.""" - rotation_path = fit.path / "tables" / DIAGONAL_BASIS_ROTATION_FILENAME - rotation_table = pd.read_csv(rotation_path, sep="\t", index_col=0) - rotation_table = rotation_table.sort_index( - key=lambda idx: idx.str.replace("eigenmode ", "").astype(int) - ) - eigenvector_cols = [c for c in rotation_table.columns if c.startswith("evec_")] - return rotation_table[eigenvector_cols].to_numpy(), rotation_table["eigenvalue"].to_numpy() - - def level0_commondata_wc(data, fakepdf): """ Given a validphys.core.DataGroupSpec object, load commondata and @@ -505,9 +474,7 @@ def make_level1_data(data, 
level0_commondata_wc, filterseed, data_index, sep_mul indexed_make_replicas = collect('indexed_make_replica', ('replicas',)) -def recreate_fit_pseudodata( - _recreate_fit_pseudodata, fitreplicas, fit_masks, fit, diagonal_basis=False -): +def recreate_fit_pseudodata(_recreate_fit_pseudodata, fitreplicas, fit_masks): """Function used to reconstruct the pseudodata seen by each of the Monte Carlo fit replicas. @@ -532,28 +499,16 @@ def recreate_fit_pseudodata( :py:func:`validphys.pseudodata.recreate_pdf_pseudodata` """ res = [] - diagonal_rotation = _load_diagonal_rotation(fit) if diagonal_basis else None - for pseudodata, mask, rep in zip(_recreate_fit_pseudodata, fit_masks, fitreplicas): df = pd.concat(pseudodata) df.columns = [f"replica {rep}"] - - if diagonal_rotation is not None: - rotated_values = diagonal_rotation @ df.iloc[:, 0].to_numpy() - eigenmode_index = pd.Index( - [f"eigenmode {i}" for i in range(len(rotated_values))], name="eigenmode" - ) - df = pd.DataFrame(rotated_values, index=eigenmode_index, columns=[f"replica {rep}"]) - tr_idx = df.loc[mask[0].values].index val_idx = df.loc[mask[1].values].index res.append(DataTrValSpec(df, tr_idx, val_idx)) return res -def recreate_pdf_pseudodata( - _recreate_pdf_pseudodata, pdfreplicas, pdf_masks, fit, diagonal_basis=False -): +def recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks): """Like :py:func:`validphys.pseudodata.recreate_fit_pseudodata` but accounts for the postfit reshuffling of replicas. 
@@ -573,21 +528,11 @@ def recreate_pdf_pseudodata( -------- :py:func:`validphys.pseudodata.recreate_fit_pseudodata` """ - return recreate_fit_pseudodata( - _recreate_pdf_pseudodata, pdfreplicas, pdf_masks, fit, diagonal_basis=diagonal_basis - ) + return recreate_fit_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks) pdf_masks_no_table = collect('replica_mask', ('pdfreplicas', 'fitenvironment')) -def recreate_pdf_pseudodata_no_table( - _recreate_pdf_pseudodata, pdfreplicas, pdf_masks_no_table, fit, diagonal_basis=False -): - return recreate_pdf_pseudodata( - _recreate_pdf_pseudodata, - pdfreplicas, - pdf_masks_no_table, - fit, - diagonal_basis=diagonal_basis, - ) +def recreate_pdf_pseudodata_no_table(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks_no_table): + return recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_masks_no_table) From af1f2415dbc96c229262d0784d5b07eff32f9ded Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Fri, 24 Apr 2026 10:47:40 +0100 Subject: [PATCH 05/29] setting use_t0 = True in vp-setupfit Needed to compute the covmat for diagonalisation durin vp-setupfit --- n3fit/src/n3fit/scripts/vp_setupfit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index f9e05c707b..dbc41100ef 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -57,7 +57,7 @@ 'validphys.photon.compute', ] -SETUPFIT_DEFAULTS = dict(use_cuts='internal') +SETUPFIT_DEFAULTS = dict(use_cuts='internal', use_t0=True) log = logging.getLogger(__name__) From ae4b0ebcdba0a409d0a1f9ab38ae2ff4ae685de8 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Fri, 24 Apr 2026 13:17:52 +0100 Subject: [PATCH 06/29] wip on loading diag rot from table dir in vp-setupfit --- n3fit/src/n3fit/scripts/vp_setupfit.py | 2 +- 
validphys2/src/validphys/config.py | 8 ++++++-- validphys2/src/validphys/n3fit_data.py | 22 ++++++++++++---------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index dbc41100ef..11dfa94e9d 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -51,10 +51,10 @@ 'validphys.commondata', 'validphys.covmats', 'validphys.filters', - 'validphys.n3fit_data', 'validphys.results', 'validphys.theorycovariance.construction', 'validphys.photon.compute', + 'validphys.n3fit_data', ] SETUPFIT_DEFAULTS = dict(use_cuts='internal', use_t0=True) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 910fc1fd7b..8e57e4b39e 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -805,7 +805,7 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } @configparser.explicit_node - def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=False): + def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=True): """ Produces the correct covmat to be used in fitting_data_dict according to some options: whether to include the theory covmat, whether to @@ -880,12 +880,16 @@ def produce_loaded_theory_covmat( point_prescriptions=None, use_thcovmat_in_sampling=False, use_thcovmat_in_fitting=False, + use_thcovmat_in_diag_basis=True, ): """ Loads the theory covmat from the correct file according to how it was generated by vp-setupfit. 
""" - if not use_thcovmat_in_sampling and not use_thcovmat_in_fitting: + + if not use_thcovmat_in_diag_basis and not ( + use_thcovmat_in_sampling or use_thcovmat_in_fitting + ): return 0.0 # Load correct file according to how the thcovmat was generated by vp-setupfit generic_path = "datacuts_theory_theorycovmatconfig_theory_covmat_custom.csv" diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index f54e35ccb9..186297a27e 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -371,6 +371,7 @@ def fitting_data_dict( _inv_covmat_prepared, kfold_masks, fittable_datasets_masked, + output_path, threshold=0.0, ): """ @@ -421,6 +422,15 @@ def fitting_data_dict( fittable_datasets = fittable_datasets_masked # all covmat manipulation is shared across the replicas for memory purposes + + # TODO: load vp-setupfit table + diagonal_basis_saved = "datacuts_theory_fitting_diagonal_basis_rotation_table.csv" + path_diagonal_basis = output_path / "tables" / diagonal_basis_saved + eigensystem = pd.read_csv( + path_diagonal_basis, index_col=[0], header=[0], sep="\t|,", engine="python" + ) + diag_rot2 = eigensystem.iloc[:, 1:].values + eig_vals2 = eigensystem["eig_val"].values covmat, inv_true, diag_rot, eig_vals = _inv_covmat_prepared # get the masks - different for each replica so fine to call here @@ -604,10 +614,7 @@ def diagonal_indexed_make_replica(indexed_make_replica, fitting_data_dict): def replica_pseudodata( - experiment_indexed_make_replica, - experiment_diagonal_indexed_make_replica, - replica, - diagonal_basis=True, + experiment_indexed_make_replica, diagonal_indexed_make_replica, replica, diagonal_basis=True ): """Creates a pandas DataFrame containing the generated pseudodata. 
The index is :py:func:`validphys.results.experiments_index` and the columns @@ -626,17 +633,12 @@ def replica_pseudodata( df.columns = [replica_column] return df - df = pd.concat(experiment_diagonal_indexed_make_replica, ignore_index=True) + df = pd.concat(diagonal_indexed_make_replica, ignore_index=True) df.columns = [replica_column] df.index = pd.Index([f"eigenmode {i}" for i in range(len(df))]) return df -experiment_diagonal_indexed_make_replica = collect( - "diagonal_indexed_make_replica", ("group_dataset_inputs_by_metadata",) -) - - @_per_replica @table def pseudodata_table(replica_pseudodata): From ba641ee2da6a4a408f314f6e851dddd933770c9e Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 12:10:10 +0100 Subject: [PATCH 07/29] wip on loading diag rot from table dir in vp-setupfit --- n3fit/src/n3fit/scripts/vp_setupfit.py | 1 + validphys2/src/validphys/config.py | 7 +++---- validphys2/src/validphys/n3fit_data.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 11dfa94e9d..254911584d 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -170,6 +170,7 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::closuretest::fitting n3fit_checks_action' else: filter_action = 'datacuts::theory::fitting filter' + # TODO: correct namespace?, the rotation needs access to the theory covmat but it's not yet available.. 
rotation_action = 'datacuts::theory::fitting diagonal_basis_rotation_table' check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 8e57e4b39e..c4c654ac1d 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -805,7 +805,7 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } @configparser.explicit_node - def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=True): + def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=False): """ Produces the correct covmat to be used in fitting_data_dict according to some options: whether to include the theory covmat, whether to @@ -887,9 +887,8 @@ def produce_loaded_theory_covmat( was generated by vp-setupfit. """ - if not use_thcovmat_in_diag_basis and not ( - use_thcovmat_in_sampling or use_thcovmat_in_fitting - ): + # TODO: this return a zero theory covmat which means that the diagonal basis does not include the theory covmat + if not use_thcovmat_in_sampling and not use_thcovmat_in_fitting: return 0.0 # Load correct file according to how the thcovmat was generated by vp-setupfit generic_path = "datacuts_theory_theorycovmatconfig_theory_covmat_custom.csv" diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 186297a27e..63386c8416 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -633,7 +633,7 @@ def replica_pseudodata( df.columns = [replica_column] return df - df = pd.concat(diagonal_indexed_make_replica, ignore_index=True) + df = diagonal_indexed_make_replica.copy() df.columns = [replica_column] df.index = pd.Index([f"eigenmode {i}" for i in range(len(df))]) return df From fe3abca89aec0f801853b47fab44b54d0d65f165 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 
12:19:44 +0100 Subject: [PATCH 08/29] updating theory cov defaults --- validphys2/src/validphys/config.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index c4c654ac1d..41f3e09ba1 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -805,7 +805,7 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } @configparser.explicit_node - def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=False): + def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=True): """ Produces the correct covmat to be used in fitting_data_dict according to some options: whether to include the theory covmat, whether to @@ -878,16 +878,14 @@ def produce_loaded_theory_covmat( data_input, user_covmat_path=None, point_prescriptions=None, - use_thcovmat_in_sampling=False, - use_thcovmat_in_fitting=False, - use_thcovmat_in_diag_basis=True, + use_thcovmat_in_sampling=True, + use_thcovmat_in_fitting=True, ): """ Loads the theory covmat from the correct file according to how it was generated by vp-setupfit. 
""" - # TODO: this return a zero theory covmat which means that the diagonal basis does not include the theory covmat if not use_thcovmat_in_sampling and not use_thcovmat_in_fitting: return 0.0 # Load correct file according to how the thcovmat was generated by vp-setupfit From 097b9276d60963e9509c911be63f6de711638f4c Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 13:02:45 +0100 Subject: [PATCH 09/29] swapping order rotation and theory covmat in vp-setupfit --- n3fit/src/n3fit/scripts/vp_setupfit.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 254911584d..239a7e494c 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -175,7 +175,7 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest - SETUPFIT_FIXED_CONFIG['actions_'] += [check_n3fit_action, filter_action, rotation_action] + SETUPFIT_FIXED_CONFIG['actions_'] += [check_n3fit_action, filter_action] # Check theory covariance matrix configuration thconfig = file_content.get('theorycovmatconfig', {}) @@ -191,6 +191,8 @@ def from_yaml(cls, o, *args, **kwargs): 'datacuts::theory::theorycovmatconfig nnfit_theory_covmat' ) + SETUPFIT_FIXED_CONFIG['actions_'] += [rotation_action] + # Check fiatlux configuration fiatlux = file_content.get('fiatlux') if fiatlux is not None: From 320163c32a41ea7a249d29a338efa37732a4c824 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:42:40 +0100 Subject: [PATCH 10/29] vp-setupfit now runs --- n3fit/src/n3fit/scripts/vp_setupfit.py | 2 +- validphys2/src/validphys/n3fit_data.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git 
a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 239a7e494c..10fe7a691d 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -171,7 +171,7 @@ def from_yaml(cls, o, *args, **kwargs): else: filter_action = 'datacuts::theory::fitting filter' # TODO: correct namespace?, the rotation needs access to the theory covmat but it's not yet available.. - rotation_action = 'datacuts::theory::fitting diagonal_basis_rotation_table' + rotation_action = 'datacuts::theory::theorycovmatconfig diagonal_basis_rotation_table' check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 63386c8416..462797e1f2 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -313,24 +313,24 @@ def fittable_datasets_masked(data): return validphys_group_extractor(data.datasets) -def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_fitting_covmat) -> Hashrray: - """Wrap the covmat into a Hashrray for caches to work""" - return Hashrray(dataset_inputs_fitting_covmat) +# def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_fitting_covmat) -> Hashrray: +# """Wrap the covmat into a Hashrray for caches to work""" +# return Hashrray(dataset_inputs_fitting_covmat) -@functools.lru_cache -def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, diagonal_basis=True): +# @functools.lru_cache +def _inv_covmat_prepared(dataset_inputs_fitting_covmat, diagonal_basis=True): """Returns the inverse covmats for training, validation and total - attending to the right masks and whether it is diagonal or not. 
- - Since the masks and number of datapoints need to be treated for 1-point datasets - it also returns the right ndata and masks for training and validation: + attending to the right masks and whether it is diagonal or not. + s + Since the masks and number of datapoints need to be treated for 1-point datasets + it also returns the right ndata and masks for training and validation: """ - log.info( - f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}, diagonal_basis={diagonal_basis}" - ) - covmat = _hashed_dataset_inputs_fitting_covmat.array + # log.info( + # f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}, diagonal_basis={diagonal_basis}" + # ) + covmat = dataset_inputs_fitting_covmat diagonal_rotation = None eig_vals = None From 140bdcf555d648c17c1305c411b1889f0a074458 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:25:34 +0100 Subject: [PATCH 11/29] calling nnfit_theory_covmat instead of reading the csv --- validphys2/src/validphys/config.py | 5 ++--- validphys2/src/validphys/n3fit_data.py | 27 ++++++++++++++++---------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 41f3e09ba1..299a6f5093 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -805,17 +805,16 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } @configparser.explicit_node - def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=True): + def produce_dataset_inputs_fitting_covmat(self): """ Produces the correct covmat to be used in fitting_data_dict according to some options: whether to include the theory covmat, whether to separate the multiplcative errors and whether to compute the experimental covmat using the t0 prescription. 
""" + # TODO update docstring from validphys import covmats - if use_thcovmat_in_fitting: - return covmats.dataset_inputs_t0_total_covmat return covmats.dataset_inputs_t0_exp_covmat def produce_sep_mult(self, separate_multiplicative=False): diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 462797e1f2..4492e245f9 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -319,7 +319,7 @@ def fittable_datasets_masked(data): # @functools.lru_cache -def _inv_covmat_prepared(dataset_inputs_fitting_covmat, diagonal_basis=True): +def _inv_covmat_prepared(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagonal_basis=True): """Returns the inverse covmats for training, validation and total attending to the right masks and whether it is diagonal or not. s @@ -330,7 +330,13 @@ def _inv_covmat_prepared(dataset_inputs_fitting_covmat, diagonal_basis=True): # log.info( # f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}, diagonal_basis={diagonal_basis}" # ) + + # TODO: JtH, note to self: inv_covmat_prepared can no longer be called during n3fit because nnfit_theory_covmat is in a different namespace + covmat = dataset_inputs_fitting_covmat + if nnfit_theory_covmat is not None: + covmat += nnfit_theory_covmat + diagonal_rotation = None eig_vals = None @@ -368,7 +374,6 @@ def fitting_data_dict( make_replica, dataset_inputs_loaded_cd_with_cuts, masks, - _inv_covmat_prepared, kfold_masks, fittable_datasets_masked, output_path, @@ -421,17 +426,17 @@ def fitting_data_dict( expdata = make_replica fittable_datasets = fittable_datasets_masked - # all covmat manipulation is shared across the replicas for memory purposes - - # TODO: load vp-setupfit table - diagonal_basis_saved = "datacuts_theory_fitting_diagonal_basis_rotation_table.csv" + # load covmat stored at the time of vp-setupfit + diagonal_basis_saved = 
"datacuts_theory_theorycovmatconfig_diagonal_basis_rotation_table.csv" path_diagonal_basis = output_path / "tables" / diagonal_basis_saved eigensystem = pd.read_csv( path_diagonal_basis, index_col=[0], header=[0], sep="\t|,", engine="python" ) - diag_rot2 = eigensystem.iloc[:, 1:].values - eig_vals2 = eigensystem["eig_val"].values - covmat, inv_true, diag_rot, eig_vals = _inv_covmat_prepared + diag_rot = eigensystem.iloc[:, 1:].values + eig_vals = eigensystem["eig_val"].values + inv_true = np.diag(1 / eig_vals) + covmat = np.diag(eig_vals) + # covmat, inv_true, diag_rot, eig_vals = _inv_covmat_prepared # get the masks - different for each replica so fine to call here tr_mask, vl_mask = masks.tr_masks[0], masks.vl_masks[0] @@ -563,10 +568,12 @@ def diagonal_basis_rotation_table(output_path, _inv_covmat_prepared, diagonal_ba """ Save the diagonal basis rotation. """ + covmat, _, diagonal_rotation, eig_vals = _inv_covmat_prepared + if not diagonal_basis: + return None - _, _, diagonal_rotation, eig_vals = _inv_covmat_prepared if diagonal_rotation is None or eig_vals is None: log.warning("No diagonal-basis rotation available; skipping table export.") return None From ba59edfe5e04c7ab7b341be5d6b2dd9554ed4fc4 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:49:14 +0100 Subject: [PATCH 12/29] vp-setupfit stores covmat in diagonal and non-diagonal case --- n3fit/src/n3fit/scripts/vp_setupfit.py | 3 +- validphys2/src/validphys/n3fit_data.py | 51 +++++++++++++++++--------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 10fe7a691d..6c54139653 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -170,8 +170,7 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::closuretest::fitting n3fit_checks_action' else: filter_action = 
'datacuts::theory::fitting filter' - # TODO: correct namespace?, the rotation needs access to the theory covmat but it's not yet available.. - rotation_action = 'datacuts::theory::theorycovmatconfig diagonal_basis_rotation_table' + rotation_action = 'datacuts::theory::theorycovmatconfig fitting_covmat_table' check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 4492e245f9..acb4b40ff5 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -378,6 +378,7 @@ def fitting_data_dict( fittable_datasets_masked, output_path, threshold=0.0, + diagonal_basis=True, ): """ Provider which takes the information from validphys ``data``. @@ -427,16 +428,33 @@ def fitting_data_dict( fittable_datasets = fittable_datasets_masked # load covmat stored at the time of vp-setupfit - diagonal_basis_saved = "datacuts_theory_theorycovmatconfig_diagonal_basis_rotation_table.csv" - path_diagonal_basis = output_path / "tables" / diagonal_basis_saved - eigensystem = pd.read_csv( - path_diagonal_basis, index_col=[0], header=[0], sep="\t|,", engine="python" - ) - diag_rot = eigensystem.iloc[:, 1:].values - eig_vals = eigensystem["eig_val"].values - inv_true = np.diag(1 / eig_vals) - covmat = np.diag(eig_vals) - # covmat, inv_true, diag_rot, eig_vals = _inv_covmat_prepared + if diagonal_basis: + diagonal_basis_saved = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" + path_diagonal_basis = output_path / "tables" / diagonal_basis_saved + eigensystem = pd.read_csv( + path_diagonal_basis, index_col=[0], header=[0], sep="\t|,", engine="python" + ) + diag_rot = eigensystem.iloc[:, 1:].values + eig_vals = eigensystem["eig_val"].values + inv_true = np.diag(1 / eig_vals) + covmat = np.diag(eig_vals) + else: + covmat_saved = 
"datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" + path_covmat = output_path / "tables" / covmat_saved + # TODO: check if it is save to convert to numpy already at this stage, is the indexing consistent? + covmat = pd.read_csv( + path_covmat, index_col=[0, 1, 2], header=[0, 1, 2], sep="\t|,", engine="python" + ).values + diag_rot = None + eig_vals = None + # TODO: cache this so that is reused + diag_inv_sqrt_total = 1 / np.sqrt(np.diag(covmat)) + cormat_total = np.einsum("i, ij, j -> ij", diag_inv_sqrt_total, covmat, diag_inv_sqrt_total) + inv_true = ( + np.diag(diag_inv_sqrt_total) + @ np.linalg.inv(cormat_total) + @ np.diag(diag_inv_sqrt_total) + ) # get the masks - different for each replica so fine to call here tr_mask, vl_mask = masks.tr_masks[0], masks.vl_masks[0] @@ -564,19 +582,16 @@ def fitting_data_dict( @table -def diagonal_basis_rotation_table(output_path, _inv_covmat_prepared, diagonal_basis=True): +def fitting_covmat_table(output_path, _inv_covmat_prepared, diagonal_basis=True): """ - Save the diagonal basis rotation. 
+ Stores the fitting covariance matrix if diagonal_basis is False, else store the rotation matrix and eigenvalues """ + # TODO: JtH, check if this includes the theory covmat covmat, _, diagonal_rotation, eig_vals = _inv_covmat_prepared if not diagonal_basis: - - return None - - if diagonal_rotation is None or eig_vals is None: - log.warning("No diagonal-basis rotation available; skipping table export.") - return None + log.info("Saving fitting covmat") + return covmat df_rotation = pd.DataFrame(diagonal_rotation) df_rotation.insert(0, "eig_val", eig_vals) From b987466ec07a79037371e77c051c3925a6a7ebb1 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:47:55 +0100 Subject: [PATCH 13/29] caching inverse covmat in non diagonal basis --- validphys2/src/validphys/config.py | 16 ++++++++ validphys2/src/validphys/n3fit_data.py | 57 ++++++++++++-------------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 299a6f5093..ff7404e2b5 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -871,6 +871,22 @@ def produce_dataset_inputs_sampling_covmat( else: return covmats.dataset_inputs_exp_covmat + def produce_loaded_fit_covmat(self, output_path, data_input): + """ + Loads the theory covmat from the correct file according to how it + was generated by vp-setupfit. 
+ """ + + generic_path = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" + fit_covmat_path = output_path / "tables" / generic_path + fit_covmat = pd.read_csv( + fit_covmat_path, index_col=[0, 1, 2], header=[0, 1, 2], sep="\t|,", engine="python" + ).fillna(0) + # change ordering according to exp_covmat (so according to runcard order) + tmp = fit_covmat.droplevel(0, axis=0).droplevel(0, axis=1) + bb = [str(i) for i in data_input] + return tmp.reindex(index=bb, columns=bb, level=0).values + def produce_loaded_theory_covmat( self, output_path, diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index acb4b40ff5..ec21cdfd90 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -313,23 +313,36 @@ def fittable_datasets_masked(data): return validphys_group_extractor(data.datasets) -# def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_fitting_covmat) -> Hashrray: -# """Wrap the covmat into a Hashrray for caches to work""" -# return Hashrray(dataset_inputs_fitting_covmat) +def _hashed_dataset_inputs_fitting_covmat(loaded_fit_covmat) -> Hashrray: + """Wrap the covmat into a Hashrray for caches to work""" + return Hashrray(loaded_fit_covmat) -# @functools.lru_cache -def _inv_covmat_prepared(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagonal_basis=True): - """Returns the inverse covmats for training, validation and total +@functools.lru_cache +def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat): + """Returns the inverse covmats for training, validation and total when diagonal_basis = False""" + log.info( + f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}" + ) + covmat = _hashed_dataset_inputs_fitting_covmat.array + + diag_inv_sqrt_total = 1 / np.sqrt(np.diag(covmat)) + cormat_total = np.einsum("i, ij, j -> ij", diag_inv_sqrt_total, covmat, diag_inv_sqrt_total) + inv_total = ( + np.diag(diag_inv_sqrt_total) @ 
np.linalg.inv(cormat_total) @ np.diag(diag_inv_sqrt_total) + ) + + return inv_total + + +def _covmat_prepared(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagonal_basis=True): + """Returns the covmats for training, validation and total attending to the right masks and whether it is diagonal or not. s Since the masks and number of datapoints need to be treated for 1-point datasets it also returns the right ndata and masks for training and validation: """ - # log.info( - # f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}, diagonal_basis={diagonal_basis}" - # ) # TODO: JtH, note to self: inv_covmat_prepared can no longer be called during n3fit because nnfit_theory_covmat is in a different namespace @@ -356,17 +369,7 @@ def _inv_covmat_prepared(dataset_inputs_fitting_covmat, nnfit_theory_covmat, dia inv_total = np.diag(1 / eig_vals) - else: - - diag_inv_sqrt_total = 1 / np.sqrt(np.diag(covmat)) - cormat_total = np.einsum("i, ij, j -> ij", diag_inv_sqrt_total, covmat, diag_inv_sqrt_total) - inv_total = ( - np.diag(diag_inv_sqrt_total) - @ np.linalg.inv(cormat_total) - @ np.diag(diag_inv_sqrt_total) - ) - - return (covmat, inv_total, diagonal_rotation, eig_vals) + return covmat, diagonal_rotation, eig_vals def fitting_data_dict( @@ -374,6 +377,7 @@ def fitting_data_dict( make_replica, dataset_inputs_loaded_cd_with_cuts, masks, + _inv_covmat_prepared, kfold_masks, fittable_datasets_masked, output_path, @@ -447,14 +451,7 @@ def fitting_data_dict( ).values diag_rot = None eig_vals = None - # TODO: cache this so that is reused - diag_inv_sqrt_total = 1 / np.sqrt(np.diag(covmat)) - cormat_total = np.einsum("i, ij, j -> ij", diag_inv_sqrt_total, covmat, diag_inv_sqrt_total) - inv_true = ( - np.diag(diag_inv_sqrt_total) - @ np.linalg.inv(cormat_total) - @ np.diag(diag_inv_sqrt_total) - ) + inv_true = _inv_covmat_prepared # get the masks - different for each replica so fine to call here tr_mask, vl_mask = masks.tr_masks[0], 
masks.vl_masks[0] @@ -582,12 +579,12 @@ def fitting_data_dict( @table -def fitting_covmat_table(output_path, _inv_covmat_prepared, diagonal_basis=True): +def fitting_covmat_table(output_path, _covmat_prepared, diagonal_basis=True): """ Stores the fitting covariance matrix if diagonal_basis is False, else store the rotation matrix and eigenvalues """ # TODO: JtH, check if this includes the theory covmat - covmat, _, diagonal_rotation, eig_vals = _inv_covmat_prepared + covmat, diagonal_rotation, eig_vals = _covmat_prepared if not diagonal_basis: log.info("Saving fitting covmat") From c55f697eab1be79bbd4b3a3ed9b6a173e21c52a0 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:32:58 +0100 Subject: [PATCH 14/29] cleaning --- validphys2/src/validphys/config.py | 24 ++++--- validphys2/src/validphys/n3fit_data.py | 88 ++++++++++++++------------ 2 files changed, 65 insertions(+), 47 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index ff7404e2b5..9210433255 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -8,6 +8,7 @@ import numbers import pathlib +import numpy as np import pandas as pd from nnpdf_data import legacy_to_new_map @@ -871,7 +872,7 @@ def produce_dataset_inputs_sampling_covmat( else: return covmats.dataset_inputs_exp_covmat - def produce_loaded_fit_covmat(self, output_path, data_input): + def produce_loaded_fit_covmat(self, output_path, data_input, diagonal_basis=True): """ Loads the theory covmat from the correct file according to how it was generated by vp-setupfit. 
@@ -879,13 +880,20 @@ def produce_loaded_fit_covmat(self, output_path, data_input): generic_path = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" fit_covmat_path = output_path / "tables" / generic_path - fit_covmat = pd.read_csv( - fit_covmat_path, index_col=[0, 1, 2], header=[0, 1, 2], sep="\t|,", engine="python" - ).fillna(0) - # change ordering according to exp_covmat (so according to runcard order) - tmp = fit_covmat.droplevel(0, axis=0).droplevel(0, axis=1) - bb = [str(i) for i in data_input] - return tmp.reindex(index=bb, columns=bb, level=0).values + if not diagonal_basis: + fit_covmat = pd.read_csv( + fit_covmat_path, index_col=[0, 1, 2], header=[0, 1, 2], sep="\t|,", engine="python" + ).fillna(0) + # change ordering according to exp_covmat (so according to runcard order) + tmp = fit_covmat.droplevel(0, axis=0).droplevel(0, axis=1) + bb = [str(i) for i in data_input] + return tmp.reindex(index=bb, columns=bb, level=0).values + else: + eigensystem = pd.read_csv( + fit_covmat_path, index_col=[0], header=[0], sep="\t|,", engine="python" + ) + + return np.diag(eigensystem["eig_val"].values) def produce_loaded_theory_covmat( self, diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index ec21cdfd90..232e68355e 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -319,29 +319,62 @@ def _hashed_dataset_inputs_fitting_covmat(loaded_fit_covmat) -> Hashrray: @functools.lru_cache -def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat): +def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, diagonal_basis=True): """Returns the inverse covmats for training, validation and total when diagonal_basis = False""" log.info( f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}" ) covmat = _hashed_dataset_inputs_fitting_covmat.array + diag_rot = None + eig_vals = None - diag_inv_sqrt_total = 1 / 
np.sqrt(np.diag(covmat)) - cormat_total = np.einsum("i, ij, j -> ij", diag_inv_sqrt_total, covmat, diag_inv_sqrt_total) - inv_total = ( - np.diag(diag_inv_sqrt_total) @ np.linalg.inv(cormat_total) @ np.diag(diag_inv_sqrt_total) - ) + if diagonal_basis: + diagonal_basis_saved = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" + path_diagonal_basis = output_path / "tables" / diagonal_basis_saved + eigensystem = pd.read_csv( + path_diagonal_basis, index_col=[0], header=[0], sep="\t|,", engine="python" + ) + diag_rot = eigensystem.iloc[:, 1:].values + eig_vals = eigensystem["eig_val"].values + inv_total = np.diag(1 / eig_vals) + + else: + diag_inv_sqrt_total = 1 / np.sqrt(np.diag(covmat)) + cormat_total = np.einsum("i, ij, j -> ij", diag_inv_sqrt_total, covmat, diag_inv_sqrt_total) + inv_total = ( + np.diag(diag_inv_sqrt_total) + @ np.linalg.inv(cormat_total) + @ np.diag(diag_inv_sqrt_total) + ) - return inv_total + return covmat, inv_total, diag_rot, eig_vals -def _covmat_prepared(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagonal_basis=True): - """Returns the covmats for training, validation and total - attending to the right masks and whether it is diagonal or not. - s - Since the masks and number of datapoints need to be treated for 1-point datasets - it also returns the right ndata and masks for training and validation: +def _fiting_covmat(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagonal_basis=True): + """Prepare the fitting covariance matrix by optionally adding theory contributions + and transforming to diagonal basis. + Parameters + ---------- + dataset_inputs_fitting_covmat : np.ndarray + The experimental covariance matrix from the datasets. + nnfit_theory_covmat : np.ndarray or None + The theory covariance matrix to add to the experimental covmat. + If None, only the experimental covmat is used. 
+ diagonal_basis : bool, optional + If True, transform the covariance matrix to diagonal basis by extracting + eigenvalues and eigenvectors of the correlation matrix. Default is True. + + Returns + ------- + covmat : np.ndarray + The prepared covariance matrix (sum of experimental and theory covmats). + diagonal_rotation : np.ndarray or None + The rotation matrix (transposed eigenvectors) to transform data to diagonal basis. + Only returned if diagonal_basis=True, otherwise None. + eig_vals : np.ndarray or None + The eigenvalues of the correlation matrix in diagonal basis. + Only returned if diagonal_basis=True, otherwise None. """ # TODO: JtH, note to self: inv_covmat_prepared can no longer be called during n3fit because nnfit_theory_covmat is in a different namespace @@ -365,10 +398,6 @@ def _covmat_prepared(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagona uT = np.einsum("i, ik -> ik", diag_inv_sqrt, uT) diagonal_rotation = uT.T - ndata = len(eig_vals) - - inv_total = np.diag(1 / eig_vals) - return covmat, diagonal_rotation, eig_vals @@ -432,26 +461,7 @@ def fitting_data_dict( fittable_datasets = fittable_datasets_masked # load covmat stored at the time of vp-setupfit - if diagonal_basis: - diagonal_basis_saved = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" - path_diagonal_basis = output_path / "tables" / diagonal_basis_saved - eigensystem = pd.read_csv( - path_diagonal_basis, index_col=[0], header=[0], sep="\t|,", engine="python" - ) - diag_rot = eigensystem.iloc[:, 1:].values - eig_vals = eigensystem["eig_val"].values - inv_true = np.diag(1 / eig_vals) - covmat = np.diag(eig_vals) - else: - covmat_saved = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" - path_covmat = output_path / "tables" / covmat_saved - # TODO: check if it is save to convert to numpy already at this stage, is the indexing consistent? 
- covmat = pd.read_csv( - path_covmat, index_col=[0, 1, 2], header=[0, 1, 2], sep="\t|,", engine="python" - ).values - diag_rot = None - eig_vals = None - inv_true = _inv_covmat_prepared + covmat, inv_true, diag_rot, eig_vals = _inv_covmat_prepared # get the masks - different for each replica so fine to call here tr_mask, vl_mask = masks.tr_masks[0], masks.vl_masks[0] @@ -579,12 +589,12 @@ def fitting_data_dict( @table -def fitting_covmat_table(output_path, _covmat_prepared, diagonal_basis=True): +def fitting_covmat_table(output_path, _fiting_covmat, diagonal_basis=True): """ Stores the fitting covariance matrix if diagonal_basis is False, else store the rotation matrix and eigenvalues """ # TODO: JtH, check if this includes the theory covmat - covmat, diagonal_rotation, eig_vals = _covmat_prepared + covmat, diagonal_rotation, eig_vals = _fiting_covmat if not diagonal_basis: log.info("Saving fitting covmat") From df49a5bf21392ea42aef901f2cd3f688921702e4 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:49:40 +0100 Subject: [PATCH 15/29] fixing covmat by reference issue --- n3fit/src/n3fit/scripts/vp_setupfit.py | 4 +--- validphys2/src/validphys/config.py | 11 +++++++---- validphys2/src/validphys/covmats.py | 12 ++++++++++-- validphys2/src/validphys/n3fit_data.py | 8 +------- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 6c54139653..987a2de368 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -174,7 +174,7 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest - SETUPFIT_FIXED_CONFIG['actions_'] += [check_n3fit_action, filter_action] + SETUPFIT_FIXED_CONFIG['actions_'] += [check_n3fit_action, filter_action, rotation_action] # 
Check theory covariance matrix configuration thconfig = file_content.get('theorycovmatconfig', {}) @@ -190,8 +190,6 @@ def from_yaml(cls, o, *args, **kwargs): 'datacuts::theory::theorycovmatconfig nnfit_theory_covmat' ) - SETUPFIT_FIXED_CONFIG['actions_'] += [rotation_action] - # Check fiatlux configuration fiatlux = file_content.get('fiatlux') if fiatlux is not None: diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 9210433255..a967697314 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -806,16 +806,18 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } @configparser.explicit_node - def produce_dataset_inputs_fitting_covmat(self): + def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=False): """ Produces the correct covmat to be used in fitting_data_dict according to some options: whether to include the theory covmat, whether to separate the multiplcative errors and whether to compute the experimental covmat using the t0 prescription. """ - # TODO update docstring + from validphys import covmats + if use_thcovmat_in_fitting: + return covmats.dataset_inputs_t0_total_covmat return covmats.dataset_inputs_t0_exp_covmat def produce_sep_mult(self, separate_multiplicative=False): @@ -901,8 +903,8 @@ def produce_loaded_theory_covmat( data_input, user_covmat_path=None, point_prescriptions=None, - use_thcovmat_in_sampling=True, - use_thcovmat_in_fitting=True, + use_thcovmat_in_sampling=False, + use_thcovmat_in_fitting=False, ): """ Loads the theory covmat from the correct file according to how it @@ -1352,6 +1354,7 @@ def produce_nnfit_theory_covmat( This function is only used in vp-setupfit to store the necessary covmats as .csv files in the tables directory. 
""" + if point_prescriptions is not None: if user_covmat_path is not None: # Both scalevar and user uncertainties diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 195371ad5d..c8a06d9163 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -472,13 +472,21 @@ def dataset_inputs_exp_covmat_separate( return covmat -def dataset_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, loaded_theory_covmat): +def dataset_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, nnfit_theory_covmat, data): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. """ - return dataset_inputs_t0_exp_covmat + loaded_theory_covmat + tmp = nnfit_theory_covmat.droplevel(0, axis=0).droplevel(0, axis=1) + # old to new names mapping + new_names = {d[0]: legacy_to_new_map(d[0])[0] for d in tmp.index} + tmp = tmp.rename(columns=new_names, index=new_names, level=0) + # reorder + bb = [str(i) for i in data] + theory_covmat = tmp.reindex(index=bb, columns=bb, level=0).values + + return dataset_inputs_t0_exp_covmat + theory_covmat def dataset_inputs_t0_exp_covmat( diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 232e68355e..f6e43f3337 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -350,7 +350,7 @@ def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, dia return covmat, inv_total, diag_rot, eig_vals -def _fiting_covmat(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagonal_basis=True): +def _fiting_covmat(dataset_inputs_fitting_covmat, diagonal_basis=True): """Prepare the fitting covariance matrix by optionally adding theory contributions and 
transforming to diagonal basis. @@ -377,15 +377,9 @@ def _fiting_covmat(dataset_inputs_fitting_covmat, nnfit_theory_covmat, diagonal_ Only returned if diagonal_basis=True, otherwise None. """ - # TODO: JtH, note to self: inv_covmat_prepared can no longer be called during n3fit because nnfit_theory_covmat is in a different namespace - covmat = dataset_inputs_fitting_covmat - if nnfit_theory_covmat is not None: - covmat += nnfit_theory_covmat - diagonal_rotation = None eig_vals = None - if diagonal_basis: log.info("working in diagonal basis.") From 300963865fedb66dd34c48cc17debed272b8b5ab Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Thu, 30 Apr 2026 16:42:01 +0100 Subject: [PATCH 16/29] n3fit runs again --- validphys2/src/validphys/config.py | 60 ++++++++++++++------------ validphys2/src/validphys/covmats.py | 27 +++++------- validphys2/src/validphys/n3fit_data.py | 12 +++--- validphys2/src/validphys/pseudodata.py | 7 ++- 4 files changed, 54 insertions(+), 52 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index a967697314..a8027dce62 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -806,7 +806,7 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } @configparser.explicit_node - def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=False): + def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=True): """ Produces the correct covmat to be used in fitting_data_dict according to some options: whether to include the theory covmat, whether to @@ -827,7 +827,11 @@ def produce_sep_mult(self, separate_multiplicative=False): @configparser.explicit_node def produce_dataset_inputs_sampling_covmat( - self, sep_mult=False, use_thcovmat_in_sampling=False, use_t0_sampling=True + self, + loaded_fit_covmat, + sep_mult=False, + use_thcovmat_in_sampling=False, + 
use_t0_sampling=True, ): """ Produces the correct MC replica method sampling covmat to be used in @@ -850,36 +854,38 @@ def produce_dataset_inputs_sampling_covmat( """ from validphys import covmats - if use_t0_sampling: - if use_thcovmat_in_sampling: - if sep_mult: - return covmats.dataset_inputs_t0_total_covmat_separate - else: - return covmats.dataset_inputs_t0_total_covmat - else: - if sep_mult: - return covmats.dataset_inputs_t0_exp_covmat_separate - else: - return covmats.dataset_inputs_t0_exp_covmat - - else: - if use_thcovmat_in_sampling: - if sep_mult: - return covmats.dataset_inputs_total_covmat_separate - else: - return covmats.dataset_inputs_total_covmat - else: - if sep_mult: - return covmats.dataset_inputs_exp_covmat_separate - else: - return covmats.dataset_inputs_exp_covmat + return loaded_fit_covmat + + # if use_t0_sampling: + # if use_thcovmat_in_sampling: + # if sep_mult: + # return covmats.dataset_inputs_t0_total_covmat_separate + # else: + # return covmats.dataset_inputs_t0_total_covmat + # else: + # if sep_mult: + # return covmats.dataset_inputs_t0_exp_covmat_separate + # else: + # return covmats.dataset_inputs_t0_exp_covmat + # + # else: + # if use_thcovmat_in_sampling: + # if sep_mult: + # return covmats.dataset_inputs_total_covmat_separate + # else: + # return covmats.dataset_inputs_total_covmat + # else: + # if sep_mult: + # return covmats.dataset_inputs_exp_covmat_separate + # else: + # return covmats.dataset_inputs_exp_covmat def produce_loaded_fit_covmat(self, output_path, data_input, diagonal_basis=True): """ - Loads the theory covmat from the correct file according to how it - was generated by vp-setupfit. 
+ Loads the fit covmat (including the theory covmat if present) as saved during vp-setupfit """ + # the csv includes both the experimental and theory covmat (if requested in the runcard) generic_path = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" fit_covmat_path = output_path / "tables" / generic_path if not diagonal_basis: diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index c8a06d9163..fdbbc7315a 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -225,6 +225,7 @@ def dataset_inputs_covmat_from_systematics( covmat = (covmat / sqrt_weights).T / sqrt_weights if norm_threshold is not None: covmat = regularize_covmat(covmat, norm_threshold=norm_threshold) + return covmat @@ -406,14 +407,14 @@ def dataset_inputs_t0_covmat_from_systematics( def dataset_inputs_t0_total_covmat_separate( - dataset_inputs_t0_exp_covmat_separate, loaded_theory_covmat + dataset_inputs_t0_exp_covmat_separate, nnfit_theory_covmat ): """ Function to compute the covmat to be used for the sampling by make_replica. In this case the t0 prescription is used for the experimental covmat and the multiplicative errors are separated. Moreover, the theory covmat is added to experimental covmat. """ - return dataset_inputs_t0_exp_covmat_separate + loaded_theory_covmat + return dataset_inputs_t0_exp_covmat_separate + nnfit_theory_covmat def dataset_inputs_t0_exp_covmat_separate( @@ -440,13 +441,13 @@ def dataset_inputs_t0_exp_covmat_separate( return covmat -def dataset_inputs_total_covmat_separate(dataset_inputs_exp_covmat_separate, loaded_theory_covmat): +def dataset_inputs_total_covmat_separate(dataset_inputs_exp_covmat_separate, nnfit_theory_covmat): """ Function to compute the covmat to be used for the sampling by make_replica. In this case the t0 prescription is not used for the experimental covmat and the multiplicative errors are separated. Moreover, the theory covmat is added to experimental covmat. 
""" - return dataset_inputs_exp_covmat_separate + loaded_theory_covmat + return dataset_inputs_exp_covmat_separate + nnfit_theory_covmat def dataset_inputs_exp_covmat_separate( @@ -472,21 +473,14 @@ def dataset_inputs_exp_covmat_separate( return covmat -def dataset_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, nnfit_theory_covmat, data): +def dataset_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, nnfit_theory_covmat): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. """ - tmp = nnfit_theory_covmat.droplevel(0, axis=0).droplevel(0, axis=1) - # old to new names mapping - new_names = {d[0]: legacy_to_new_map(d[0])[0] for d in tmp.index} - tmp = tmp.rename(columns=new_names, index=new_names, level=0) - # reorder - bb = [str(i) for i in data] - theory_covmat = tmp.reindex(index=bb, columns=bb, level=0).values - return dataset_inputs_t0_exp_covmat + theory_covmat + return dataset_inputs_t0_exp_covmat + nnfit_theory_covmat def dataset_inputs_t0_exp_covmat( @@ -496,6 +490,7 @@ def dataset_inputs_t0_exp_covmat( use_weights_in_covmat=True, norm_threshold=None, dataset_inputs_t0_predictions, + data_index, ): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 @@ -510,16 +505,16 @@ def dataset_inputs_t0_exp_covmat( dataset_inputs_t0_predictions, False, ) - return covmat + return pd.DataFrame(covmat, index=data_index, columns=data_index) -def dataset_inputs_total_covmat(dataset_inputs_exp_covmat, loaded_theory_covmat): +def dataset_inputs_total_covmat(dataset_inputs_exp_covmat, nnfit_theory_covmat): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 by fitting_data_dict. 
In this case the t0 prescription is not used for the experimental covmat and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. """ - return dataset_inputs_exp_covmat + loaded_theory_covmat + return dataset_inputs_exp_covmat + nnfit_theory_covmat def dataset_inputs_exp_covmat( diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index f6e43f3337..a1611c179a 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -171,12 +171,12 @@ def __init__(self, group_name, seed, tr_masks, vl_masks): super().__init__(group_name, seed) -def diagonal_masks(data, replica_trvlseed, dataset_inputs_fitting_covmat, diagonal_frac=0.75): +def diagonal_masks(data, replica_trvlseed, loaded_fit_covmat, diagonal_frac=0.75): nameseed = int(hashlib.sha256(str(data).encode()).hexdigest(), 16) % 10**8 nameseed += replica_trvlseed rng = np.random.Generator(np.random.PCG64(nameseed)) - ndata = len(dataset_inputs_fitting_covmat) + ndata = len(loaded_fit_covmat) # construct training mask by selecting a fraction of the eigenvalues trmax = int(ndata * diagonal_frac) @@ -350,7 +350,9 @@ def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, dia return covmat, inv_total, diag_rot, eig_vals -def _fiting_covmat(dataset_inputs_fitting_covmat, diagonal_basis=True): +def _fiting_covmat( + dataset_inputs_fitting_covmat, nnfit_theory_covmat, data_input, diagonal_basis=True +): """Prepare the fitting covariance matrix by optionally adding theory contributions and transforming to diagonal basis. 
@@ -380,6 +382,7 @@ def _fiting_covmat(dataset_inputs_fitting_covmat, diagonal_basis=True): covmat = dataset_inputs_fitting_covmat diagonal_rotation = None eig_vals = None + if diagonal_basis: log.info("working in diagonal basis.") @@ -583,11 +586,10 @@ def fitting_data_dict( @table -def fitting_covmat_table(output_path, _fiting_covmat, diagonal_basis=True): +def fitting_covmat_table(output_path, _fiting_covmat, data_index, diagonal_basis=True): """ Stores the fitting covariance matrix if diagonal_basis is False, else store the rotation matrix and eigenvalues """ - # TODO: JtH, check if this includes the theory covmat covmat, diagonal_rotation, eig_vals = _fiting_covmat if not diagonal_basis: diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 8f1eca10e8..4870fa0afe 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -123,7 +123,7 @@ def read_replica_pseudodata(fit, context_index, replica): def make_replica( central_values_array, group_replica_mcseed, - dataset_inputs_sampling_covmat, + loaded_fit_covmat, group_multiplicative_errors=None, group_positivity_mask=None, sep_mult=False, @@ -195,8 +195,7 @@ def make_replica( # Set random seed rng = np.random.default_rng(seed=group_replica_mcseed) # construct covmat - covmat = dataset_inputs_sampling_covmat - covmat_sqrt = sqrt_covmat(covmat) + covmat_sqrt = sqrt_covmat(loaded_fit_covmat) full_mask = ( group_positivity_mask @@ -224,7 +223,7 @@ def make_replica( mult_shifts.append(mult_shift) # If sep_mult is true then the multiplicative shifts were not included in the covmat - shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1]) + shifts = covmat_sqrt @ rng.normal(size=loaded_fit_covmat.shape[1]) mult_part = 1.0 if sep_mult: special_mult_errors = group_multiplicative_errors["special_mult"] From 14c4b8ee5d67a01286e3679ddcfcb908af79e4dd Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: 
Thu, 30 Apr 2026 17:01:26 +0100 Subject: [PATCH 17/29] extending number of headers --- validphys2/src/validphys/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index a8027dce62..79c14787c1 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -890,7 +890,11 @@ def produce_loaded_fit_covmat(self, output_path, data_input, diagonal_basis=True fit_covmat_path = output_path / "tables" / generic_path if not diagonal_basis: fit_covmat = pd.read_csv( - fit_covmat_path, index_col=[0, 1, 2], header=[0, 1, 2], sep="\t|,", engine="python" + fit_covmat_path, + index_col=[0, 1, 2, 3], + header=[0, 1, 2, 3], + sep="\t|,", + engine="python", ).fillna(0) # change ordering according to exp_covmat (so according to runcard order) tmp = fit_covmat.droplevel(0, axis=0).droplevel(0, axis=1) From 2559764b14374a45d6bc347d594430298ae0ecdd Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Fri, 1 May 2026 08:52:11 +0100 Subject: [PATCH 18/29] passing covmat in data basis to make_replica for correct sampling --- validphys2/src/validphys/config.py | 82 -------------------------- validphys2/src/validphys/covmats.py | 1 - validphys2/src/validphys/n3fit_data.py | 13 ++-- validphys2/src/validphys/pseudodata.py | 6 +- 4 files changed, 10 insertions(+), 92 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 79c14787c1..632b477b73 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -825,88 +825,6 @@ def produce_sep_mult(self, separate_multiplicative=False): return False return True - @configparser.explicit_node - def produce_dataset_inputs_sampling_covmat( - self, - loaded_fit_covmat, - sep_mult=False, - use_thcovmat_in_sampling=False, - use_t0_sampling=True, - ): - """ - Produces the correct MC replica method sampling covmat to 
be used in - make_replica according to some options: whether to sample using a t0 - covariance matrix, include the theory covmat and whether to - separate the multiplcative errors. - - Parameters - ---------- - sep_mult : bool, default=False - Whether to separate the multiplicative errors. - use_thcovmat_in_sampling : bool, default=False - Whether to include the theory covariance matrix. - use_t0_sampling : bool, default=True - Whether to sample using a t0 covariance matrix. - - Returns - ------- - Callable - """ - from validphys import covmats - - return loaded_fit_covmat - - # if use_t0_sampling: - # if use_thcovmat_in_sampling: - # if sep_mult: - # return covmats.dataset_inputs_t0_total_covmat_separate - # else: - # return covmats.dataset_inputs_t0_total_covmat - # else: - # if sep_mult: - # return covmats.dataset_inputs_t0_exp_covmat_separate - # else: - # return covmats.dataset_inputs_t0_exp_covmat - # - # else: - # if use_thcovmat_in_sampling: - # if sep_mult: - # return covmats.dataset_inputs_total_covmat_separate - # else: - # return covmats.dataset_inputs_total_covmat - # else: - # if sep_mult: - # return covmats.dataset_inputs_exp_covmat_separate - # else: - # return covmats.dataset_inputs_exp_covmat - - def produce_loaded_fit_covmat(self, output_path, data_input, diagonal_basis=True): - """ - Loads the fit covmat (including the theory covmat if present) as saved during vp-setupfit - """ - - # the csv includes both the experimental and theory covmat (if requested in the runcard) - generic_path = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" - fit_covmat_path = output_path / "tables" / generic_path - if not diagonal_basis: - fit_covmat = pd.read_csv( - fit_covmat_path, - index_col=[0, 1, 2, 3], - header=[0, 1, 2, 3], - sep="\t|,", - engine="python", - ).fillna(0) - # change ordering according to exp_covmat (so according to runcard order) - tmp = fit_covmat.droplevel(0, axis=0).droplevel(0, axis=1) - bb = [str(i) for i in data_input] - 
return tmp.reindex(index=bb, columns=bb, level=0).values - else: - eigensystem = pd.read_csv( - fit_covmat_path, index_col=[0], header=[0], sep="\t|,", engine="python" - ) - - return np.diag(eigensystem["eig_val"].values) - def produce_loaded_theory_covmat( self, output_path, diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index fdbbc7315a..4a802be215 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -479,7 +479,6 @@ def dataset_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, nnfit_theory_co by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. """ - return dataset_inputs_t0_exp_covmat + nnfit_theory_covmat diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index a1611c179a..de04aaa43c 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -171,12 +171,12 @@ def __init__(self, group_name, seed, tr_masks, vl_masks): super().__init__(group_name, seed) -def diagonal_masks(data, replica_trvlseed, loaded_fit_covmat, diagonal_frac=0.75): +def diagonal_masks(data, replica_trvlseed, dataset_inputs_covmat_t0_considered, diagonal_frac=0.75): nameseed = int(hashlib.sha256(str(data).encode()).hexdigest(), 16) % 10**8 nameseed += replica_trvlseed rng = np.random.Generator(np.random.PCG64(nameseed)) - ndata = len(loaded_fit_covmat) + ndata = len(dataset_inputs_covmat_t0_considered) # construct training mask by selecting a fraction of the eigenvalues trmax = int(ndata * diagonal_frac) @@ -313,9 +313,9 @@ def fittable_datasets_masked(data): return validphys_group_extractor(data.datasets) -def _hashed_dataset_inputs_fitting_covmat(loaded_fit_covmat) -> Hashrray: +def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_covmat_t0_considered) -> Hashrray: """Wrap the 
covmat into a Hashrray for caches to work""" - return Hashrray(loaded_fit_covmat) + return Hashrray(dataset_inputs_covmat_t0_considered) @functools.lru_cache @@ -324,7 +324,7 @@ def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, dia log.info( f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}" ) - covmat = _hashed_dataset_inputs_fitting_covmat.array + diag_rot = None eig_vals = None @@ -336,9 +336,10 @@ def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, dia ) diag_rot = eigensystem.iloc[:, 1:].values eig_vals = eigensystem["eig_val"].values + covmat = np.diag(eig_vals) inv_total = np.diag(1 / eig_vals) - else: + covmat = _hashed_dataset_inputs_fitting_covmat.array diag_inv_sqrt_total = 1 / np.sqrt(np.diag(covmat)) cormat_total = np.einsum("i, ij, j -> ij", diag_inv_sqrt_total, covmat, diag_inv_sqrt_total) inv_total = ( diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 4870fa0afe..79fd4a27c5 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -123,7 +123,7 @@ def read_replica_pseudodata(fit, context_index, replica): def make_replica( central_values_array, group_replica_mcseed, - loaded_fit_covmat, + dataset_inputs_covmat_t0_considered, group_multiplicative_errors=None, group_positivity_mask=None, sep_mult=False, @@ -195,7 +195,7 @@ def make_replica( # Set random seed rng = np.random.default_rng(seed=group_replica_mcseed) # construct covmat - covmat_sqrt = sqrt_covmat(loaded_fit_covmat) + covmat_sqrt = sqrt_covmat(dataset_inputs_covmat_t0_considered) full_mask = ( group_positivity_mask @@ -223,7 +223,7 @@ def make_replica( mult_shifts.append(mult_shift) # If sep_mult is true then the multiplicative shifts were not included in the covmat - shifts = covmat_sqrt @ rng.normal(size=loaded_fit_covmat.shape[1]) + shifts = covmat_sqrt @ 
rng.normal(size=dataset_inputs_covmat_t0_considered.shape[1]) mult_part = 1.0 if sep_mult: special_mult_errors = group_multiplicative_errors["special_mult"] From 1e800c0d4a37108ecb2e73435d223397b14b69e1 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Fri, 1 May 2026 11:31:40 +0100 Subject: [PATCH 19/29] indexing by process group --- validphys2/src/validphys/covmats.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 4a802be215..5443b00b0a 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -479,6 +479,7 @@ def dataset_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, nnfit_theory_co by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. """ + return dataset_inputs_t0_exp_covmat + nnfit_theory_covmat @@ -489,7 +490,7 @@ def dataset_inputs_t0_exp_covmat( use_weights_in_covmat=True, norm_threshold=None, dataset_inputs_t0_predictions, - data_index, + procs_index_matched, ): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 @@ -504,7 +505,8 @@ def dataset_inputs_t0_exp_covmat( dataset_inputs_t0_predictions, False, ) - return pd.DataFrame(covmat, index=data_index, columns=data_index) + # TODO: how to know for sure if the index matches the covmat value ordering? 
+ return pd.DataFrame(covmat, index=procs_index_matched, columns=procs_index_matched) def dataset_inputs_total_covmat(dataset_inputs_exp_covmat, nnfit_theory_covmat): From 7123c15a6fae660986e93ab819a7a3f0cbed243e Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Fri, 1 May 2026 11:48:15 +0100 Subject: [PATCH 20/29] make rotation action condition on presence of theorycovmatconfig in runcard --- n3fit/src/n3fit/scripts/vp_setupfit.py | 5 +++- validphys2/src/validphys/config.py | 2 +- validphys2/src/validphys/n3fit_data.py | 34 ++++++++++++++++++++------ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 987a2de368..5ffad64e60 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -170,7 +170,10 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::closuretest::fitting n3fit_checks_action' else: filter_action = 'datacuts::theory::fitting filter' - rotation_action = 'datacuts::theory::theorycovmatconfig fitting_covmat_table' + if file_content.get('theorycovmatconfig') is not None: + rotation_action = 'datacuts::theory::theorycovmatconfig fitting_covmat_table' + else: + rotation_action = 'datacuts::theory fitting_covmat_table' check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 632b477b73..858cfb024f 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -806,7 +806,7 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } @configparser.explicit_node - def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=True): + def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=False): """ 
Produces the correct covmat to be used in fitting_data_dict according to some options: whether to include the theory covmat, whether to diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index de04aaa43c..74aecab899 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -320,7 +320,32 @@ def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_covmat_t0_considered) - @functools.lru_cache def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, diagonal_basis=True): - """Returns the inverse covmats for training, validation and total when diagonal_basis = False""" + """Prepare the covariance matrix and its inverse, optionally transforming to diagonal basis. + + Parameters + ---------- + _hashed_dataset_inputs_fitting_covmat : Hashrray + Wrapped covariance matrix for caching purposes. + output_path : Path + Path to output directory containing diagonal basis data if needed. + diagonal_basis : bool, optional + If True, load pre-computed diagonal basis from tables. If False, use the + covariance matrix directly and compute the inverse via correlation matrix + inversion. Default is True. + + Returns + ------- + covmat : np.ndarray + The covariance matrix (diagonal if diagonal_basis=True, full otherwise). + inv_total : np.ndarray + The inverse of the covariance matrix. + diag_rot : np.ndarray or None + The rotation matrix (eigenvectors transposed) for diagonal basis transformation. + Only returned when diagonal_basis=True, otherwise None. + eig_vals : np.ndarray or None + The eigenvalues of the correlation matrix. Only returned when + diagonal_basis=True, otherwise None. 
+ """ log.info( f"_inv_covmat_prepared called with covmat hash={hash(_hashed_dataset_inputs_fitting_covmat)}" ) @@ -351,9 +376,7 @@ def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, dia return covmat, inv_total, diag_rot, eig_vals -def _fiting_covmat( - dataset_inputs_fitting_covmat, nnfit_theory_covmat, data_input, diagonal_basis=True -): +def _fiting_covmat(dataset_inputs_fitting_covmat, data_input, diagonal_basis=True): """Prepare the fitting covariance matrix by optionally adding theory contributions and transforming to diagonal basis. @@ -361,9 +384,6 @@ def _fiting_covmat( ---------- dataset_inputs_fitting_covmat : np.ndarray The experimental covariance matrix from the datasets. - nnfit_theory_covmat : np.ndarray or None - The theory covariance matrix to add to the experimental covmat. - If None, only the experimental covmat is used. diagonal_basis : bool, optional If True, transform the covariance matrix to diagonal basis by extracting eigenvalues and eigenvectors of the correlation matrix. Default is True. 
From 6b02b685528684f7751477e66e95cc7eaa9e66cc Mon Sep 17 00:00:00 2001 From: Jaco ter Hoeve <54140851+jacoterh@users.noreply.github.com> Date: Fri, 1 May 2026 12:31:28 +0100 Subject: [PATCH 21/29] Update validphys2/src/validphys/config.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- validphys2/src/validphys/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 858cfb024f..d131e442e4 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -8,7 +8,6 @@ import numbers import pathlib -import numpy as np import pandas as pd from nnpdf_data import legacy_to_new_map From 0597efe73df73f22a80d4ba7e2156ced55ccb07e Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Fri, 1 May 2026 13:08:23 +0100 Subject: [PATCH 22/29] making csv file name theorycovmat dependent + no longer modify global setupfitconfig --- n3fit/src/n3fit/scripts/vp_setupfit.py | 24 ++++++++++++++---------- validphys2/src/validphys/n3fit_data.py | 12 ++++++++++-- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 5ffad64e60..0e4e83cd27 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -25,6 +25,7 @@ # top. 
+import copy import hashlib import logging import pathlib @@ -142,6 +143,9 @@ class SetupFitConfig(Config): @classmethod def from_yaml(cls, o, *args, **kwargs): + # Create a fresh copy of the fixed config to avoid in-place modifications + fixed_config = copy.deepcopy(SETUPFIT_FIXED_CONFIG) + try: file_content = yaml_safe.load(o) except error.YAMLError as e: @@ -157,10 +161,10 @@ def from_yaml(cls, o, *args, **kwargs): # Use faketheoryid to create the L0 data to be stored into the filter folder # (L1 data is stored if fakedata is True) if 'faketheoryid' in closuredict: - # make sure theory key exists in SETUPFIT_FIXED_CONFIG - SETUPFIT_FIXED_CONFIG.setdefault('theory', {}) + # make sure theory key exists in fixed_config + fixed_config.setdefault('theory', {}) # overwrite theoryid with the faketheoryid - SETUPFIT_FIXED_CONFIG['theory']['theoryid'] = closuredict['faketheoryid'] + fixed_config['theory']['theoryid'] = closuredict['faketheoryid'] # download theoryid since it will be used in the fit try: loader.check_theoryID(file_content['theory']['theoryid']) @@ -177,7 +181,7 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' # The settings for these actions depend on the presence of closuretest - SETUPFIT_FIXED_CONFIG['actions_'] += [check_n3fit_action, filter_action, rotation_action] + fixed_config['actions_'] += [check_n3fit_action, filter_action, rotation_action] # Check theory covariance matrix configuration thconfig = file_content.get('theorycovmatconfig', {}) @@ -189,7 +193,7 @@ def from_yaml(cls, o, *args, **kwargs): "`point_prescriptions: ['9 point', '3 point']`" ) if thconfig: - SETUPFIT_FIXED_CONFIG['actions_'].append( + fixed_config['actions_'].append( 'datacuts::theory::theorycovmatconfig nnfit_theory_covmat' ) @@ -219,14 +223,14 @@ def from_yaml(cls, o, *args, **kwargs): if compute_in_setupfit: log.info("Forcing photon computation with FiatLux during setupfit.") # Since the photon will be 
computed, check that the luxset and additional_errors exist - SETUPFIT_FIXED_CONFIG['actions_'].append('fiatlux check_luxset_exists') + fixed_config['actions_'].append('fiatlux check_luxset_exists') if fiatlux.get("additional_errors"): - SETUPFIT_FIXED_CONFIG['actions_'].append('fiatlux check_additional_errors') - SETUPFIT_FIXED_CONFIG['actions_'].append('fiatlux::theory compute_photon_to_disk') + fixed_config['actions_'].append('fiatlux check_additional_errors') + fixed_config['actions_'].append('fiatlux::theory compute_photon_to_disk') # Check positivity bound if file_content.get('positivity_bound') is not None: - SETUPFIT_FIXED_CONFIG['actions_'].append('positivity_bound check_unpolarized_bc') + fixed_config['actions_'].append('positivity_bound check_unpolarized_bc') # Check hyperscan trials_config = file_content.get('trial_specs', {}) @@ -238,7 +242,7 @@ def from_yaml(cls, o, *args, **kwargs): file_content.setdefault(k, v) # Update file content with fixed configuration - file_content.update(SETUPFIT_FIXED_CONFIG) + file_content.update(fixed_config) return cls(file_content, *args, **kwargs) diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 74aecab899..94bf19785f 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -319,7 +319,12 @@ def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_covmat_t0_considered) - @functools.lru_cache -def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, diagonal_basis=True): +def _inv_covmat_prepared( + _hashed_dataset_inputs_fitting_covmat, + output_path, + use_thcovmat_in_fitting=False, + diagonal_basis=True, +): """Prepare the covariance matrix and its inverse, optionally transforming to diagonal basis. 
Parameters @@ -354,7 +359,10 @@ def _inv_covmat_prepared(_hashed_dataset_inputs_fitting_covmat, output_path, dia eig_vals = None if diagonal_basis: - diagonal_basis_saved = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" + if use_thcovmat_in_fitting: + diagonal_basis_saved = "datacuts_theory_theorycovmatconfig_fitting_covmat_table.csv" + else: + diagonal_basis_saved = "datacuts_theory_fitting_covmat_table.csv" path_diagonal_basis = output_path / "tables" / diagonal_basis_saved eigensystem = pd.read_csv( path_diagonal_basis, index_col=[0], header=[0], sep="\t|,", engine="python" From 47c21fe68f41d0015c96ac2fb2356e970d7632fd Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Thu, 14 May 2026 13:05:30 +0200 Subject: [PATCH 23/29] make_replica uses loaded theory covmat --- validphys2/src/validphys/config.py | 50 ++++++++++++++++++++++++++ validphys2/src/validphys/covmats.py | 14 ++++++-- validphys2/src/validphys/n3fit_data.py | 2 +- validphys2/src/validphys/pseudodata.py | 6 ++-- 4 files changed, 65 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index d131e442e4..77b86a0d22 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -824,6 +824,56 @@ def produce_experiment_from_input(self, experiment_input, theoryid, use_cuts, fi } + @configparser.explicit_node + def produce_dataset_inputs_sampling_covmat( + self, sep_mult=False, use_thcovmat_in_sampling=False, use_t0_sampling=True + ): + """ + Produces the correct MC replica method sampling covmat to be used in + make_replica according to some options: whether to sample using a t0 + covariance matrix, include the theory covmat and whether to + separate the multiplicative errors. + + Parameters + ---------- + sep_mult : bool, default=False + Whether to separate the multiplicative errors. 
+ use_thcovmat_in_sampling : bool, default=False + Whether to include the theory covariance matrix. + use_t0_sampling : bool, default=True + Whether to sample using a t0 covariance matrix. + + Returns + ------- + Callable + """ + from validphys import covmats + + # TODO: JtH also add loaders for the remain covmats + if use_t0_sampling: + if use_thcovmat_in_sampling: + if sep_mult: + return covmats.dataset_inputs_t0_total_covmat_separate + else: + return covmats.dataset_load_inputs_t0_total_covmat + else: + if sep_mult: + return covmats.dataset_inputs_t0_exp_covmat_separate + else: + return covmats.dataset_inputs_t0_exp_covmat + + else: + if use_thcovmat_in_sampling: + if sep_mult: + return covmats.dataset_inputs_total_covmat_separate + else: + return covmats.dataset_inputs_total_covmat + else: + if sep_mult: + return covmats.dataset_inputs_exp_covmat_separate + else: + return covmats.dataset_inputs_exp_covmat + def produce_loaded_theory_covmat( self, output_path, diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 5443b00b0a..c3808cd5d5 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -483,6 +483,15 @@ def dataset_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, nnfit_theory_co return dataset_inputs_t0_exp_covmat + nnfit_theory_covmat +def dataset_load_inputs_t0_total_covmat(dataset_inputs_t0_exp_covmat, loaded_theory_covmat): + """ + Function to compute the covmat to be used for the sampling by make_replica and for the chi2 + by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat + and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. 
+ """ + return dataset_inputs_t0_exp_covmat + loaded_theory_covmat + + def dataset_inputs_t0_exp_covmat( dataset_inputs_loaded_cd_with_cuts, *, @@ -490,7 +499,6 @@ def dataset_inputs_t0_exp_covmat( use_weights_in_covmat=True, norm_threshold=None, dataset_inputs_t0_predictions, - procs_index_matched, ): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 @@ -505,8 +513,8 @@ def dataset_inputs_t0_exp_covmat( dataset_inputs_t0_predictions, False, ) - # TODO: how to know for sure if the index matches the covmat value ordering? - return pd.DataFrame(covmat, index=procs_index_matched, columns=procs_index_matched) + + return covmat def dataset_inputs_total_covmat(dataset_inputs_exp_covmat, nnfit_theory_covmat): diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 94bf19785f..b86f956331 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -623,7 +623,7 @@ def fitting_covmat_table(output_path, _fiting_covmat, data_index, diagonal_basis if not diagonal_basis: log.info("Saving fitting covmat") - return covmat + return pd.DataFrame(covmat, index=data_index, columns=data_index) df_rotation = pd.DataFrame(diagonal_rotation) df_rotation.insert(0, "eig_val", eig_vals) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 79fd4a27c5..96126f9974 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -123,7 +123,7 @@ def read_replica_pseudodata(fit, context_index, replica): def make_replica( central_values_array, group_replica_mcseed, - dataset_inputs_covmat_t0_considered, + dataset_inputs_sampling_covmat, group_multiplicative_errors=None, group_positivity_mask=None, sep_mult=False, @@ -195,7 +195,7 @@ def make_replica( # Set random seed rng = np.random.default_rng(seed=group_replica_mcseed) # construct covmat - covmat_sqrt = 
sqrt_covmat(dataset_inputs_covmat_t0_considered) + covmat_sqrt = sqrt_covmat(dataset_inputs_sampling_covmat) full_mask = ( group_positivity_mask @@ -223,7 +223,7 @@ def make_replica( mult_shifts.append(mult_shift) # If sep_mult is true then the multiplicative shifts were not included in the covmat - shifts = covmat_sqrt @ rng.normal(size=dataset_inputs_covmat_t0_considered.shape[1]) + shifts = covmat_sqrt @ rng.normal(size=dataset_inputs_sampling_covmat.shape[1]) mult_part = 1.0 if sep_mult: special_mult_errors = group_multiplicative_errors["special_mult"] From da14aff455f819704fa38e83ce353cc580c6a0a0 Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Thu, 14 May 2026 13:12:32 +0200 Subject: [PATCH 24/29] adding loaded theory covmat variants --- validphys2/src/validphys/config.py | 7 +++---- validphys2/src/validphys/covmats.py | 31 +++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 77b86a0d22..d84ec11b6a 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -849,11 +849,10 @@ def produce_dataset_inputs_sampling_covmat( """ from validphys import covmats - # TODO: JtH also add loaders for the remain covmats if use_t0_sampling: if use_thcovmat_in_sampling: if sep_mult: - return covmats.dataset_inputs_t0_total_covmat_separate + return covmats.dataset_load_inputs_t0_total_covmat_separate else: return covmats.dataset_load_inputs_t0_total_covmat else: @@ -865,9 +864,9 @@ def produce_dataset_inputs_sampling_covmat( else: if use_thcovmat_in_sampling: if sep_mult: - return covmats.dataset_inputs_total_covmat_separate + return covmats.dataset_load_inputs_total_covmat_separate else: - return covmats.dataset_inputs_total_covmat + return covmats.dataset_load_inputs_total_covmat else: if sep_mult: return covmats.dataset_inputs_exp_covmat_separate diff --git 
a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index c3808cd5d5..7de450dcc0 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -406,6 +406,17 @@ def dataset_inputs_t0_covmat_from_systematics( ) +def dataset_load_inputs_t0_total_covmat_separate( + dataset_inputs_t0_exp_covmat_separate, loaded_theory_covmat +): + """ + Function to compute the covmat to be used for the sampling by make_replica. + In this case the t0 prescription is used for the experimental covmat and the multiplicative + errors are separated. Moreover, the theory covmat is added to experimental covmat. + """ + return dataset_inputs_t0_exp_covmat_separate + loaded_theory_covmat + + def dataset_inputs_t0_total_covmat_separate( dataset_inputs_t0_exp_covmat_separate, nnfit_theory_covmat ): @@ -441,6 +452,17 @@ def dataset_inputs_t0_exp_covmat_separate( return covmat +def dataset_load_inputs_total_covmat_separate( + dataset_inputs_exp_covmat_separate, loaded_theory_covmat +): + """ + Function to compute the covmat to be used for the sampling by make_replica. + In this case the t0 prescription is not used for the experimental covmat and the multiplicative + errors are separated. Moreover, the theory covmat is added to experimental covmat. + """ + return dataset_inputs_exp_covmat_separate + loaded_theory_covmat + + def dataset_inputs_total_covmat_separate(dataset_inputs_exp_covmat_separate, nnfit_theory_covmat): """ Function to compute the covmat to be used for the sampling by make_replica. @@ -517,6 +539,15 @@ def dataset_inputs_t0_exp_covmat( return covmat +def dataset_load_inputs_total_covmat(dataset_inputs_exp_covmat, loaded_theory_covmat): + """ + Function to compute the covmat to be used for the sampling by make_replica and for the chi2 + by fitting_data_dict. In this case the t0 prescription is not used for the experimental covmat + and the multiplicative errors are included in it. 
Moreover, the theory covmat is added to experimental covmat. + """ + return dataset_inputs_exp_covmat + loaded_theory_covmat + + def dataset_inputs_total_covmat(dataset_inputs_exp_covmat, nnfit_theory_covmat): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 From a6dc904cdc08b2d43f83c13fbf93a62d1d51311a Mon Sep 17 00:00:00 2001 From: jacoterh <54140851+jacoterh@users.noreply.github.com> Date: Thu, 14 May 2026 14:47:06 +0200 Subject: [PATCH 25/29] removing t0_considered from hashed covmat array --- validphys2/src/validphys/config.py | 2 +- validphys2/src/validphys/n3fit_data.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index d84ec11b6a..0d841bd6ad 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -816,7 +816,7 @@ def produce_dataset_inputs_fitting_covmat(self, use_thcovmat_in_fitting=False): from validphys import covmats if use_thcovmat_in_fitting: - return covmats.dataset_inputs_t0_total_covmat + return covmats.dataset_load_inputs_t0_total_covmat return covmats.dataset_inputs_t0_exp_covmat def produce_sep_mult(self, separate_multiplicative=False): diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index b86f956331..0e1016d8fc 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -313,9 +313,9 @@ def fittable_datasets_masked(data): return validphys_group_extractor(data.datasets) -def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_covmat_t0_considered) -> Hashrray: +def _hashed_dataset_inputs_fitting_covmat(dataset_inputs_fitting_covmat) -> Hashrray: """Wrap the covmat into a Hashrray for caches to work""" - return Hashrray(dataset_inputs_covmat_t0_considered) + return Hashrray(dataset_inputs_fitting_covmat) @functools.lru_cache @@ -384,7 +384,7 @@ def _inv_covmat_prepared( return 
covmat, inv_total, diag_rot, eig_vals -def _fiting_covmat(dataset_inputs_fitting_covmat, data_input, diagonal_basis=True): +def _fiting_covmat(dataset_inputs_fitting_covmat, diagonal_basis=True): """Prepare the fitting covariance matrix by optionally adding theory contributions and transforming to diagonal basis. @@ -623,7 +623,10 @@ def fitting_covmat_table(output_path, _fiting_covmat, data_index, diagonal_basis if not diagonal_basis: log.info("Saving fitting covmat") - return pd.DataFrame(covmat, index=data_index, columns=data_index) + if not hasattr(covmat, "index"): + return pd.DataFrame(covmat, index=data_index, columns=data_index) + else: + return covmat df_rotation = pd.DataFrame(diagonal_rotation) df_rotation.insert(0, "eig_val", eig_vals) From 91fabeac42ea052f756da4f69a24ffa250f204fb Mon Sep 17 00:00:00 2001 From: achiefa Date: Fri, 15 May 2026 16:26:29 +0100 Subject: [PATCH 26/29] Fixng errors: moving rotation_action out of closure block --- n3fit/src/n3fit/scripts/vp_setupfit.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 0e4e83cd27..a5ba367d10 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -174,12 +174,14 @@ def from_yaml(cls, o, *args, **kwargs): check_n3fit_action = 'datacuts::theory::closuretest::fitting n3fit_checks_action' else: filter_action = 'datacuts::theory::fitting filter' - if file_content.get('theorycovmatconfig') is not None: - rotation_action = 'datacuts::theory::theorycovmatconfig fitting_covmat_table' - else: - rotation_action = 'datacuts::theory fitting_covmat_table' check_n3fit_action = 'datacuts::theory::fitting n3fit_checks_action' + # Add rotation action for the total covariance matrix + if file_content.get('theorycovmatconfig') is not None: + rotation_action = 'datacuts::theory::theorycovmatconfig fitting_covmat_table' + else: + rotation_action = 
'datacuts::theory fitting_covmat_table' + # The settings for these actions depend on the presence of closuretest fixed_config['actions_'] += [check_n3fit_action, filter_action, rotation_action] From 85a7049b57bafe3575e00b44ef90bbad65c02b96 Mon Sep 17 00:00:00 2001 From: achiefa Date: Sat, 16 May 2026 15:31:30 +0100 Subject: [PATCH 27/29] Using single-column csv when reading diag-basis results --- n3fit/src/n3fit/tests/test_fit.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/n3fit/src/n3fit/tests/test_fit.py b/n3fit/src/n3fit/tests/test_fit.py index c7f10e3faf..088c797000 100644 --- a/n3fit/src/n3fit/tests/test_fit.py +++ b/n3fit/src/n3fit/tests/test_fit.py @@ -283,8 +283,11 @@ def test_parallel_against_sequential(tmp_path, rep_from=6, rep_to=8): "ATLAS_TTBAR_8TEV_TOT_X-SEC", "CMS_SINGLETOP_13TEV_TCHANNEL-XSEC", ] - dataset_inputs = [{"dataset": d, "frac": 0.6, "variant": "legacy"} for d in datasets] + dataset_inputs = [{"dataset": d, "variant": "legacy"} for d in datasets] n3fit_input["dataset_inputs"] = dataset_inputs + # Using diagonal basis + n3fit_input["diagonal_basis"] = True + n3fit_input["diagonal_frac"] = 0.5 # Exit inmediately n3fit_input["parameters"]["epochs"] = 1 # Save pseudodata @@ -311,8 +314,9 @@ def test_parallel_against_sequential(tmp_path, rep_from=6, rep_to=8): for csvfile_seq in folder_seq.glob("*/*.csv"): csvfile_par = folder_par / csvfile_seq.relative_to(folder_seq) - result_seq = pd.read_csv(csvfile_seq, sep="\t", index_col=[0, 1, 2], header=0) - result_par = pd.read_csv(csvfile_par, sep="\t", index_col=[0, 1, 2], header=0) + # Diagonal basis writes single-index csv files + result_seq = pd.read_csv(csvfile_seq, sep="\t", index_col=[0], header=0) + result_par = pd.read_csv(csvfile_par, sep="\t", index_col=[0], header=0) pd.testing.assert_frame_equal(result_seq, result_par) # Check the rest of the fit, while numerical differences are expected between sequential From 82cdad53388dff7bbecb1ab51fdd1969fd4fa0c4 Mon
Sep 17 00:00:00 2001 From: achiefa Date: Sun, 17 May 2026 10:22:21 +0100 Subject: [PATCH 28/29] Pin torch 2.12.0 to avoid triton segfault --- .github/workflows/all_tests_nnpdf.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/all_tests_nnpdf.yml b/.github/workflows/all_tests_nnpdf.yml index f61f4764c1..ea0f882457 100644 --- a/.github/workflows/all_tests_nnpdf.yml +++ b/.github/workflows/all_tests_nnpdf.yml @@ -103,7 +103,11 @@ jobs: sudo rm -rf /opt/hostedtoolcache/Python/{3.9*} sudo rm -rf /opt/hostedtoolcache/CodeQL sudo rm -rf /usr/local/lib/android/sdk - pip install .[nolha,torch] + # torch 2.12.0 pulls triton 3.7.0, whose libtriton C extension segfaults + # at import on CPU-only Linux + CPython 3.13. Pinning torch<2.12 also + # pins triton to 3.6.x. Remove this pin once the upstream issue is + # resolved. + pip install ".[nolha,torch]" "torch<2.12" # Since there is no LHAPDF in the system, initialize the folder and download pdfsets.index lhapdf-management update --init - name: Test we can run one runcard From e67b1c66e0d32bd2b64e881a0e2fe0f63b9387b1 Mon Sep 17 00:00:00 2001 From: achiefa Date: Sun, 17 May 2026 20:57:28 +0100 Subject: [PATCH 29/29] Add reference to issue --- .github/workflows/all_tests_nnpdf.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/all_tests_nnpdf.yml b/.github/workflows/all_tests_nnpdf.yml index ea0f882457..8735f2b9bd 100644 --- a/.github/workflows/all_tests_nnpdf.yml +++ b/.github/workflows/all_tests_nnpdf.yml @@ -106,7 +106,7 @@ jobs: # torch 2.12.0 pulls triton 3.7.0, whose libtriton C extension segfaults # at import on CPU-only Linux + CPython 3.13. Pinning torch<2.12 also # pins triton to 3.6.x. Remove this pin once the upstream issue is - # resolved. + # resolved (see https://github.com/nascheme/triton/issues/43). 
pip install ".[nolha,torch]" "torch<2.12" # Since there is no LHAPDF in the system, initialize the folder and download pdfsets.index lhapdf-management update --init