From b4aabdd6862c2de805feea2ed244faafa9cab591 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Tue, 3 Sep 2019 12:39:08 +0100
Subject: [PATCH 1/8] add simple corrmat reg to calcutils

---
 validphys2/src/validphys/calcutils.py | 67 +++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/validphys2/src/validphys/calcutils.py b/validphys2/src/validphys/calcutils.py
index 44ed05442e..21d5ae73f3 100644
--- a/validphys2/src/validphys/calcutils.py
+++ b/validphys2/src/validphys/calcutils.py
@@ -149,3 +149,70 @@ def get_df_block(matrix: pd.DataFrame, key: str, level):
             key, level=level, axis=1).values
     return block
 
+def regularize_covmat(covmat: np.array, cond_num_threshold=500):
+    """Given a covariance matrix, performs a regularization on
+    the corresponding correlation matrix and uses that to return a new
+    regularized covariance matrix:
+
+    corr_ij = cov_ij / (sigma_i * sigma_j)
+
+    where sigma_i = sqrt(diag(cov)_i). The correlation
+    matrix is then regularized by clipping the smallest eigenvalues to a
+    minimum acceptable values given by
+
+    max(eigenvalues of corr_ij)/cond_num_threshold
+
+    finally the process to get a correlation matrix from a covariance matrix
+    is inverted
+
+    new_cov_ij = (regularized corr)_ij * sigma_i * sigma_j
+
+    Parameters
+    ----------
+    covmat : array
+        a covariance matrix which is to be regularized.
+    cond_num_threshold : float
+        The acceptable condition number of the correlation matrix, by default
+        set to 500.
+
+    Returns
+    -------
+    new_covmat : array
+        A new covariance matrix which has been regularized according to
+        prescription above.
+
+    Notes
+    -----
+    (regularized corr)_ij is not technically a correlation matrix since it might
+    not have 1s on the diagonal.
+
+    TODO: is `regularized corr` the nearest matrix with condition number of
+    `cond_num_threshold` according to frob dist?
+
+    Examples
+    --------
+
+    >>> from validphys.calcutils import regularize_covmat
+    >>> import numpy as np
+    >>> import scipy.linalg as la
+    >>> np.random.seed(0)
+    >>> s = np.random.rand(3,3)
+    >>> cov = s@s.T
+    >>> cov
+    array([[1.17601578, 0.99135397, 1.45880096],
+       [0.99135397, 0.89356028, 1.23866193],
+       [1.45880096, 1.23866193, 1.91538757]])
+    >>> new_cov = regularize_covmat(cov, 100)
+    >>> new_cov
+    array([[1.18115619, 0.98945605, 1.45496881],
+       [0.98945605, 0.89426102, 1.24007682],
+       [1.45496881, 1.24007682, 1.9182444 ]])
+    >>> print(np.linalg.norm(new_cov-cov))
+    0.008697992044783345
+    """
+    d = np.sqrt(np.diag(covmat))
+    corr = (covmat/d)/d[:, np.newaxis]
+    e_val, e_vec = la.eigh(corr)
+    new_e_val = np.clip(e_val, a_min=max(e_val)/cond_num_threshold, a_max=None)
+    new_corr = e_vec@(np.diag(new_e_val)@e_vec.T)
+    return (new_corr*d)*d[:, np.newaxis]

From d0fbf5e2f79925115a1b7b82563a4200855189a7 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Tue, 3 Sep 2019 12:49:58 +0100
Subject: [PATCH 2/8] regularize covmats at the level of n3fit

---
 n3fit/src/n3fit/io/reader.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/n3fit/src/n3fit/io/reader.py b/n3fit/src/n3fit/io/reader.py
index a155e7959c..cabad1b284 100644
--- a/n3fit/src/n3fit/io/reader.py
+++ b/n3fit/src/n3fit/io/reader.py
@@ -8,6 +8,7 @@
 from NNPDF import RandomGenerator
 from validphys.core import ExperimentSpec as vp_Exp
 from validphys.core import DataSetSpec as vp_Dataset
+from validphys.calcutils import regularize_covmat
 
 
 def make_tr_val_mask(datasets, exp_name, seed):
@@ -247,6 +248,8 @@ def common_data_reader(spec, t0pdfset, replica_seeds=None, trval_seeds=None):
 
     exp_name = spec.name
     covmat = spec_c.get_covmat()
+    # Regularize the covmat with cond_num_threshold=500
+    covmat = regularize_covmat(covmat, cond_num_threshold=500)
 
     # Now it is time to build the masks for the training validation split
     all_dict_out = []

From 66020f8f0dfc22ab0f9e0b42dd815b92f52e5b63 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Wed, 4 Sep 2019 10:29:03 +0100
Subject: [PATCH 3/8] restructured covmat functions to just return covmats and
 added sqrt covmat functions in rsults, improved documentation

---
 validphys2/src/validphys/results.py | 161 +++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 37 deletions(-)

diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py
index a6db26f128..7813702904 100644
--- a/validphys2/src/validphys/results.py
+++ b/validphys2/src/validphys/results.py
@@ -235,8 +235,7 @@ def experiments_covmat_no_table(
     for experiment, experiment_covmat in zip(
             experiments, experiments_covariance_matrix):
         name = experiment.name
-        mat, _ = experiment_covmat
-        df.loc[[name],[name]] = mat
+        df.loc[[name],[name]] = experiment_covmat
     return df
 
 @table
@@ -244,21 +243,25 @@ def experiments_covmat(experiments_covmat_no_table):
     """Duplicate of experiments_covmat_no_table but with a table decorator."""
     return experiments_covmat_no_table
 
+exps_sqrt_covmat = collect(
+    'experiment_sqrt_covariance_matrix',
+    ('experiments',)
+)
+
 @table
 def experiments_sqrtcovmat(
-        experiments, experiments_index, experiments_covariance_matrix):
+        experiments, experiments_index, exps_sqrt_covmat):
     """Like experiments_covmat, but dump the lower triangular part of the
     Cholesky decomposition as used in the fit. The upper part indices are set
     to zero.
     """
     data = np.zeros((len(experiments_index),len(experiments_index)))
     df = pd.DataFrame(data, index=experiments_index, columns=experiments_index)
-    for experiment, experiment_covmat in zip(
-            experiments, experiments_covariance_matrix):
+    for experiment, exp_sqrt_covmat in zip(
+            experiments, exps_sqrt_covmat):
         name = experiment.name
-        _, mat = experiment_covmat
-        mat[np.triu_indices_from(mat, k=1)] = 0
-        df.loc[[name],[name]] = mat
+        exp_sqrt_covmat[np.triu_indices_from(exp_sqrt_covmat, k=1)] = 0
+        df.loc[[name],[name]] = exp_sqrt_covmat
     return df
 
 @table
@@ -272,9 +275,8 @@ def experiments_invcovmat(
     for experiment, experiment_covmat in zip(
             experiments, experiments_covariance_matrix):
         name = experiment.name
-        cov, _ = experiment_covmat
         #Improve this inversion if this method tuns out to be important
-        invcov = la.inv(cov)
+        invcov = la.inv(experiment_covmat)
         df.loc[[name],[name]] = invcov
     return df
 
@@ -360,11 +362,49 @@ def closure_pseudodata_replicas(experiments, pdf, nclosure:int,
 
 @check_dataset_cuts_match_theorycovmat
 def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None)) = None):
-    """Returns a tuple of Covariance matrix and sqrt covariance matrix for the given dataset
-    which includes theory contribution from scale variations if `use_theorycovmat` is True and
-    an appropriate fit from which the covariance matrix will be loaded is given
-
-    if t0set is specified then t0 predictions will be used to construct the covariance matrix.
+    """Returns the covariance matrix for a given `dataset`. By default the
+    data central values will be used to calculate the multiplicative contributions
+    to the covariance matrix.
+
+    The matrix can instead be constructed with
+    the t0 proceedure if the user sets `use_t0` to True and gives a
+    `t0pdfset`. In this case the central predictions from the `t0pdfset` will be
+    used to calculate the multiplicative contributions to the covariance matrix.
+    More information on the t0 procedure can be found here:
+    https://arxiv.org/abs/0912.2276
+
+    Additionally the user can specify `use_fit_thcovmat_if_present` to be True
+    and provide a corresponding `fit`. If the theory covmat was used in the
+    corresponding `fit` and the specfied `dataset` was used in the fit then
+    the theory covariance matrix for this `dataset` will be added in quadrature
+    to the experimental covariance matrix.
+
+    Parameters
+    ----------
+    dataset : DataSetSpec
+        object parsed from the `dataset_input` runcard key
+    fitthcovmat: None or ThCovMatSpec
+        None if either `use_thcovmat_if_present` is False or if no theory
+        covariance matrix was used in the corresponding fit
+    t0set: None or PDF
+        None if `use_t0` is False or a PDF parsed from `t0pdfset` runcard key
+
+    Returns
+    -------
+    covariance_matrix : array
+        a covariance matrix as a numpy array
+
+    Examples
+    --------
+
+    >>> from validphys.api import API
+    >>> inp = {
+            'dataset_input': {'ATLASTTBARTOT'},
+            'theoryid': 52,
+            'use_cuts': 'no_cuts'
+        }
+    >>> cov = API.covariance_matrix(**inp)
+    TODO: complete example
     """
     loaded_data = dataset.load()
 
@@ -374,14 +414,40 @@ def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None))
         log.debug("Setting T0 predictions for %s" % dataset)
         loaded_data.SetT0(t0set.load_t0())
 
+    covmat = loaded_data.get_covmat()
     if fitthcovmat:
         loaded_thcov = fitthcovmat.load()
-        covmat = get_df_block(loaded_thcov, dataset.name, level=1) + loaded_data.get_covmat()
-        sqrtcovmat = np.linalg.cholesky(covmat)
-    else:
-        covmat = loaded_data.get_covmat()
-        sqrtcovmat = loaded_data.get_sqrtcovmat()
-    return covmat, sqrtcovmat
+        covmat += get_df_block(loaded_thcov, dataset.name, level=1)
+    return covmat
+
+def sqrt_covariance_matrix(covariance_matrix: np.array):
+    """Returns the lower-triangular Cholesky factor `covariance_matrix`
+
+    Parameters
+    ----------
+    covariance_matrix: array
+        a positive definite covariance matrix
+
+    Returns
+    -------
+    sqrt_covariance_matrix : array
+        lower triangular Cholesky factor of covariance_matrix
+
+    Notes
+    -----
+    The lower triangular is useful for efficient calculation of the chi^2
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from validphys.results import sqrt_covariance_matrix
+    >>> a = np.array([[1, 0.5], [0.5, 1]])
+    >>> sqrt_covariance_matrix(a)
+    array([[1.  , 0.    ],
+        [0.5    , 0.8660254]])
+    """
+    return la.cholesky(covariance_matrix, lower=True)
 
 @check_experiment_cuts_match_theorycovmat
 def experiment_covariance_matrix(
@@ -395,18 +461,25 @@ def experiment_covariance_matrix(
         log.debug("Setting T0 predictions for %s" % experiment)
         loaded_data.SetT0(t0set.load_t0())
 
+    covmat = loaded_data.get_covmat()
+
     if fitthcovmat:
         loaded_thcov = fitthcovmat.load()
         ds_names = loaded_thcov.index.get_level_values(1)
         indices = np.in1d(ds_names, [ds.name for ds in experiment.datasets]).nonzero()[0]
-        covmat = loaded_thcov.iloc[indices, indices].values + loaded_data.get_covmat()
-        sqrtcovmat = np.linalg.cholesky(covmat)
-    else:
-        covmat = loaded_data.get_covmat()
-        sqrtcovmat = loaded_data.get_sqrtcovmat()
-    return covmat, sqrtcovmat
-
-def results(dataset:(DataSetSpec), pdf:PDF, covariance_matrix):
+        covmat += loaded_thcov.iloc[indices, indices].values
+    return covmat
+
+def experiment_sqrt_covariance_matrix(experiment_covariance_matrix):
+    """Like `sqrt_covariance_matrix` but for an experiment"""
+    return sqrt_covariance_matrix(experiment_covariance_matrix)
+
+def results(
+        dataset:(DataSetSpec),
+        pdf:PDF,
+        covariance_matrix,
+        sqrt_covariance_matrix
+    ):
     """Tuple of data and theory results for a single pdf. The data will have an associated
     covariance matrix, which can include a contribution from the theory covariance matrix which
     is constructed from scale variation. The inclusion of this covariance matrix by default is used
@@ -416,17 +489,30 @@ def results(dataset:(DataSetSpec), pdf:PDF, covariance_matrix):
     An experiment is also allowed.
     (as a result of the C++ code layout)."""
     data = dataset.load()
-    return (DataResult(data, *covariance_matrix),
+    return (DataResult(data, covariance_matrix, sqrt_covariance_matrix),
             ThPredictionsResult.from_convolution(pdf, dataset, loaded_data=data))
 
-def experiment_results(experiment, pdf:PDF, experiment_covariance_matrix):
+def experiment_results(
+        experiment,
+        pdf:PDF,
+        experiment_covariance_matrix,
+        experiment_sqrt_covariance_matrix):
     """Like `results` but for a whole experiment"""
-    return results(experiment, pdf, experiment_covariance_matrix)
+    return results(
+        experiment,
+        pdf,
+        experiment_covariance_matrix,
+        experiment_sqrt_covariance_matrix
+        )
 
 #It's better to duplicate a few lines than to complicate the logic of
 #``results`` to support this.
 #TODO: The above comment doesn't make sense after adding T0. Deprecate this
-def pdf_results(dataset:(DataSetSpec,  ExperimentSpec), pdfs:Sequence, covariance_matrix:tuple):
+def pdf_results(
+        dataset:(DataSetSpec,  ExperimentSpec),
+        pdfs:Sequence,
+        covariance_matrix,
+        sqrt_covariance_matrix):
     """Return a list of results, the first for the data and the rest for
     each of the PDFs."""
 
@@ -438,12 +524,13 @@ def pdf_results(dataset:(DataSetSpec,  ExperimentSpec), pdfs:Sequence, covarianc
         th_results.append(th_result)
 
 
-    return (DataResult(data, *covariance_matrix), *th_results)
+    return (DataResult(data, covariance_matrix, sqrt_covariance_matrix), *th_results)
 
 @require_one('pdfs', 'pdf')
 @remove_outer('pdfs', 'pdf')
 def one_or_more_results(dataset:(DataSetSpec, ExperimentSpec),
-                        covariance_matrix: tuple,
+                        covariance_matrix,
+                        sqrt_covariance_matrix,
                         pdfs:(type(None), Sequence)=None,
                         pdf:(type(None), PDF)=None):
     """Generate a list of results, where the first element is the data values,
@@ -451,9 +538,9 @@ def one_or_more_results(dataset:(DataSetSpec, ExperimentSpec),
     Which of the two is selected intelligently depending on the namespace,
     when executing as an action."""
     if pdf:
-        return results(dataset, pdf, covariance_matrix)
+        return results(dataset, pdf, covariance_matrix, sqrt_covariance_matrix)
     else:
-        return pdf_results(dataset, pdfs, covariance_matrix)
+        return pdf_results(dataset, pdfs, covariance_matrix, sqrt_covariance_matrix)
     raise ValueError("Either 'pdf' or 'pdfs' is required")
 
 

From 955879fc5666a5aac17491b6a8673e85e8f6c6a6 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Wed, 4 Sep 2019 11:43:21 +0100
Subject: [PATCH 4/8] added covmat reg to covmat providers, use these covmat
 functions in n3fit

---
 n3fit/src/n3fit/fit.py              |  5 ++--
 n3fit/src/n3fit/io/reader.py        |  5 +---
 n3fit/src/n3fit/n3fit.py            |  2 +-
 validphys2/src/validphys/config.py  |  4 +++
 validphys2/src/validphys/results.py | 43 +++++++++++++++++++++++++----
 5 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/n3fit/src/n3fit/fit.py b/n3fit/src/n3fit/fit.py
index c3cb84375c..ebb26a8c60 100644
--- a/n3fit/src/n3fit/fit.py
+++ b/n3fit/src/n3fit/fit.py
@@ -15,6 +15,7 @@
 def fit(
     fitting,
     experiments,
+    experiments_covariance_matrix,
     t0set,
     replica,
     replica_path,
@@ -132,9 +133,9 @@ def fit(
     ##############################################################################
     all_exp_infos = [[] for _ in replica]
     # First loop over the experiments
-    for exp in experiments:
+    for exp, exp_cov in zip(experiments, experiments_covariance_matrix):
         log.info("Loading experiment: {0}".format(exp))
-        all_exp_dicts = reader.common_data_reader(exp, t0pdfset, replica_seeds=mcseeds, trval_seeds=trvalseeds)
+        all_exp_dicts = reader.common_data_reader(exp, exp_cov, t0pdfset, replica_seeds=mcseeds, trval_seeds=trvalseeds)
         for i, exp_dict in enumerate(all_exp_dicts):
             all_exp_infos[i].append(exp_dict)
 
diff --git a/n3fit/src/n3fit/io/reader.py b/n3fit/src/n3fit/io/reader.py
index cabad1b284..3c3ad046a1 100644
--- a/n3fit/src/n3fit/io/reader.py
+++ b/n3fit/src/n3fit/io/reader.py
@@ -173,7 +173,7 @@ def common_data_reader_experiment(experiment_c, experiment_spec):
     return parsed_datasets
 
 
-def common_data_reader(spec, t0pdfset, replica_seeds=None, trval_seeds=None):
+def common_data_reader(spec, covmat, t0pdfset, replica_seeds=None, trval_seeds=None):
     """
     Wrapper to read the information from validphys object
     This function receives either a validphyis experiment or dataset objects
@@ -247,9 +247,6 @@ def common_data_reader(spec, t0pdfset, replica_seeds=None, trval_seeds=None):
         )
 
     exp_name = spec.name
-    covmat = spec_c.get_covmat()
-    # Regularize the covmat with cond_num_threshold=500
-    covmat = regularize_covmat(covmat, cond_num_threshold=500)
 
     # Now it is time to build the masks for the training validation split
     all_dict_out = []
diff --git a/n3fit/src/n3fit/n3fit.py b/n3fit/src/n3fit/n3fit.py
index b9b8a12a2a..a04f11d71d 100755
--- a/n3fit/src/n3fit/n3fit.py
+++ b/n3fit/src/n3fit/n3fit.py
@@ -24,7 +24,7 @@
     actions_ = ['datacuts::theory fit']
 )
 
-N3FIT_PROVIDERS = ["n3fit.fit"]
+N3FIT_PROVIDERS = ["n3fit.fit", "validphys.results"]
 
 log = logging.getLogger(__name__)
 
diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index f0830d6bab..8829858357 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -868,6 +868,10 @@ def parse_groupby(self, grouping: str):
                 "correctly?")
         return grouping
 
+    def parse_perform_covmat_reg(self, do_reg: bool):
+        """Parse the `regularize_covmat` key from runcard"""
+        log.info("Regularizing covariance matrices")
+        return do_reg
 
 
 class Config(report.Config, CoreConfig, ParamfitsConfig):
diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py
index 7813702904..9e933c270f 100644
--- a/validphys2/src/validphys/results.py
+++ b/validphys2/src/validphys/results.py
@@ -24,8 +24,9 @@
                               check_dataset_cuts_match_theorycovmat,
                               check_experiment_cuts_match_theorycovmat)
 from validphys.core import DataSetSpec, PDF, ExperimentSpec
-from validphys.calcutils import (all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
-                                 get_df_block)
+from validphys.calcutils import (
+    all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
+    get_df_block, regularize_covmat)
 
 log = logging.getLogger(__name__)
 
@@ -361,7 +362,12 @@ def closure_pseudodata_replicas(experiments, pdf, nclosure:int,
 
 
 @check_dataset_cuts_match_theorycovmat
-def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None)) = None):
+def covariance_matrix(
+    dataset:DataSetSpec,
+    fitthcovmat,
+    t0set:(PDF, type(None)) = None,
+    perform_covmat_reg=False,
+    condition_number_threshold=500):
     """Returns the covariance matrix for a given `dataset`. By default the
     data central values will be used to calculate the multiplicative contributions
     to the covariance matrix.
@@ -373,12 +379,21 @@ def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None))
     More information on the t0 procedure can be found here:
     https://arxiv.org/abs/0912.2276
 
-    Additionally the user can specify `use_fit_thcovmat_if_present` to be True
+    The user can specify `use_fit_thcovmat_if_present` to be True
     and provide a corresponding `fit`. If the theory covmat was used in the
     corresponding `fit` and the specfied `dataset` was used in the fit then
     the theory covariance matrix for this `dataset` will be added in quadrature
     to the experimental covariance matrix.
 
+    Covariance matrix can be regularized according to
+    `calcutils.regularize_covmat` if the user specifies `perform_covmat_reg` to
+    be true. This algorithm sets a minimum threshold for eigenvalues that the
+    corresponding correlation matrix can have to be:
+
+    max(eigenvalue)/condition_number_threshold
+
+    by default condition_number_threshold is set to 500
+
     Parameters
     ----------
     dataset : DataSetSpec
@@ -388,6 +403,10 @@ def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None))
         covariance matrix was used in the corresponding fit
     t0set: None or PDF
         None if `use_t0` is False or a PDF parsed from `t0pdfset` runcard key
+    perform_covmat_reg: bool
+        whether or not to regularize the covariance matrix
+    condition_number_threshold: number
+        threshold used to regularize covariance matrix
 
     Returns
     -------
@@ -418,6 +437,11 @@ def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None))
     if fitthcovmat:
         loaded_thcov = fitthcovmat.load()
         covmat += get_df_block(loaded_thcov, dataset.name, level=1)
+    if perform_covmat_reg:
+        covmat = regularize_covmat(
+            covmat,
+            cond_num_threshold=condition_number_threshold
+        )
     return covmat
 
 def sqrt_covariance_matrix(covariance_matrix: np.array):
@@ -451,7 +475,11 @@ def sqrt_covariance_matrix(covariance_matrix: np.array):
 
 @check_experiment_cuts_match_theorycovmat
 def experiment_covariance_matrix(
-        experiment: ExperimentSpec, fitthcovmat, t0set:(PDF, type(None)) = None):
+        experiment: ExperimentSpec,
+        fitthcovmat,
+        t0set:(PDF, type(None)) = None,
+        perform_covmat_reg=False,
+        condition_number_threshold=500):
     """Like `covariance_matrix` except for an experiment"""
     loaded_data = experiment.load()
 
@@ -468,6 +496,11 @@ def experiment_covariance_matrix(
         ds_names = loaded_thcov.index.get_level_values(1)
         indices = np.in1d(ds_names, [ds.name for ds in experiment.datasets]).nonzero()[0]
         covmat += loaded_thcov.iloc[indices, indices].values
+    if perform_covmat_reg:
+        covmat = regularize_covmat(
+            covmat,
+            cond_num_threshold=condition_number_threshold
+        )
     return covmat
 
 def experiment_sqrt_covariance_matrix(experiment_covariance_matrix):

From ffb45c14313f8d019519676aa522b66c634fa822 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Wed, 4 Sep 2019 11:58:11 +0100
Subject: [PATCH 5/8] since using collected experiments_covariance_matrices, t0
 not required in as many functions, added comment in n3fit to help others
 understand why it is hardcoded to be true

---
 n3fit/src/n3fit/fit.py       | 12 ++++--------
 n3fit/src/n3fit/io/reader.py |  3 +--
 n3fit/src/n3fit/n3fit.py     |  3 ++-
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/n3fit/src/n3fit/fit.py b/n3fit/src/n3fit/fit.py
index ebb26a8c60..99ec6a1403 100644
--- a/n3fit/src/n3fit/fit.py
+++ b/n3fit/src/n3fit/fit.py
@@ -16,7 +16,6 @@ def fit(
     fitting,
     experiments,
     experiments_covariance_matrix,
-    t0set,
     replica,
     replica_path,
     output_path,
@@ -47,8 +46,9 @@ def fit(
         # Arguments:
             - `fitting`: dictionary with the hyperparameters of the fit
             - `experiments`: vp list of experiments to be included in the fit
-            - `t0set`: t0set name
-            - `replica`: a list of replica numbers to run over (tipycally just one)
+            - `experiments_covariance_matrix`: covariance matrices collected
+                                               over experiments
+            - `replica`: a list of replica numbers to run over (typically just one)
             - `replica_path`: path to the output of this run
             - `output_path`: name of the fit
             - `theorid`: theory id number
@@ -91,10 +91,6 @@ def fit(
     else:
         status_ok = "ok"
 
-    # Loading t0set from LHAPDF
-    if t0set is not None:
-        t0pdfset = t0set.load_t0()
-
     # First set the seed variables for
     # - Tr/Vl split
     # - Neural Network initialization
@@ -135,7 +131,7 @@ def fit(
     # First loop over the experiments
     for exp, exp_cov in zip(experiments, experiments_covariance_matrix):
         log.info("Loading experiment: {0}".format(exp))
-        all_exp_dicts = reader.common_data_reader(exp, exp_cov, t0pdfset, replica_seeds=mcseeds, trval_seeds=trvalseeds)
+        all_exp_dicts = reader.common_data_reader(exp, exp_cov, replica_seeds=mcseeds, trval_seeds=trvalseeds)
         for i, exp_dict in enumerate(all_exp_dicts):
             all_exp_infos[i].append(exp_dict)
 
diff --git a/n3fit/src/n3fit/io/reader.py b/n3fit/src/n3fit/io/reader.py
index 3c3ad046a1..d7a11ae3db 100644
--- a/n3fit/src/n3fit/io/reader.py
+++ b/n3fit/src/n3fit/io/reader.py
@@ -173,7 +173,7 @@ def common_data_reader_experiment(experiment_c, experiment_spec):
     return parsed_datasets
 
 
-def common_data_reader(spec, covmat, t0pdfset, replica_seeds=None, trval_seeds=None):
+def common_data_reader(spec, covmat, replica_seeds=None, trval_seeds=None):
     """
     Wrapper to read the information from validphys object
     This function receives either a validphyis experiment or dataset objects
@@ -215,7 +215,6 @@ def common_data_reader(spec, covmat, t0pdfset, replica_seeds=None, trval_seeds=N
     spec_c = spec.load()
     ndata = spec_c.GetNData()
     expdata_true = spec_c.get_cv().reshape(1, ndata)
-    spec_c.SetT0(t0pdfset)
     base_mcseed = int(hashlib.sha256(str(spec).encode()).hexdigest(), 16) % 10 ** 8
 
     if replica_seeds:
diff --git a/n3fit/src/n3fit/n3fit.py b/n3fit/src/n3fit/n3fit.py
index a04f11d71d..f22a1ec327 100755
--- a/n3fit/src/n3fit/n3fit.py
+++ b/n3fit/src/n3fit/n3fit.py
@@ -17,7 +17,8 @@
 from reportengine import colors
 from reportengine.compat import yaml
 
-
+# Set use_t0 true here for backwards compatibility. Causes fit covmats to
+# use t0 prescription
 N3FIT_FIXED_CONFIG = dict(
     use_cuts = 'internal',
     use_t0 = True,

From 04015bdc74a297ceaf70ea242bd69936d5d11991 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Wed, 4 Sep 2019 14:35:37 +0100
Subject: [PATCH 6/8] slightly improved log message about regularizing covmats
 (to be correct at least) added collect over datasets covmats

---
 validphys2/src/validphys/config.py  | 5 ++++-
 validphys2/src/validphys/results.py | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index 8829858357..c0677ddd36 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -870,7 +870,10 @@ def parse_groupby(self, grouping: str):
 
     def parse_perform_covmat_reg(self, do_reg: bool):
         """Parse the `regularize_covmat` key from runcard"""
-        log.info("Regularizing covariance matrices")
+        if do_reg:
+            log.info("Regularizing covariance matrices")
+        else:
+            log.info("Not regularizing covariance matrices")
         return do_reg
 
 
diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py
index 9e933c270f..36a1ed9b84 100644
--- a/validphys2/src/validphys/results.py
+++ b/validphys2/src/validphys/results.py
@@ -444,6 +444,11 @@ def covariance_matrix(
         )
     return covmat
 
+datasets_covariance_matrix = collect(
+    'covariance_matrix',
+    ('experiments', 'experiment',)
+)
+
 def sqrt_covariance_matrix(covariance_matrix: np.array):
     """Returns the lower-triangular Cholesky factor `covariance_matrix`
 

From 17cf16c22cd301694ef51c8b3a8d2b0201b580d4 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Thu, 19 Sep 2019 12:14:44 +0100
Subject: [PATCH 7/8] make regularize_covmat nicer and remove log message I
 added for debugging purposes

---
 validphys2/src/validphys/calcutils.py | 2 +-
 validphys2/src/validphys/config.py    | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/validphys2/src/validphys/calcutils.py b/validphys2/src/validphys/calcutils.py
index 21d5ae73f3..e7f8298c4c 100644
--- a/validphys2/src/validphys/calcutils.py
+++ b/validphys2/src/validphys/calcutils.py
@@ -214,5 +214,5 @@ def regularize_covmat(covmat: np.array, cond_num_threshold=500):
     corr = (covmat/d)/d[:, np.newaxis]
     e_val, e_vec = la.eigh(corr)
     new_e_val = np.clip(e_val, a_min=max(e_val)/cond_num_threshold, a_max=None)
-    new_corr = e_vec@(np.diag(new_e_val)@e_vec.T)
+    new_corr = (new_e_val*e_vec)@e_vec.T
     return (new_corr*d)*d[:, np.newaxis]
diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index c0677ddd36..8f8b5ada7d 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -872,8 +872,6 @@ def parse_perform_covmat_reg(self, do_reg: bool):
         """Parse the `regularize_covmat` key from runcard"""
         if do_reg:
             log.info("Regularizing covariance matrices")
-        else:
-            log.info("Not regularizing covariance matrices")
         return do_reg
 
 

From f47f4de7ffc14a3b4e8f6e46e5fead75e8c85f20 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Thu, 19 Sep 2019 12:16:12 +0100
Subject: [PATCH 8/8] removed changes to n3fit

---
 n3fit/src/n3fit/fit.py       | 15 +++++++++------
 n3fit/src/n3fit/io/reader.py |  5 +++--
 n3fit/src/n3fit/n3fit.py     |  5 ++---
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/n3fit/src/n3fit/fit.py b/n3fit/src/n3fit/fit.py
index 99ec6a1403..c3cb84375c 100644
--- a/n3fit/src/n3fit/fit.py
+++ b/n3fit/src/n3fit/fit.py
@@ -15,7 +15,7 @@
 def fit(
     fitting,
     experiments,
-    experiments_covariance_matrix,
+    t0set,
     replica,
     replica_path,
     output_path,
@@ -46,9 +46,8 @@ def fit(
         # Arguments:
             - `fitting`: dictionary with the hyperparameters of the fit
             - `experiments`: vp list of experiments to be included in the fit
-            - `experiments_covariance_matrix`: covariance matrices collected
-                                               over experiments
-            - `replica`: a list of replica numbers to run over (typically just one)
+            - `t0set`: t0set name
+            - `replica`: a list of replica numbers to run over (tipycally just one)
             - `replica_path`: path to the output of this run
             - `output_path`: name of the fit
             - `theorid`: theory id number
@@ -91,6 +90,10 @@ def fit(
     else:
         status_ok = "ok"
 
+    # Loading t0set from LHAPDF
+    if t0set is not None:
+        t0pdfset = t0set.load_t0()
+
     # First set the seed variables for
     # - Tr/Vl split
     # - Neural Network initialization
@@ -129,9 +132,9 @@ def fit(
     ##############################################################################
     all_exp_infos = [[] for _ in replica]
     # First loop over the experiments
-    for exp, exp_cov in zip(experiments, experiments_covariance_matrix):
+    for exp in experiments:
         log.info("Loading experiment: {0}".format(exp))
-        all_exp_dicts = reader.common_data_reader(exp, exp_cov, replica_seeds=mcseeds, trval_seeds=trvalseeds)
+        all_exp_dicts = reader.common_data_reader(exp, t0pdfset, replica_seeds=mcseeds, trval_seeds=trvalseeds)
         for i, exp_dict in enumerate(all_exp_dicts):
             all_exp_infos[i].append(exp_dict)
 
diff --git a/n3fit/src/n3fit/io/reader.py b/n3fit/src/n3fit/io/reader.py
index d7a11ae3db..a155e7959c 100644
--- a/n3fit/src/n3fit/io/reader.py
+++ b/n3fit/src/n3fit/io/reader.py
@@ -8,7 +8,6 @@
 from NNPDF import RandomGenerator
 from validphys.core import ExperimentSpec as vp_Exp
 from validphys.core import DataSetSpec as vp_Dataset
-from validphys.calcutils import regularize_covmat
 
 
 def make_tr_val_mask(datasets, exp_name, seed):
@@ -173,7 +172,7 @@ def common_data_reader_experiment(experiment_c, experiment_spec):
     return parsed_datasets
 
 
-def common_data_reader(spec, covmat, replica_seeds=None, trval_seeds=None):
+def common_data_reader(spec, t0pdfset, replica_seeds=None, trval_seeds=None):
     """
     Wrapper to read the information from validphys object
     This function receives either a validphyis experiment or dataset objects
@@ -215,6 +214,7 @@ def common_data_reader(spec, covmat, replica_seeds=None, trval_seeds=None):
     spec_c = spec.load()
     ndata = spec_c.GetNData()
     expdata_true = spec_c.get_cv().reshape(1, ndata)
+    spec_c.SetT0(t0pdfset)
     base_mcseed = int(hashlib.sha256(str(spec).encode()).hexdigest(), 16) % 10 ** 8
 
     if replica_seeds:
@@ -246,6 +246,7 @@ def common_data_reader(spec, covmat, replica_seeds=None, trval_seeds=None):
         )
 
     exp_name = spec.name
+    covmat = spec_c.get_covmat()
 
     # Now it is time to build the masks for the training validation split
     all_dict_out = []
diff --git a/n3fit/src/n3fit/n3fit.py b/n3fit/src/n3fit/n3fit.py
index f22a1ec327..b9b8a12a2a 100755
--- a/n3fit/src/n3fit/n3fit.py
+++ b/n3fit/src/n3fit/n3fit.py
@@ -17,15 +17,14 @@
 from reportengine import colors
 from reportengine.compat import yaml
 
-# Set use_t0 true here for backwards compatibility. Causes fit covmats to
-# use t0 prescription
+
 N3FIT_FIXED_CONFIG = dict(
     use_cuts = 'internal',
     use_t0 = True,
     actions_ = ['datacuts::theory fit']
 )
 
-N3FIT_PROVIDERS = ["n3fit.fit", "validphys.results"]
+N3FIT_PROVIDERS = ["n3fit.fit"]
 
 log = logging.getLogger(__name__)