Skip to content
67 changes: 67 additions & 0 deletions validphys2/src/validphys/calcutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,70 @@ def get_df_block(matrix: pd.DataFrame, key: str, level):
key, level=level, axis=1).values
return block

def regularize_covmat(covmat: np.ndarray, cond_num_threshold: float = 500):
    """Given a covariance matrix, perform a regularization on
    the corresponding correlation matrix and use that to return a new
    regularized covariance matrix:

        corr_ij = cov_ij / (sigma_i * sigma_j)

    where sigma_i = sqrt(diag(cov)_i). The correlation
    matrix is then regularized by clipping its smallest eigenvalues to a
    minimum acceptable value given by

        max(eigenvalues of corr_ij) / cond_num_threshold

    and finally the process of obtaining a correlation matrix from a
    covariance matrix is inverted:

        new_cov_ij = (regularized corr)_ij * sigma_i * sigma_j

    Parameters
    ----------
    covmat : np.ndarray
        a covariance matrix which is to be regularized.
    cond_num_threshold : float
        The acceptable condition number of the correlation matrix, by default
        set to 500.

    Returns
    -------
    new_covmat : np.ndarray
        A new covariance matrix which has been regularized according to the
        prescription above.

    Notes
    -----
    (regularized corr)_ij is not technically a correlation matrix since it
    might not have 1s on the diagonal.

    TODO: is `regularized corr` the nearest matrix with condition number of
    `cond_num_threshold` according to Frobenius distance?

    Examples
    --------

    >>> from validphys.calcutils import regularize_covmat
    >>> import numpy as np
    >>> import scipy.linalg as la
    >>> np.random.seed(0)
    >>> s = np.random.rand(3,3)
    >>> cov = s@s.T
    >>> cov
    array([[1.17601578, 0.99135397, 1.45880096],
           [0.99135397, 0.89356028, 1.23866193],
           [1.45880096, 1.23866193, 1.91538757]])
    >>> new_cov = regularize_covmat(cov, 100)
    >>> new_cov
    array([[1.18115619, 0.98945605, 1.45496881],
           [0.98945605, 0.89426102, 1.24007682],
           [1.45496881, 1.24007682, 1.9182444 ]])
    >>> print(np.linalg.norm(new_cov-cov))
    0.008697992044783345
    """
    # Standard deviations from the diagonal of the covariance matrix.
    d = np.sqrt(np.diag(covmat))
    # Covariance -> correlation: divide out sigma_i * sigma_j.
    corr = (covmat/d)/d[:, np.newaxis]
    # eigh returns eigenvalues in ascending order, so the largest is e_val[-1].
    e_val, e_vec = la.eigh(corr)
    # Clip the smallest eigenvalues so the condition number of the
    # regularized matrix does not exceed cond_num_threshold.
    new_e_val = np.clip(e_val, a_min=e_val[-1]/cond_num_threshold, a_max=None)
    # Reconstruct the regularized "correlation" matrix from its eigensystem.
    new_corr = (new_e_val*e_vec)@e_vec.T
    # Invert the correlation transformation: correlation -> covariance.
    return (new_corr*d)*d[:, np.newaxis]
5 changes: 5 additions & 0 deletions validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,11 @@ def parse_groupby(self, grouping: str):
"correctly?")
return grouping

def parse_perform_covmat_reg(self, do_reg: bool):
    """Parse the ``perform_covmat_reg`` key from the runcard.

    Returns the flag unchanged; when it is true, an informational message
    is logged so the user knows regularization will be applied by the
    covariance-matrix providers.
    """
    if do_reg:
        log.info("Regularizing covariance matrices")
    return do_reg


class Config(report.Config, CoreConfig, ParamfitsConfig):
Expand Down
207 changes: 166 additions & 41 deletions validphys2/src/validphys/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
check_dataset_cuts_match_theorycovmat,
check_experiment_cuts_match_theorycovmat)
from validphys.core import DataSetSpec, PDF, ExperimentSpec
from validphys.calcutils import (all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
get_df_block)
from validphys.calcutils import (
all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
get_df_block, regularize_covmat)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -235,30 +236,33 @@ def experiments_covmat_no_table(
for experiment, experiment_covmat in zip(
experiments, experiments_covariance_matrix):
name = experiment.name
mat, _ = experiment_covmat
df.loc[[name],[name]] = mat
df.loc[[name],[name]] = experiment_covmat
return df

@table
def experiments_covmat(experiments_covmat_no_table):
    """Duplicate of ``experiments_covmat_no_table`` but with a table
    decorator, so the combined covariance matrix of all experiments can be
    used as a table action. The input is returned unchanged.
    """
    return experiments_covmat_no_table

# Collect the Cholesky (square-root) covariance matrix of each experiment
# into a list, one entry per experiment in ``experiments``.
exps_sqrt_covmat = collect(
    'experiment_sqrt_covariance_matrix',
    ('experiments',)
)

@table
def experiments_sqrtcovmat(
        experiments, experiments_index, exps_sqrt_covmat):
    """Like experiments_covmat, but dump the lower triangular part of the
    Cholesky decomposition as used in the fit. The upper part indices are set
    to zero.

    Parameters
    ----------
    experiments : list
        experiments whose names label the diagonal blocks of the table
    experiments_index : pandas index
        index labelling the rows and columns of the full table
    exps_sqrt_covmat : list
        collected ``experiment_sqrt_covariance_matrix`` results, one per
        experiment, in the same order as ``experiments``
    """
    data = np.zeros((len(experiments_index), len(experiments_index)))
    df = pd.DataFrame(data, index=experiments_index, columns=experiments_index)
    for experiment, exp_sqrt_covmat in zip(experiments, exps_sqrt_covmat):
        name = experiment.name
        # NOTE(review): this zeroes the upper triangle of the collected
        # array in place — confirm no other provider relies on the
        # unmodified Cholesky factor.
        exp_sqrt_covmat[np.triu_indices_from(exp_sqrt_covmat, k=1)] = 0
        df.loc[[name], [name]] = exp_sqrt_covmat
    return df

@table
Expand All @@ -272,9 +276,8 @@ def experiments_invcovmat(
for experiment, experiment_covmat in zip(
experiments, experiments_covariance_matrix):
name = experiment.name
cov, _ = experiment_covmat
#Improve this inversion if this method tuns out to be important
invcov = la.inv(cov)
invcov = la.inv(experiment_covmat)
df.loc[[name],[name]] = invcov
return df

Expand Down Expand Up @@ -359,12 +362,68 @@ def closure_pseudodata_replicas(experiments, pdf, nclosure:int,


@check_dataset_cuts_match_theorycovmat
def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None)) = None):
"""Returns a tuple of Covariance matrix and sqrt covariance matrix for the given dataset
which includes theory contribution from scale variations if `use_theorycovmat` is True and
an appropriate fit from which the covariance matrix will be loaded is given

if t0set is specified then t0 predictions will be used to construct the covariance matrix.
def covariance_matrix(
dataset:DataSetSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
"""Returns the covariance matrix for a given `dataset`. By default the
data central values will be used to calculate the multiplicative contributions
to the covariance matrix.

The matrix can instead be constructed with
the t0 proceedure if the user sets `use_t0` to True and gives a
`t0pdfset`. In this case the central predictions from the `t0pdfset` will be
used to calculate the multiplicative contributions to the covariance matrix.
More information on the t0 procedure can be found here:
https://arxiv.org/abs/0912.2276

The user can specify `use_fit_thcovmat_if_present` to be True
and provide a corresponding `fit`. If the theory covmat was used in the
corresponding `fit` and the specfied `dataset` was used in the fit then
the theory covariance matrix for this `dataset` will be added in quadrature
to the experimental covariance matrix.

Covariance matrix can be regularized according to
`calcutils.regularize_covmat` if the user specifies `perform_covmat_reg` to
be true. This algorithm sets a minimum threshold for eigenvalues that the
corresponding correlation matrix can have to be:

max(eigenvalue)/condition_number_threshold

by default condition_number_threshold is set to 500

Parameters
----------
dataset : DataSetSpec
object parsed from the `dataset_input` runcard key
fitthcovmat: None or ThCovMatSpec
None if either `use_thcovmat_if_present` is False or if no theory
covariance matrix was used in the corresponding fit
t0set: None or PDF
None if `use_t0` is False or a PDF parsed from `t0pdfset` runcard key
perform_covmat_reg: bool
whether or not to regularize the covariance matrix
condition_number_threshold: number
threshold used to regularize covariance matrix

Returns
-------
covariance_matrix : array
a covariance matrix as a numpy array

Examples
--------

>>> from validphys.api import API
>>> inp = {
'dataset_input': {'ATLASTTBARTOT'},
'theoryid': 52,
'use_cuts': 'no_cuts'
}
>>> cov = API.covariance_matrix(**inp)
TODO: complete example
"""
loaded_data = dataset.load()

Expand All @@ -374,18 +433,58 @@ def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None))
log.debug("Setting T0 predictions for %s" % dataset)
loaded_data.SetT0(t0set.load_t0())

covmat = loaded_data.get_covmat()
if fitthcovmat:
loaded_thcov = fitthcovmat.load()
covmat = get_df_block(loaded_thcov, dataset.name, level=1) + loaded_data.get_covmat()
sqrtcovmat = np.linalg.cholesky(covmat)
else:
covmat = loaded_data.get_covmat()
sqrtcovmat = loaded_data.get_sqrtcovmat()
return covmat, sqrtcovmat
covmat += get_df_block(loaded_thcov, dataset.name, level=1)
if perform_covmat_reg:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
)
return covmat

# Collect the covariance matrix of every dataset, iterating over each
# dataset within each experiment.
datasets_covariance_matrix = collect(
    'covariance_matrix',
    ('experiments', 'experiment',)
)

def sqrt_covariance_matrix(covariance_matrix: np.array):
    """Compute the lower-triangular Cholesky factor of ``covariance_matrix``.

    Parameters
    ----------
    covariance_matrix: array
        a positive definite covariance matrix

    Returns
    -------
    sqrt_covariance_matrix : array
        lower-triangular matrix ``L`` with ``L @ L.T`` equal to the input

    Notes
    -----
    The lower triangular factor is useful for efficient calculation of the
    chi^2.

    Examples
    --------

    >>> import numpy as np
    >>> from validphys.results import sqrt_covariance_matrix
    >>> a = np.array([[1, 0.5], [0.5, 1]])
    >>> sqrt_covariance_matrix(a)
    array([[1.       , 0.       ],
           [0.5      , 0.8660254]])
    """
    lower_factor = la.cholesky(covariance_matrix, lower=True)
    return lower_factor

@check_experiment_cuts_match_theorycovmat
def experiment_covariance_matrix(
experiment: ExperimentSpec, fitthcovmat, t0set:(PDF, type(None)) = None):
experiment: ExperimentSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
"""Like `covariance_matrix` except for an experiment"""
loaded_data = experiment.load()

Expand All @@ -395,18 +494,30 @@ def experiment_covariance_matrix(
log.debug("Setting T0 predictions for %s" % experiment)
loaded_data.SetT0(t0set.load_t0())

covmat = loaded_data.get_covmat()

if fitthcovmat:
loaded_thcov = fitthcovmat.load()
ds_names = loaded_thcov.index.get_level_values(1)
indices = np.in1d(ds_names, [ds.name for ds in experiment.datasets]).nonzero()[0]
covmat = loaded_thcov.iloc[indices, indices].values + loaded_data.get_covmat()
sqrtcovmat = np.linalg.cholesky(covmat)
else:
covmat = loaded_data.get_covmat()
sqrtcovmat = loaded_data.get_sqrtcovmat()
return covmat, sqrtcovmat

def results(dataset:(DataSetSpec), pdf:PDF, covariance_matrix):
covmat += loaded_thcov.iloc[indices, indices].values
if perform_covmat_reg:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
)
return covmat

def experiment_sqrt_covariance_matrix(experiment_covariance_matrix):
    """Like ``sqrt_covariance_matrix`` but for an experiment: returns the
    lower-triangular Cholesky factor of the experiment's covariance matrix."""
    return sqrt_covariance_matrix(experiment_covariance_matrix)

def results(
dataset:(DataSetSpec),
pdf:PDF,
covariance_matrix,
sqrt_covariance_matrix
):
"""Tuple of data and theory results for a single pdf. The data will have an associated
covariance matrix, which can include a contribution from the theory covariance matrix which
is constructed from scale variation. The inclusion of this covariance matrix by default is used
Expand All @@ -416,17 +527,30 @@ def results(dataset:(DataSetSpec), pdf:PDF, covariance_matrix):
An experiment is also allowed.
(as a result of the C++ code layout)."""
data = dataset.load()
return (DataResult(data, *covariance_matrix),
return (DataResult(data, covariance_matrix, sqrt_covariance_matrix),
ThPredictionsResult.from_convolution(pdf, dataset, loaded_data=data))

def experiment_results(
        experiment,
        pdf:PDF,
        experiment_covariance_matrix,
        experiment_sqrt_covariance_matrix):
    """Like `results` but for a whole experiment.

    All arguments are forwarded unchanged to ``results``; see that provider
    for details of the returned (data, theory) tuple.
    """
    return results(
        experiment,
        pdf,
        experiment_covariance_matrix,
        experiment_sqrt_covariance_matrix
    )

#It's better to duplicate a few lines than to complicate the logic of
#``results`` to support this.
#TODO: The above comment doesn't make sense after adding T0. Deprecate this
def pdf_results(dataset:(DataSetSpec, ExperimentSpec), pdfs:Sequence, covariance_matrix:tuple):
def pdf_results(
dataset:(DataSetSpec, ExperimentSpec),
pdfs:Sequence,
covariance_matrix,
sqrt_covariance_matrix):
"""Return a list of results, the first for the data and the rest for
each of the PDFs."""

Expand All @@ -438,22 +562,23 @@ def pdf_results(dataset:(DataSetSpec, ExperimentSpec), pdfs:Sequence, covarianc
th_results.append(th_result)


return (DataResult(data, *covariance_matrix), *th_results)
return (DataResult(data, covariance_matrix, sqrt_covariance_matrix), *th_results)

@require_one('pdfs', 'pdf')
@remove_outer('pdfs', 'pdf')
def one_or_more_results(dataset:(DataSetSpec, ExperimentSpec),
                        covariance_matrix,
                        sqrt_covariance_matrix,
                        pdfs:(type(None), Sequence)=None,
                        pdf:(type(None), PDF)=None):
    """Generate a list of results, where the first element is the data values,
    and the next is either the prediction for pdf or for each of the pdfs.
    Which of the two is selected intelligently depending on the namespace,
    when executing as an action."""
    if pdf:
        return results(dataset, pdf, covariance_matrix, sqrt_covariance_matrix)
    else:
        return pdf_results(dataset, pdfs, covariance_matrix, sqrt_covariance_matrix)
    # NOTE(review): unreachable — both branches above return, and
    # @require_one already guarantees one of `pdf`/`pdfs` is provided.
    raise ValueError("Either 'pdf' or 'pdfs' is required")


Expand Down