Skip to content
110 changes: 59 additions & 51 deletions validphys2/src/validphys/calcutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,70 +149,78 @@ def get_df_block(matrix: pd.DataFrame, key: str, level):
key, level=level, axis=1).values
return block

def regularize_covmat(covmat: np.array, norm_threshold=4):
    """Given a covariance matrix, performs a regularization which is
    equivalent to performing ``regularize_l2`` on the sqrt of ``covmat``:
    the l2 norm of the inverse of the correlation matrix calculated from
    ``covmat`` is set to be less than or equal to ``norm_threshold``. If the
    input covmat already fulfills this criterion it is returned unchanged.

    Parameters
    ----------
    covmat : array
        a covariance matrix which is to be regularized.
    norm_threshold : float
        The acceptable l2 norm of the sqrt correlation matrix, by default
        set to 4.

    Returns
    -------
    new_covmat : array
        A new covariance matrix which has been regularized according to
        the prescription above.

    Notes
    -----
    The regularized correlation matrix is not technically a correlation
    matrix since it might not have 1s on the diagonal.
    """
    # The threshold is defined for the sqrt of the correlation matrix;
    # square it up since here we work with the covariance matrix directly.
    sqr_threshold = norm_threshold**2
    # Standard deviations, kept as a column vector for broadcasting.
    d = np.sqrt(np.diag(covmat))[:, np.newaxis]
    corr = covmat / d / d.T
    e_val, e_vec = la.eigh(corr)
    # eigh returns eigenvalues in ascending order, so e_val[0] is the
    # smallest; if its inverse is already within threshold, nothing to do.
    if 1 / e_val[0] <= sqr_threshold:
        return covmat
    # Clip the unstable small eigenvalues and reassemble the covariance
    # matrix from the regularized correlation matrix.
    new_e_val = np.clip(e_val, a_min=1/sqr_threshold, a_max=None)
    return ((e_vec * new_e_val) @ e_vec.T) * d * d.T
def regularize_l2(sqrtcov, norm_threshold=4):
r"""Return a regularized version of `sqrtcov`.

Examples
--------
Given `sqrtcov` an (N, nsys) matrix, such that it's
gram matrix is the covariance matrix (`covmat = sqrtcov@sqrtcov.T`), first
decompose it like ``sqrtcov = D@A``, where `D` is a positive diagonal matrix
of standard deviations and `A` is the "square root" of the correlation
matrix, ``corrmat = A@A.T``. Then produce a new version of `A` which removes
the unstable behaviour and assemble a new square root covariance matrix,
which is returned.

>>> from validphys.calcutils import regularize_covmat
>>> import numpy as np
>>> import scipy.linalg as la
>>> np.random.seed(0)
>>> s = np.random.rand(3,3)
>>> cov = s@s.T
>>> cov
array([[1.17601578, 0.99135397, 1.45880096],
[0.99135397, 0.89356028, 1.23866193],
[1.45880096, 1.23866193, 1.91538757]])
>>> new_cov = regularize_covmat(cov, 100)
>>> new_cov
array([[1.18115619, 0.98945605, 1.45496881],
[0.98945605, 0.89426102, 1.24007682],
[1.45496881, 1.24007682, 1.9182444 ]])
>>> print(np.linalg.norm(new_cov-cov))
0.008697992044783345
The stability condition is controlled by `norm_threshold`. It is

.. math::

\left\Vert A+ \right\Vert_L2
\leq \frac{1}{norm_threshold}

A+ is the pseudoinverse of A, `norm_threshold` roughly corresponds to the
sqrt of the maximimum relative uncertainty in any systematic.

Parameters
----------

sqrtcov : 2d array
An (N, nsys) matrix specifying the uncertainties.
norm_threshold : float
The tolerance for the regularization.

Returns
-------

newsqrtcov : 2d array
A regularized version of `sqrtcov`.
"""
d = np.sqrt(np.diag(covmat))
corr = (covmat/d)/d[:, np.newaxis]
e_val, e_vec = la.eigh(corr)
new_e_val = np.clip(e_val, a_min=max(e_val)/cond_num_threshold, a_max=None)
new_corr = (new_e_val*e_vec)@e_vec.T
return (new_corr*d)*d[:, np.newaxis]

d = np.sqrt(np.sum(sqrtcov ** 2, axis=1))[:, np.newaxis]
sqrtcorr = sqrtcov / d
u, s, vt = la.svd(sqrtcorr, full_matrices=False)
if 1 / s[-1] <= norm_threshold:
return sqrtcov
snew = np.clip(s, a_min=1/norm_threshold, a_max=None)
return u * (snew * d) @ vt
5 changes: 5 additions & 0 deletions validphys2/src/validphys/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,8 @@ def check_speclabels_different(dataspecs_speclabel):
def check_two_dataspecs(dataspecs):
l = len(dataspecs)
check(l == 2, f"Expecting exactly 2 dataspecs, not {l}")

@make_argcheck
def check_norm_threshold(norm_threshold):
    """Ensure that an explicit value was provided for ``norm_threshold``.

    Actions carrying this check compare regularized and unregularized
    covariance matrices, which is only meaningful when regularization is
    actually requested, i.e. ``norm_threshold`` is not None.
    """
    check(norm_threshold is not None)
27 changes: 22 additions & 5 deletions validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,11 +871,28 @@ def parse_groupby(self, grouping: str):
"correctly?")
return grouping

def parse_norm_threshold(self, val: (numbers.Number, type(None))):
    """The threshold to use for covariance matrix regularization: sets
    the maximum l2 norm of the inverse covariance matrix, by clipping
    the smallest eigenvalues.

    If norm_threshold is set to None, then no covmat regularization is
    performed.

    """
    if val is not None:
        # Reject non-positive thresholds: the clipping bound 1/val**2
        # would be meaningless (or a division by zero) otherwise.
        if val <= 0:
            raise ConfigError("norm_threshold must be greater than zero.")
        log.info(
            f"Regularizing covariance matrices with norm threshold: {val}")
    return val

def produce_no_covmat_reg(self):
    """Explicitly set ``norm_threshold`` to None so that no covariance
    matrix regularization is performed.

    """
    return dict(norm_threshold=None)

def parse_filter_rules(self, filter_rules: (list, type(None))):
"""A list of filter rules. See https://docs.nnpdf.science/vp/filters.html
Expand Down
134 changes: 116 additions & 18 deletions validphys2/src/validphys/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@
from reportengine.table import table
from reportengine import collect

from validphys.checks import (check_cuts_considered, check_pdf_is_montecarlo,
check_speclabels_different, check_two_dataspecs,
check_dataset_cuts_match_theorycovmat,
check_experiment_cuts_match_theorycovmat)
from validphys.checks import (
check_cuts_considered,
check_pdf_is_montecarlo,
check_speclabels_different,
check_two_dataspecs,
check_dataset_cuts_match_theorycovmat,
check_experiment_cuts_match_theorycovmat,
check_norm_threshold,
)
from validphys.core import DataSetSpec, PDF, ExperimentSpec
from validphys.calcutils import (
all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
Expand Down Expand Up @@ -367,8 +372,7 @@ def covariance_matrix(
dataset:DataSetSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
norm_threshold=None):
"""Returns the covariance matrix for a given `dataset`. By default the
data central values will be used to calculate the multiplicative contributions
to the covariance matrix.
Expand All @@ -387,13 +391,15 @@ def covariance_matrix(
to the experimental covariance matrix.

Covariance matrix can be regularized according to
`calcutils.regularize_covmat` if the user specifies `perform_covmat_reg` to
be true. This algorithm sets a minimum threshold for eigenvalues that the
corresponding correlation matrix can have to be:
`calcutils.regularize_covmat` if the user specifies `norm_threshold`. This
algorithm sets a minimum threshold for eigenvalues that the corresponding
correlation matrix can have to be:

max(eigenvalue)/condition_number_threshold
1/(norm_threshold)^2

by default condition_number_threshold is set to 500
which has the effect of limiting the L2 norm of the inverse of the correlation
matrix. By default norm_threshold is None, which means no regularization
is performed.

Parameters
----------
Expand All @@ -406,7 +412,7 @@ def covariance_matrix(
None if `use_t0` is False or a PDF parsed from `t0pdfset` runcard key
perform_covmat_reg: bool
whether or not to regularize the covariance matrix
condition_number_threshold: number
norm_threshold: number
threshold used to regularize covariance matrix

Returns
Expand Down Expand Up @@ -438,10 +444,10 @@ def covariance_matrix(
if fitthcovmat:
loaded_thcov = fitthcovmat.load()
covmat += get_df_block(loaded_thcov, dataset.name, level=1)
if perform_covmat_reg:
if norm_threshold is not None:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
norm_threshold=norm_threshold
)
return covmat

Expand Down Expand Up @@ -484,8 +490,7 @@ def experiment_covariance_matrix(
experiment: ExperimentSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
norm_threshold=None):
"""Like `covariance_matrix` except for an experiment"""
loaded_data = experiment.load()

Expand All @@ -502,10 +507,10 @@ def experiment_covariance_matrix(
ds_names = loaded_thcov.index.get_level_values(1)
indices = np.in1d(ds_names, [ds.name for ds in experiment.datasets]).nonzero()[0]
covmat += loaded_thcov.iloc[indices, indices].values
if perform_covmat_reg:
if norm_threshold is not None:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
norm_threshold=norm_threshold
)
return covmat

Expand Down Expand Up @@ -1068,6 +1073,99 @@ def experiments_central_values(experiment_result_table):
central_theory_values = experiment_result_table["theory_central"]
return central_theory_values

dataspecs_each_dataset_chi2 = collect("each_dataset_chi2", ("dataspecs",))
each_dataset = collect("dataset", ("experiments", "experiment"))
dataspecs_each_dataset = collect("each_dataset", ("dataspecs",))

@table
@check_speclabels_different
def dataspecs_dataset_chi2_difference_table(
        dataspecs_each_dataset, dataspecs_each_dataset_chi2, dataspecs_speclabel):
    r"""Tabulate, for every dataset of every dataspec, the deviation of the
    chi2 from its expectation in units of the expected chi2 standard
    deviation, given by

    chi2_diff = (\chi2 - N)/sqrt(2N)

    """
    col_labels = [r"$(\chi^2 - N)/\sqrt{2N}$"]
    spec_frames = []
    triples = zip(
        dataspecs_speclabel, dataspecs_each_dataset, dataspecs_each_dataset_chi2)
    for speclabel, spec_datasets, spec_chi2s in triples:
        rows = []
        for ds, ds_chi2 in zip(spec_datasets, spec_chi2s):
            n = ds_chi2.ndata
            stat = (ds_chi2.central_result.mean() - n) / np.sqrt(2 * n)
            rows.append({"dataset": str(ds), "chi2_stat": stat})
        frame = pd.DataFrame.from_records(
            rows,
            columns=("dataset", "chi2_stat"),
            index=("dataset",),
        )
        # Label the single column with the dataspec it came from.
        frame.columns = pd.MultiIndex.from_product(([speclabel], col_labels))
        spec_frames.append(frame)
    return pd.concat(spec_frames, axis=1)

datasets_covmat_no_reg = collect(
"covariance_matrix", ("experiments", "experiment", "no_covmat_reg"))
datasets_covmat_reg = collect(
"covariance_matrix", ("experiments", "experiment",))

@table
@check_norm_threshold
def datasets_covariance_matrix_differences_table(
        each_dataset, datasets_covmat_no_reg, datasets_covmat_reg, norm_threshold):
    """For each dataset calculate and tabulate two max differences upon
    regularization given a value for `norm_threshold`:

    - max relative difference to the diagonal of the covariance matrix (%)
    - max absolute difference to the correlation matrix of each covmat

    """
    def _corrmat(cov):
        # Correlation matrix corresponding to a covariance matrix.
        sigma = np.sqrt(np.diag(cov))
        return cov / sigma[:, np.newaxis] / sigma[np.newaxis, :]

    rows = []
    for ds, cov_reg, cov_noreg in zip(
            each_dataset, datasets_covmat_reg, datasets_covmat_no_reg):
        variance_ratio = np.diag(cov_reg) / np.diag(cov_noreg)
        corr_gap = abs(_corrmat(cov_reg) - _corrmat(cov_noreg))
        rows.append(dict(
            dataset=str(ds),
            covdiff=np.max(abs(variance_ratio - 1)) * 100,  # as a percentage
            corrdiff=np.max(corr_gap),
        ))
    table_df = pd.DataFrame.from_records(
        rows,
        columns=("dataset", "covdiff", "corrdiff"),
        index=("dataset",),
    )
    table_df.columns = ["Variance rel. diff. (%)", "Correlation max abs. diff."]
    return table_df

dataspecs_covmat_diff_tables = collect(
"datasets_covariance_matrix_differences_table", ("dataspecs",))

@check_speclabels_different
@table
def dataspecs_datasets_covmat_differences_table(
    dataspecs_speclabel, dataspecs_covmat_diff_tables
):
    """For each dataspec calculate and tabulate the two covmat differences
    described in `datasets_covariance_matrix_differences_table`
    (max relative difference in variance and max absolute correlation difference)

    """
    combined = pd.concat(dataspecs_covmat_diff_tables, axis=1)
    # The per-dataspec tables share the same column names; nest them
    # under the dataspec labels instead.
    base_cols = combined.columns.get_level_values(0).unique()
    combined.columns = pd.MultiIndex.from_product((dataspecs_speclabel, base_cols))
    return combined

experiments_chi2 = collect(abs_chi2_data_experiment, ('experiments',))
each_dataset_chi2 = collect(abs_chi2_data, ('experiments', 'experiment'))

Expand Down
Loading