Skip to content
110 changes: 59 additions & 51 deletions validphys2/src/validphys/calcutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,70 +149,78 @@ def get_df_block(matrix: pd.DataFrame, key: str, level):
key, level=level, axis=1).values
return block

def regularize_covmat(covmat: np.array, norm_threshold=4):
    """Given a covariance matrix, performs a regularization which is
    equivalent to performing ``regularize_l2`` on the sqrt of ``covmat``:
    the l2 norm of the inverse of the correlation matrix calculated from
    ``covmat`` is set to be less than or equal to ``norm_threshold``. If the
    input covmat already fulfills this criterion it is returned unchanged.

    Parameters
    ----------
    covmat : array
        a covariance matrix which is to be regularized.
    norm_threshold : float
        The acceptable l2 norm of the sqrt correlation matrix, by default
        set to 4.

    Returns
    -------
    new_covmat : array
        A new covariance matrix which has been regularized according to
        the prescription above.

    Notes
    -----
    The regularized correlation matrix is not technically a correlation
    matrix since it might not have 1s on the diagonal.
    """
    # The threshold is defined for the sqrt of the correlation matrix;
    # square it up since here we work with the covariance matrix directly.
    sqr_threshold = norm_threshold**2
    # Standard deviations, kept as a column vector for broadcasting.
    d = np.sqrt(np.diag(covmat))[:, np.newaxis]
    corr = covmat / d / d.T
    e_val, e_vec = la.eigh(corr)
    # eigh returns eigenvalues in ascending order, so e_val[0] is the
    # smallest; if its inverse is already within threshold, nothing to do.
    if 1 / e_val[0] <= sqr_threshold:
        return covmat
    # Clip the unstable small eigenvalues and reassemble the covariance
    # matrix from the regularized correlation matrix.
    new_e_val = np.clip(e_val, a_min=1/sqr_threshold, a_max=None)
    return ((e_vec * new_e_val) @ e_vec.T) * d * d.T
def regularize_l2(sqrtcov, norm_threshold=4):
r"""Return a regularized version of `sqrtcov`.

Examples
--------
Given `sqrtcov` an (N, nsys) matrix, such that it's
gram matrix is the covariance matrix (`covmat = sqrtcov@sqrtcov.T`), first
decompose it like ``sqrtcov = D@A``, where `D` is a positive diagonal matrix
of standard deviations and `A` is the "square root" of the correlation
matrix, ``corrmat = A@A.T``. Then produce a new version of `A` which removes
the unstable behaviour and assemble a new square root covariance matrix,
which is returned.

>>> from validphys.calcutils import regularize_covmat
>>> import numpy as np
>>> import scipy.linalg as la
>>> np.random.seed(0)
>>> s = np.random.rand(3,3)
>>> cov = s@s.T
>>> cov
array([[1.17601578, 0.99135397, 1.45880096],
[0.99135397, 0.89356028, 1.23866193],
[1.45880096, 1.23866193, 1.91538757]])
>>> new_cov = regularize_covmat(cov, 100)
>>> new_cov
array([[1.18115619, 0.98945605, 1.45496881],
[0.98945605, 0.89426102, 1.24007682],
[1.45496881, 1.24007682, 1.9182444 ]])
>>> print(np.linalg.norm(new_cov-cov))
0.008697992044783345
The stability condition is controlled by `norm_threshold`. It is

.. math::

\left\Vert A+ \right\Vert_L2
\leq \frac{1}{norm_threshold}

A+ is the pseudoinverse of A, `norm_threshold` roughly corresponds to the
sqrt of the maximimum relative uncertainty in any systematic.

Parameters
----------

sqrtcov : 2d array
An (N, nsys) matrix specifying the uncertainties.
norm_threshold : float
The tolerance for the regularization.

Returns
-------

newsqrtcov : 2d array
A regularized version of `sqrtcov`.
"""
d = np.sqrt(np.diag(covmat))
corr = (covmat/d)/d[:, np.newaxis]
e_val, e_vec = la.eigh(corr)
new_e_val = np.clip(e_val, a_min=max(e_val)/cond_num_threshold, a_max=None)
new_corr = (new_e_val*e_vec)@e_vec.T
return (new_corr*d)*d[:, np.newaxis]

d = np.sqrt(np.sum(sqrtcov ** 2, axis=1))[:, np.newaxis]
sqrtcorr = sqrtcov / d
u, s, vt = la.svd(sqrtcorr, full_matrices=False)
if 1 / s[-1] <= norm_threshold:
return sqrtcov
snew = np.clip(s, a_min=1/norm_threshold, a_max=None)
return u * (snew * d) @ vt
5 changes: 5 additions & 0 deletions validphys2/src/validphys/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,8 @@ def check_speclabels_different(dataspecs_speclabel):
def check_two_dataspecs(dataspecs):
l = len(dataspecs)
check(l == 2, f"Expecting exactly 2 dataspecs, not {l}")

@make_argcheck
def check_norm_threshold(norm_threshold):
    """Ensure that an explicit value was provided for ``norm_threshold``.

    Actions carrying this check compare regularized and unregularized
    covariance matrices, which is only meaningful when regularization is
    actually requested, i.e. ``norm_threshold`` is not None.
    """
    check(norm_threshold is not None)
27 changes: 22 additions & 5 deletions validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,11 +871,28 @@ def parse_groupby(self, grouping: str):
"correctly?")
return grouping

def parse_norm_threshold(self, val: (numbers.Number, type(None))):
    """The threshold to use for covariance matrix regularization: sets
    the maximum l2 norm of the inverse covariance matrix, by clipping
    the smallest eigenvalues.

    If norm_threshold is set to None, then no covmat regularization is
    performed.

    """
    if val is not None:
        # Reject non-positive thresholds: the clipping bound 1/val**2
        # would be meaningless (or a division by zero) otherwise.
        if val <= 0:
            raise ConfigError("norm_threshold must be greater than zero.")
        log.info(
            f"Regularizing covariance matrices with norm threshold: {val}")
    return val

def produce_no_covmat_reg(self):
    """Explicitly set ``norm_threshold`` to None so that no covariance
    matrix regularization is performed.

    """
    return dict(norm_threshold=None)

def parse_filter_rules(self, filter_rules: (list, type(None))):
"""A list of filter rules. See https://docs.nnpdf.science/vp/filters.html
Expand Down
134 changes: 116 additions & 18 deletions validphys2/src/validphys/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@
from reportengine.table import table
from reportengine import collect

from validphys.checks import (check_cuts_considered, check_pdf_is_montecarlo,
check_speclabels_different, check_two_dataspecs,
check_dataset_cuts_match_theorycovmat,
check_experiment_cuts_match_theorycovmat)
from validphys.checks import (
check_cuts_considered,
check_pdf_is_montecarlo,
check_speclabels_different,
check_two_dataspecs,
check_dataset_cuts_match_theorycovmat,
check_experiment_cuts_match_theorycovmat,
check_norm_threshold,
)
from validphys.core import DataSetSpec, PDF, ExperimentSpec
from validphys.calcutils import (
all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
Expand Down Expand Up @@ -367,8 +372,7 @@ def covariance_matrix(
dataset:DataSetSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
norm_threshold=None):
"""Returns the covariance matrix for a given `dataset`. By default the
data central values will be used to calculate the multiplicative contributions
to the covariance matrix.
Expand All @@ -387,13 +391,15 @@ def covariance_matrix(
to the experimental covariance matrix.

Covariance matrix can be regularized according to
`calcutils.regularize_covmat` if the user specifies `perform_covmat_reg` to
be true. This algorithm sets a minimum threshold for eigenvalues that the
corresponding correlation matrix can have to be:
`calcutils.regularize_covmat` if the user specifies `norm_threshold`. This
algorithm sets a minimum threshold for eigenvalues that the corresponding
correlation matrix can have to be:

max(eigenvalue)/condition_number_threshold
1/(norm_threshold)^2

by default condition_number_threshold is set to 500
which has the effect of limiting the L2 norm of the inverse of the correlation
matrix. By default norm_threshold is None, which means no regularization
is performed.

Parameters
----------
Expand All @@ -406,7 +412,7 @@ def covariance_matrix(
None if `use_t0` is False or a PDF parsed from `t0pdfset` runcard key
perform_covmat_reg: bool
whether or not to regularize the covariance matrix
condition_number_threshold: number
norm_threshold: number
threshold used to regularize covariance matrix

Returns
Expand Down Expand Up @@ -438,10 +444,10 @@ def covariance_matrix(
if fitthcovmat:
loaded_thcov = fitthcovmat.load()
covmat += get_df_block(loaded_thcov, dataset.name, level=1)
if perform_covmat_reg:
if norm_threshold is not None:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
norm_threshold=norm_threshold
)
return covmat

Expand Down Expand Up @@ -484,8 +490,7 @@ def experiment_covariance_matrix(
experiment: ExperimentSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
norm_threshold=None):
"""Like `covariance_matrix` except for an experiment"""
loaded_data = experiment.load()

Expand All @@ -502,10 +507,10 @@ def experiment_covariance_matrix(
ds_names = loaded_thcov.index.get_level_values(1)
indices = np.in1d(ds_names, [ds.name for ds in experiment.datasets]).nonzero()[0]
covmat += loaded_thcov.iloc[indices, indices].values
if perform_covmat_reg:
if norm_threshold is not None:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
norm_threshold=norm_threshold
)
return covmat

Expand Down Expand Up @@ -1068,6 +1073,99 @@ def experiments_central_values(experiment_result_table):
central_theory_values = experiment_result_table["theory_central"]
return central_theory_values

dataspecs_each_dataset_chi2 = collect("each_dataset_chi2", ("dataspecs",))
each_dataset = collect("dataset", ("experiments", "experiment"))
dataspecs_each_dataset = collect("each_dataset", ("dataspecs",))

@table
@check_speclabels_different
def dataspecs_dataset_chi2_difference_table(
        dataspecs_each_dataset, dataspecs_each_dataset_chi2, dataspecs_speclabel):
    r"""Tabulate, for every dataset of every dataspec, the deviation of the
    chi2 from its expectation in units of the expected chi2 standard
    deviation, given by

    chi2_diff = (\chi2 - N)/sqrt(2N)

    """
    col_labels = [r"$(\chi^2 - N)/\sqrt{2N}$"]
    spec_frames = []
    triples = zip(
        dataspecs_speclabel, dataspecs_each_dataset, dataspecs_each_dataset_chi2)
    for speclabel, spec_datasets, spec_chi2s in triples:
        rows = []
        for ds, ds_chi2 in zip(spec_datasets, spec_chi2s):
            n = ds_chi2.ndata
            stat = (ds_chi2.central_result.mean() - n) / np.sqrt(2 * n)
            rows.append({"dataset": str(ds), "chi2_stat": stat})
        frame = pd.DataFrame.from_records(
            rows,
            columns=("dataset", "chi2_stat"),
            index=("dataset",),
        )
        # Label the single column with the dataspec it came from.
        frame.columns = pd.MultiIndex.from_product(([speclabel], col_labels))
        spec_frames.append(frame)
    return pd.concat(spec_frames, axis=1)

datasets_covmat_no_reg = collect(
"covariance_matrix", ("experiments", "experiment", "no_covmat_reg"))
datasets_covmat_reg = collect(
"covariance_matrix", ("experiments", "experiment",))

@table
@check_norm_threshold
def datasets_covariance_matrix_differences_table(
        each_dataset, datasets_covmat_no_reg, datasets_covmat_reg, norm_threshold):
    """For each dataset calculate and tabulate two max differences upon
    regularization given a value for `norm_threshold`:

    - max relative difference to the diagonal of the covariance matrix (%)
    - max absolute difference to the correlation matrix of each covmat

    """
    def _corrmat(cov):
        # Correlation matrix corresponding to a covariance matrix.
        sigma = np.sqrt(np.diag(cov))
        return cov / sigma[:, np.newaxis] / sigma[np.newaxis, :]

    rows = []
    for ds, cov_reg, cov_noreg in zip(
            each_dataset, datasets_covmat_reg, datasets_covmat_no_reg):
        variance_ratio = np.diag(cov_reg) / np.diag(cov_noreg)
        corr_gap = abs(_corrmat(cov_reg) - _corrmat(cov_noreg))
        rows.append(dict(
            dataset=str(ds),
            covdiff=np.max(abs(variance_ratio - 1)) * 100,  # as a percentage
            corrdiff=np.max(corr_gap),
        ))
    table_df = pd.DataFrame.from_records(
        rows,
        columns=("dataset", "covdiff", "corrdiff"),
        index=("dataset",),
    )
    table_df.columns = ["Variance rel. diff. (%)", "Correlation max abs. diff."]
    return table_df

dataspecs_covmat_diff_tables = collect(
"datasets_covariance_matrix_differences_table", ("dataspecs",))

@check_speclabels_different
@table
def dataspecs_datasets_covmat_differences_table(
    dataspecs_speclabel, dataspecs_covmat_diff_tables
):
    """For each dataspec calculate and tabulate the two covmat differences
    described in `datasets_covariance_matrix_differences_table`
    (max relative difference in variance and max absolute correlation difference)

    """
    combined = pd.concat(dataspecs_covmat_diff_tables, axis=1)
    # The per-dataspec tables share the same column names; nest them
    # under the dataspec labels instead.
    base_cols = combined.columns.get_level_values(0).unique()
    combined.columns = pd.MultiIndex.from_product((dataspecs_speclabel, base_cols))
    return combined

experiments_chi2 = collect(abs_chi2_data_experiment, ('experiments',))
each_dataset_chi2 = collect(abs_chi2_data, ('experiments', 'experiment'))

Expand Down
Loading