Skip to content
67 changes: 67 additions & 0 deletions validphys2/src/validphys/calcutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,70 @@ def get_df_block(matrix: pd.DataFrame, key: str, level):
key, level=level, axis=1).values
return block

def regularize_covmat(covmat: np.ndarray, cond_num_threshold: float = 500):
    """Given a covariance matrix, perform a regularization on
    the corresponding correlation matrix and use that to return a new
    regularized covariance matrix:

        corr_ij = cov_ij / (sigma_i * sigma_j)

    where sigma_i = sqrt(diag(cov)_i). The correlation
    matrix is then regularized by clipping its smallest eigenvalues to a
    minimum acceptable value given by

        max(eigenvalues of corr_ij) / cond_num_threshold

    and finally the process of obtaining a correlation matrix from a
    covariance matrix is inverted:

        new_cov_ij = (regularized corr)_ij * sigma_i * sigma_j

    Parameters
    ----------
    covmat : np.ndarray
        a covariance matrix which is to be regularized.
    cond_num_threshold : float
        The acceptable condition number of the correlation matrix, by default
        set to 500.

    Returns
    -------
    new_covmat : np.ndarray
        A new covariance matrix which has been regularized according to the
        prescription above.

    Notes
    -----
    (regularized corr)_ij is not technically a correlation matrix since it
    might not have 1s on the diagonal.

    TODO: is `regularized corr` the nearest matrix with condition number of
    `cond_num_threshold` according to Frobenius distance?

    Examples
    --------

    >>> from validphys.calcutils import regularize_covmat
    >>> import numpy as np
    >>> import scipy.linalg as la
    >>> np.random.seed(0)
    >>> s = np.random.rand(3,3)
    >>> cov = s@s.T
    >>> cov
    array([[1.17601578, 0.99135397, 1.45880096],
           [0.99135397, 0.89356028, 1.23866193],
           [1.45880096, 1.23866193, 1.91538757]])
    >>> new_cov = regularize_covmat(cov, 100)
    >>> new_cov
    array([[1.18115619, 0.98945605, 1.45496881],
           [0.98945605, 0.89426102, 1.24007682],
           [1.45496881, 1.24007682, 1.9182444 ]])
    >>> print(np.linalg.norm(new_cov-cov))
    0.008697992044783345
    """
    # Standard deviations from the diagonal of the covariance matrix.
    d = np.sqrt(np.diag(covmat))
    # Covariance -> correlation: divide out sigma_i * sigma_j.
    corr = (covmat/d)/d[:, np.newaxis]
    # eigh returns eigenvalues in ascending order, so the largest is e_val[-1].
    e_val, e_vec = la.eigh(corr)
    # Clip the smallest eigenvalues so the condition number of the
    # regularized matrix does not exceed cond_num_threshold.
    new_e_val = np.clip(e_val, a_min=e_val[-1]/cond_num_threshold, a_max=None)
    # Reconstruct the regularized "correlation" matrix from its eigensystem.
    new_corr = (new_e_val*e_vec)@e_vec.T
    # Invert the correlation transformation: correlation -> covariance.
    return (new_corr*d)*d[:, np.newaxis]
5 changes: 5 additions & 0 deletions validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,11 @@ def parse_groupby(self, grouping: str):
"correctly?")
return grouping

def parse_perform_covmat_reg(self, do_reg: bool):
    """Parse the ``perform_covmat_reg`` key from the runcard.

    Returns the flag unchanged; when it is true, an informational message
    is logged so the user knows regularization will be applied by the
    covariance-matrix providers.
    """
    if do_reg:
        log.info("Regularizing covariance matrices")
    return do_reg


class Config(report.Config, CoreConfig, ParamfitsConfig):
Expand Down
207 changes: 166 additions & 41 deletions validphys2/src/validphys/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
check_dataset_cuts_match_theorycovmat,
check_experiment_cuts_match_theorycovmat)
from validphys.core import DataSetSpec, PDF, ExperimentSpec
from validphys.calcutils import (all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
get_df_block)
from validphys.calcutils import (
all_chi2, central_chi2, calc_chi2, calc_phi, bootstrap_values,
get_df_block, regularize_covmat)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -235,30 +236,33 @@ def experiments_covmat_no_table(
for experiment, experiment_covmat in zip(
experiments, experiments_covariance_matrix):
name = experiment.name
mat, _ = experiment_covmat
df.loc[[name],[name]] = mat
df.loc[[name],[name]] = experiment_covmat
return df

@table
def experiments_covmat(experiments_covmat_no_table):
    """Duplicate of ``experiments_covmat_no_table`` but with a table
    decorator, so the combined covariance matrix of all experiments can be
    used as a table action. The input is returned unchanged.
    """
    return experiments_covmat_no_table

# Collect the Cholesky (square-root) covariance matrix of each experiment
# into a list, one entry per experiment in ``experiments``.
exps_sqrt_covmat = collect(
    'experiment_sqrt_covariance_matrix',
    ('experiments',)
)

@table
def experiments_sqrtcovmat(
        experiments, experiments_index, exps_sqrt_covmat):
    """Like experiments_covmat, but dump the lower triangular part of the
    Cholesky decomposition as used in the fit. The upper part indices are set
    to zero.

    Parameters
    ----------
    experiments : list
        experiments whose names label the diagonal blocks of the table
    experiments_index : pandas index
        index labelling the rows and columns of the full table
    exps_sqrt_covmat : list
        collected ``experiment_sqrt_covariance_matrix`` results, one per
        experiment, in the same order as ``experiments``
    """
    data = np.zeros((len(experiments_index), len(experiments_index)))
    df = pd.DataFrame(data, index=experiments_index, columns=experiments_index)
    for experiment, exp_sqrt_covmat in zip(experiments, exps_sqrt_covmat):
        name = experiment.name
        # NOTE(review): this zeroes the upper triangle of the collected
        # array in place — confirm no other provider relies on the
        # unmodified Cholesky factor.
        exp_sqrt_covmat[np.triu_indices_from(exp_sqrt_covmat, k=1)] = 0
        df.loc[[name], [name]] = exp_sqrt_covmat
    return df

@table
Expand All @@ -272,9 +276,8 @@ def experiments_invcovmat(
for experiment, experiment_covmat in zip(
experiments, experiments_covariance_matrix):
name = experiment.name
cov, _ = experiment_covmat
#Improve this inversion if this method tuns out to be important
invcov = la.inv(cov)
invcov = la.inv(experiment_covmat)
df.loc[[name],[name]] = invcov
return df

Expand Down Expand Up @@ -359,12 +362,68 @@ def closure_pseudodata_replicas(experiments, pdf, nclosure:int,


@check_dataset_cuts_match_theorycovmat
def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None)) = None):
"""Returns a tuple of Covariance matrix and sqrt covariance matrix for the given dataset
which includes theory contribution from scale variations if `use_theorycovmat` is True and
an appropriate fit from which the covariance matrix will be loaded is given

if t0set is specified then t0 predictions will be used to construct the covariance matrix.
def covariance_matrix(
dataset:DataSetSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
"""Returns the covariance matrix for a given `dataset`. By default the
data central values will be used to calculate the multiplicative contributions
to the covariance matrix.

The matrix can instead be constructed with
the t0 proceedure if the user sets `use_t0` to True and gives a
`t0pdfset`. In this case the central predictions from the `t0pdfset` will be
used to calculate the multiplicative contributions to the covariance matrix.
More information on the t0 procedure can be found here:
https://arxiv.org/abs/0912.2276

The user can specify `use_fit_thcovmat_if_present` to be True
and provide a corresponding `fit`. If the theory covmat was used in the
corresponding `fit` and the specfied `dataset` was used in the fit then
the theory covariance matrix for this `dataset` will be added in quadrature
to the experimental covariance matrix.

Covariance matrix can be regularized according to
`calcutils.regularize_covmat` if the user specifies `perform_covmat_reg` to
be true. This algorithm sets a minimum threshold for eigenvalues that the
corresponding correlation matrix can have to be:

max(eigenvalue)/condition_number_threshold

by default condition_number_threshold is set to 500

Parameters
----------
dataset : DataSetSpec
object parsed from the `dataset_input` runcard key
fitthcovmat: None or ThCovMatSpec
None if either `use_thcovmat_if_present` is False or if no theory
covariance matrix was used in the corresponding fit
t0set: None or PDF
None if `use_t0` is False or a PDF parsed from `t0pdfset` runcard key
perform_covmat_reg: bool
whether or not to regularize the covariance matrix
condition_number_threshold: number
threshold used to regularize covariance matrix

Returns
-------
covariance_matrix : array
a covariance matrix as a numpy array

Examples
--------

>>> from validphys.api import API
>>> inp = {
'dataset_input': {'ATLASTTBARTOT'},
'theoryid': 52,
'use_cuts': 'no_cuts'
}
>>> cov = API.covariance_matrix(**inp)
TODO: complete example
"""
loaded_data = dataset.load()

Expand All @@ -374,18 +433,58 @@ def covariance_matrix(dataset:DataSetSpec, fitthcovmat, t0set:(PDF, type(None))
log.debug("Setting T0 predictions for %s" % dataset)
loaded_data.SetT0(t0set.load_t0())

covmat = loaded_data.get_covmat()
if fitthcovmat:
loaded_thcov = fitthcovmat.load()
covmat = get_df_block(loaded_thcov, dataset.name, level=1) + loaded_data.get_covmat()
sqrtcovmat = np.linalg.cholesky(covmat)
else:
covmat = loaded_data.get_covmat()
sqrtcovmat = loaded_data.get_sqrtcovmat()
return covmat, sqrtcovmat
covmat += get_df_block(loaded_thcov, dataset.name, level=1)
if perform_covmat_reg:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
)
return covmat

# Collect the covariance matrix of every dataset, iterating over each
# dataset within each experiment.
datasets_covariance_matrix = collect(
    'covariance_matrix',
    ('experiments', 'experiment',)
)

def sqrt_covariance_matrix(covariance_matrix: np.array):
    """Compute the lower-triangular Cholesky factor of ``covariance_matrix``.

    Parameters
    ----------
    covariance_matrix: array
        a positive definite covariance matrix

    Returns
    -------
    sqrt_covariance_matrix : array
        lower-triangular matrix ``L`` with ``L @ L.T`` equal to the input

    Notes
    -----
    The lower triangular factor is useful for efficient calculation of the
    chi^2.

    Examples
    --------

    >>> import numpy as np
    >>> from validphys.results import sqrt_covariance_matrix
    >>> a = np.array([[1, 0.5], [0.5, 1]])
    >>> sqrt_covariance_matrix(a)
    array([[1.       , 0.       ],
           [0.5      , 0.8660254]])
    """
    lower_factor = la.cholesky(covariance_matrix, lower=True)
    return lower_factor

@check_experiment_cuts_match_theorycovmat
def experiment_covariance_matrix(
experiment: ExperimentSpec, fitthcovmat, t0set:(PDF, type(None)) = None):
experiment: ExperimentSpec,
fitthcovmat,
t0set:(PDF, type(None)) = None,
perform_covmat_reg=False,
condition_number_threshold=500):
"""Like `covariance_matrix` except for an experiment"""
loaded_data = experiment.load()

Expand All @@ -395,18 +494,30 @@ def experiment_covariance_matrix(
log.debug("Setting T0 predictions for %s" % experiment)
loaded_data.SetT0(t0set.load_t0())

covmat = loaded_data.get_covmat()

if fitthcovmat:
loaded_thcov = fitthcovmat.load()
ds_names = loaded_thcov.index.get_level_values(1)
indices = np.in1d(ds_names, [ds.name for ds in experiment.datasets]).nonzero()[0]
covmat = loaded_thcov.iloc[indices, indices].values + loaded_data.get_covmat()
sqrtcovmat = np.linalg.cholesky(covmat)
else:
covmat = loaded_data.get_covmat()
sqrtcovmat = loaded_data.get_sqrtcovmat()
return covmat, sqrtcovmat

def results(dataset:(DataSetSpec), pdf:PDF, covariance_matrix):
covmat += loaded_thcov.iloc[indices, indices].values
if perform_covmat_reg:
covmat = regularize_covmat(
covmat,
cond_num_threshold=condition_number_threshold
)
return covmat

def experiment_sqrt_covariance_matrix(experiment_covariance_matrix):
    """Like ``sqrt_covariance_matrix`` but for an experiment: returns the
    lower-triangular Cholesky factor of the experiment's covariance matrix."""
    return sqrt_covariance_matrix(experiment_covariance_matrix)

def results(
dataset:(DataSetSpec),
pdf:PDF,
covariance_matrix,
sqrt_covariance_matrix
):
"""Tuple of data and theory results for a single pdf. The data will have an associated
covariance matrix, which can include a contribution from the theory covariance matrix which
is constructed from scale variation. The inclusion of this covariance matrix by default is used
Expand All @@ -416,17 +527,30 @@ def results(dataset:(DataSetSpec), pdf:PDF, covariance_matrix):
An experiment is also allowed.
(as a result of the C++ code layout)."""
data = dataset.load()
return (DataResult(data, *covariance_matrix),
return (DataResult(data, covariance_matrix, sqrt_covariance_matrix),
ThPredictionsResult.from_convolution(pdf, dataset, loaded_data=data))

def experiment_results(
        experiment,
        pdf:PDF,
        experiment_covariance_matrix,
        experiment_sqrt_covariance_matrix):
    """Like `results` but for a whole experiment.

    All arguments are forwarded unchanged to ``results``; see that provider
    for details of the returned (data, theory) tuple.
    """
    return results(
        experiment,
        pdf,
        experiment_covariance_matrix,
        experiment_sqrt_covariance_matrix
    )

#It's better to duplicate a few lines than to complicate the logic of
#``results`` to support this.
#TODO: The above comment doesn't make sense after adding T0. Deprecate this
def pdf_results(dataset:(DataSetSpec, ExperimentSpec), pdfs:Sequence, covariance_matrix:tuple):
def pdf_results(
dataset:(DataSetSpec, ExperimentSpec),
pdfs:Sequence,
covariance_matrix,
sqrt_covariance_matrix):
"""Return a list of results, the first for the data and the rest for
each of the PDFs."""

Expand All @@ -438,22 +562,23 @@ def pdf_results(dataset:(DataSetSpec, ExperimentSpec), pdfs:Sequence, covarianc
th_results.append(th_result)


return (DataResult(data, *covariance_matrix), *th_results)
return (DataResult(data, covariance_matrix, sqrt_covariance_matrix), *th_results)

@require_one('pdfs', 'pdf')
@remove_outer('pdfs', 'pdf')
def one_or_more_results(dataset:(DataSetSpec, ExperimentSpec),
                        covariance_matrix,
                        sqrt_covariance_matrix,
                        pdfs:(type(None), Sequence)=None,
                        pdf:(type(None), PDF)=None):
    """Generate a list of results, where the first element is the data values,
    and the next is either the prediction for pdf or for each of the pdfs.
    Which of the two is selected intelligently depending on the namespace,
    when executing as an action."""
    if pdf:
        return results(dataset, pdf, covariance_matrix, sqrt_covariance_matrix)
    else:
        return pdf_results(dataset, pdfs, covariance_matrix, sqrt_covariance_matrix)
    # NOTE(review): unreachable — both branches above return, and
    # @require_one already guarantees one of `pdf`/`pdfs` is provided.
    raise ValueError("Either 'pdf' or 'pdfs' is required")


Expand Down