diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 6126a14f34..385c2921e4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -1,6 +1,7 @@ """Module for handling logic and manipulation of covariance and correlation matrices on different levels of abstraction """ +from collections import namedtuple import logging import numpy as np @@ -23,10 +24,19 @@ log = logging.getLogger(__name__) +INTRA_DATASET_SYS_NAME = ("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR") -def covmat_from_systematics(commondata): - """Function that takes in a :py:class:`validphys.coredata.CommonData` object - and generates the associated covariance matrix using the systematics. The logic +Uncertainties = namedtuple( + "Uncertainties", ("stat", "unc", "corr", "thunc", "thcorr", "special") +) + + +def split_uncertainties(commondata): + """Take the statistical uncertainty and systematics table from + a :py:class:`validphys.coredata.CommonData` object + and split into the different different uncertainty archetypes each of + which may be handled differently by future operations, such as constructing + the covariance matrix. The logic of the different archetypes is best described by the now deprecated C++ code: .. code-block:: c++ @@ -75,35 +85,118 @@ def covmat_from_systematics(commondata): return CovMat; } - We see that for the diagonal elements of the covariance matrix, we must add the squared - statistical error. This is done by creating a diagonal matrix from the vector of statistical - errors, called ``stat_mat``, and squaring the matrix before the return line. - - We now pick datapoint i and loop over all other datapoints j. For each such pair of points, - loop over all the systematics, if the l'th systematic of data point i has name ``SKIP`` we ignore - it. This is handled by scanning over all sytematic errors in the ``sys_errors`` dataframe and dropping - any columns which correspond to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe + if the systematic of data point i has name ``SKIP`` we ignore + it. This is handled by scanning over all sytematic errors in the + ``sys_errors`` dataframe and dropping any columns which correspond + to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe defined below contains only the systematics without the ``SKIP`` name. - Note that in the switch statement an ADDitive or MULTiplicative systype of systematic i is handled - by either multiplying the additive or multiplicative uncertainties respectively. We instead opt to - create a purely additive systematic table where all elements are to be understood as additive - systematics and systematics of type MULT have been correctly transformed to their additive - values. This dataframe we call ``additive_mat``. + Note that in the switch statement an ADDitive or MULTiplicative systype + is handled by either multiplying the additive or multiplicative + uncertainties respectively. We convert uncertainties so that they are all + absolute: + - Additive (ADD) systematics are left unchanged + - multiplicative (MULT) systematics need to be converted from a + percentage by multiplying by the central value + and dividing by 100. + + Finally, the systematics are split into the five possible archetypes + of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), + theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and + special_correlated systematics. + + + Parameters + ---------- + commondata: :py:class:`validphys.coredata.CommonData` + A commondata object containing information on the data central values, statistical and + systematic uncertainties, and information on their correlations. + + Returns + ------- + uncertainties: :py:class:`validphys.covmats.Uncertainties` + A ``namedtuple`` with the following attributes + + stat: np.array + 1-D array containing statistical uncertainties + unc: np.2darray + numpy array of uncorrelated systematics, can be empty if there + are no uncorrelated systematics. These are semantically similar + to statistical error, and only contribute the diagonal component + of the covariance matrix, because they are uncorrelated across + data points. + corr: np.2darray + Similar to uncorrelated except they can be correlated across + data points. + thunc: np.2darray + numpy array of uncorrelated "theory" uncertainties which + are not to be confused with theory covariance uncertainties. + Instead they are, for example, uncertainties in the c-factors + thcorr: np.2darray + same as theory_uncorrelated except they can be correlated across + data points. + special: pd.DataFrame + Dataframe of the systematics which can be correlated across + datasets, the columns of the dataframe are the systematic names. + Each systematic of this type has a unique name, by returning + these systematics as a dataframe with the unique names as column + headers, we can easily join up the special_correlated systematics + from multiple datasets, retaining correlations, using + ``pandas.concat``. + """ + sys_name = commondata.systype_table["name"].to_numpy() + # Dropping systematics that have type SKIP + sys_errors_df = commondata.sys_errors.loc[:, sys_name != "SKIP"] + sys_name = sys_name[sys_name != "SKIP"] + + # Diagonal matrix containing the statistical uncertainty for each + # data point + stat = commondata.stat_errors.to_numpy() + + # Systematic uncertainties converted to absolute uncertainties (additives + # are left unchanged, multiplicative uncertainties are in percentage format + # so get multiplied by central value and divided by 100). + abs_sys_errors_df = sys_errors_df.apply( + lambda x: [ + i.add if i.sys_type == "ADD" else (i.mult * j / 100) + for i, j in zip(x, commondata.central_values) + ] + ) + abs_sys_errors = abs_sys_errors_df.to_numpy() + + # set columns for special_correlated errors + abs_sys_errors_df.columns = sys_name + special_corr = abs_sys_errors_df.loc[:, ~np.isin(sys_name, INTRA_DATASET_SYS_NAME)] + + thunc = abs_sys_errors[:, sys_name == "THEORYUNCORR"] + unc = abs_sys_errors[:, sys_name == "UNCORR"] + thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"] + corr = abs_sys_errors[:, sys_name == "CORR"] + + uncertainties = Uncertainties(stat, unc, corr, thunc, thcorr, special_corr) + return uncertainties - Next we notice that a systematic is correlated if its name is neither ``UNCORR`` nor ``THEORYUNCORR``. We - create a dataframe called ``correlated_mat`` where any column corresponding to an ``UNCORR`` or - ``THEORYUNCORR`` systematic is set to 0. - From here it's a matter of staring at a piece of paper for a while to realise the contribution to the - covariance matrix arising due to correlated systematics is ``correlated_mat @ additive_mat.T``. Note - that in the case where ``i == j`` is met, we double count this contribution using this method, so we set - the diagonal elements of this particular contribution to 0 by using ``np.fill_diagonal``. +def covmat_from_systematics(commondata, use_theory_errors=True): + """Given a single :py:class:`validphys.coredata.CommonData`, obtain the + tuple of statistical uncertainty and systematics from + :py:meth:`split_uncertainties` and + construct the covariance matrix. - Finally, the ``i == j`` case is handled by summing the square row elements of ``additive_mat``, this is - done by using the ``np.einsum`` function. + Uncorrelated contributions from: ``stat``, ``uncorrelated`` and + ``theory_uncorrelated`` are added in quadrature + to the diagonal of the covmat. - For more information on the generation of the covariance matrix see the `paper `_ + From here it's a matter of staring at a piece of paper for a while to + realise the contribution to the covariance matrix arising due to + correlated systematics is schematically ``A_correlated @ A_correlated.T``, + where A_correlated is a matrix N_dat by N_sys. The total contribution + from correlated systematics is found by adding together the result of + mutiplying each correlated systematic matrix by its transpose + (``correlated``, ``theory_correlated`` and ``special_correlated``). + + For more information on the generation of the covariance matrix see the + `paper `_ outlining the procedure, specifically equation 2 and surrounding text. Paramaters @@ -111,6 +204,8 @@ def covmat_from_systematics(commondata): commondata : validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. + use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat Returns ------- @@ -141,47 +236,61 @@ def covmat_from_systematics(commondata): [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ..., 4.14126235e-05, 4.15843357e-05, 1.43824457e-04]]) """ - systype = commondata.systype_table["name"] - # Dropping systematics that have type SKIP - sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(commondata) + + special_vals = special_corr.values + cov_mat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + + special_vals @ special_vals.T + ) + return cov_mat - # Diagonal matrix containing the statistical uncertainty for each - # data point - stat_mat = np.diag(commondata.stat_errors.to_numpy()) - # Systematic uncertainties converted to additive uncertainties - additive_mat = sys_errors.apply( - lambda x: [ - i.add if i.sys_type == "ADD" else (i.mult * j / 100) - for i, j in zip(x, commondata.central_values) - ] - ) +def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True): + """Given a list containing :py:class:`validphys.coredata.CommonData` s, + construct the full covariance matrix. - # Matrix where all non-zero values are due to correlated systematics - correlated_mat = additive_mat.copy() - # If all systypes were SKIP then we'd have an empty df - if not correlated_mat.empty: - correlated_mat.loc[:, (systype == "UNCORR") | (systype == "THEORYUNCORR")] = 0 - correlated_mat = correlated_mat.to_numpy() - additive_mat = additive_mat.to_numpy() - else: - # Some datasets have zero systematic uncertainties. - # We treat this special edge case by creating - # a matrix of zeros so it can be broadcast - # with the statistical error matrix at the end - correlated_mat = additive_mat = np.zeros_like(stat_mat) - - # Avoid double counting in case that the i = j element is a correlated - # systematic. We take care of this case in the einsum below - off_diagonals = correlated_mat @ additive_mat.T - np.fill_diagonal(off_diagonals, 0) - - cov_mat = ( - stat_mat ** 2 - + off_diagonals - + np.diag(np.einsum("ij, ij -> i", additive_mat, additive_mat)) - ) - return cov_mat + This is similar to :py:meth:`covmat_from_systematics` + except that ``special_corr`` is concatenated across all datasets + before being multiplied by its transpose to give off block-diagonal + contributions. The other systematics contribute to the block diagonal in the + same way as :py:meth:`covmat_from_systematics`. + + Paramaters + ---------- + list_of_commondata : list[validphys.coredata.CommonData] + list of CommonData objects. + use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat + + Returns + ------- + cov_mat : np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. + + """ + special_corrs = [] + block_diags = [] + for cd in list_of_commondata: + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(cd) + special_corrs.append(special_corr) + diag_covmat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + (corr @ corr.T + thcorr @ thcorr.T * use_theory_errors) + block_diags.append(diag_covmat) + special_sys = pd.concat(special_corrs, axis=0, sort=False) + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_sys.fillna(0, inplace=True) + diag = la.block_diag(*block_diags) + return diag + special_sys.values @ special_sys.values.T def sqrt_covmat(covariance_matrix): diff --git a/validphys2/src/validphys/tests/conftest.py b/validphys2/src/validphys/tests/conftest.py index 3f3e2d9e8d..46683785a0 100644 --- a/validphys2/src/validphys/tests/conftest.py +++ b/validphys2/src/validphys/tests/conftest.py @@ -34,6 +34,16 @@ def tmp(tmpdir): 'datasets': [{'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}]} ] +# Experiments which have non trivial correlations between their datasets +CORR_DATA = [ + {'dataset': 'ATLASWZRAP36PB', 'cfac': ['QCD']}, + {'dataset': 'ATLASZHIGHMASS49FB', 'cfac': ['QCD']}, + {'dataset': 'ATLASLOMASSDY11EXT', 'cfac': ['QCD']}, + {'dataset': 'ATLASWZRAP11', 'frac': 0.5, 'cfac': ['QCD']}, + {'dataset': 'CMSZDIFF12', 'cfac': ('QCD', 'NRM'), 'sys': 10}, + {'dataset': 'CMSJETS11', 'frac': 0.5, 'sys': 10}, +] + SINGLE_EXP = [ { 'experiment': 'pseudo experiment', @@ -69,6 +79,12 @@ def tmp(tmpdir): def data_config(): return base_config +@pytest.fixture(scope='module') +def data_internal_cuts_config(data_config): + config_dict = dict(data_config) + config_dict.update(use_cuts='internal') + return config_dict + @pytest.fixture(scope='module') def data_witht0_config(): config_dict = dict( @@ -83,6 +99,19 @@ def data_singleexp_witht0_config(data_witht0_config): config_dict.update({'experiments': SINGLE_EXP}) return config_dict +@pytest.fixture(scope='module') +def data_with_correlations_config(): + corr_dict = dict(base_config) + corr_dict.pop("experiments") + corr_dict.update(dataset_inputs=CORR_DATA) + return corr_dict + +@pytest.fixture(scope='module') +def data_with_correlations_internal_cuts_config(data_with_correlations_config): + config_dict = dict(data_with_correlations_config) + config_dict.update(use_cuts='internal') + return config_dict + @pytest.fixture(scope='module') def weighted_data_witht0_config(data_witht0_config): config_dict = dict(data_witht0_config) diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 942eabc871..b46cc20fa0 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -3,15 +3,57 @@ Tests related to the computation of the covariance matrix and its derivatives """ +import random + import pytest import numpy as np -import random + +from validphys.api import API +from validphys.commondataparser import load_commondata +from validphys.covmats import ( + sqrt_covmat, + datasets_covmat_from_systematics +) from validphys.loader import Loader from validphys.tests.conftest import THEORYID -from validphys.api import API -from validphys.covmats import sqrt_covmat + + +def test_covmat_from_systematics_correlated(data_with_correlations_config): + """Test the covariance matrix generation from a set of correlated datasets + given their systematic errors + """ + data = API.data(**data_with_correlations_config) + cds = [ds.commondata for ds in data.datasets] + + ld_cds = list(map(load_commondata, cds)) + + covmat = datasets_covmat_from_systematics(ld_cds) + + cpp_covmat = API.groups_covmat(**data_with_correlations_config) + + np.testing.assert_allclose(cpp_covmat, covmat) + + +def test_covmat_from_systematics(data_internal_cuts_config): + """Test which checks the python computation of the covmat relating to a + collection of datasets matches that of the C++ computation. Note that the + datasets are cut using the internal rules, but the datasets are not correlated. + """ + data = API.data(**data_internal_cuts_config) + cds = [ds.commondata for ds in data.datasets] + + ld_cds = list(map(load_commondata, cds)) + + internal_cuts = [ds.cuts for ds in data.datasets] + cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) + + covmat = datasets_covmat_from_systematics(cut_ld_cds) + + cpp_covmat = API.groups_covmat(**data_internal_cuts_config) + + np.testing.assert_allclose(cpp_covmat, covmat) def test_cpp_sqrtcovmat():