From 3b44fc85df9cac3a7de7780fdffe965a05226f5d Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 14 Oct 2020 19:08:12 +0100 Subject: [PATCH 1/4] propose way of constructing covmat of list of commondata objects, without creating fake commondata object of multiple datasets --- validphys2/src/validphys/commondataparser.py | 155 ----------- validphys2/src/validphys/covmats.py | 242 ++++++++++-------- .../src/validphys/tests/test_covmats.py | 16 +- 3 files changed, 140 insertions(+), 273 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 2a9538c2ba..7fcdb9644e 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -92,158 +92,3 @@ def parse_systypes(systypefile): systypetable.set_index("sys_index", inplace=True) return systypetable - - -def combine_commondata(commondata_list): - """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` - objects and returns an effective :py:class:`validphys.coredata.CommonData` object - which has taken into account the (possible) correlations between the datasets. This - effective ``CommonData`` can then be used to obtain a correlated covariance matrix, - for example. - - .. note:: - The return of this function does not have a file on disk associated with it. - As such the ``CommonData`` instance has a ``setname`` of ``"N/A"`` and a - ``processtype`` of ``"MIXED"``. 
- - Parameters - ---------- - commondata_list: sequence - Sequence of :py:class:`validphys.coredata.CommonData` objects - - Returns - ------- - comb_cd: validphys.coredata.CommonData - The combined ``CommonData`` - - Raises - ------ - ValueError: if one or more of the input ``CommonData`` objects have a different - number of kinematic variables - - Example - ------- - >>> from validphys.commondataparser import combine_commondata, load_commondata - >>> from validphys.loader import Loader - >>> l = Loader() - >>> cd1 = l.check_commondata("ATLASLOMASSDY11EXT") - >>> cd2 = l.check_commondata("ATLASZHIGHMASS49FB") - >>> ld1, ld2 = map(load_commondata, (cd1, cd2)) - >>> comb_cd = combine_commondata((ld1, ld2)) - # Note that the two datasets have one pair of correlated systematics - # so the total number of systematics is one fewer than the sum - >>> comb_cd.nsys - 18 - >>> ld1.nsys + ld2.nsys - 19 - >>> from validphys.covmats import covmat_from_systematics - >>> covmat = covmat_from_systematics(comb_cd) - >>> covmat.shape - (19, 19) - >>> ld1.ndata + ld2.ndata - 19 - """ - # Raise an exception if any of the CommonDatas has - # a different number of kinematic variables to the others - # TODO: handle this case instead of raising an error - nkins = [cd.nkin for cd in commondata_list] - if not all(x == nkins[0] for x in nkins): - raise ValueError("One or more of the datasets have a different number of kinematics.") - - # Combine the systematics from each of the CommonDatas - systematics, systype_table = combine_commondata_systematics(commondata_list) - - # The number of data points is the sum of data points per CommonData while - # the systematics have correlations accounted for - ndata, nsys = systematics.shape - - header = ['process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - kinematics = pd.concat([i.commondata_table.loc[:, header] for i in commondata_list], ignore_index=True) - kinematics.index += 1 - - additive = systematics.applymap(lambda x: x.add) - multiplicative = 
systematics.applymap(lambda x: x.mult) - additive.columns = (f"sys.add.{i + 1}" for i in range(nsys)) - multiplicative.columns = (f"sys.mult.{i + 1}" for i in range(nsys)) - - # Table containing alternating additive and multiplicative systematic uncertainties - systematics_header = np.empty((additive.columns.size + multiplicative.columns.size,), dtype=object) - systematics_header[0::2], systematics_header[1::2] = additive.columns, multiplicative.columns - systematics_table = additive.join(multiplicative)[systematics_header] - - commondata_table = pd.concat([kinematics, systematics_table], axis=1) - - # The effective CommonData object to return. The set has no unique name or - # process type and so there is no file associated with this object. - comb_cd = CommonData("N/A", ndata, "MIXED", 3, nsys, commondata_table, systype_table) - - return comb_cd - - -def combine_commondata_systematics(commondata_list): - """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` - and returns a tuple of ``pd.DataFrame``. The first ``DataFrame`` contains - :py:class:`validphys.coredata.SystematicError` which has accounted for the correlations - between systematics while the second contains information about the systematic errors - themselves such as their type and name. - - Parameters - ---------- - commondata_list: sequence - Sequence of :py:class:`validphys.coredata.CommonData` objects - - Returns - ------- - eff_sys_errors, systype_table: (pd.DataFrame, pd.DataFrame) - Tuple of DataFrames containing systematic errors and information about their name and type - """ - special_types = frozenset({"UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR", "SKIP"}) - - # Create an effective systematic error table which is block diagonal - # where the off-diagonal blocks are filled with zeros as sentinel values. 
- eff_sys_errors = pd.DataFrame(la.block_diag(*[i.sys_errors for i in commondata_list])) - # Replace 0s with np.nan so we can use df.fillna later on - eff_sys_errors.replace(0, np.nan, inplace=True) - # Make index and columns start from 1 - eff_sys_errors.index += 1 - eff_sys_errors.columns += 1 - - systypes = pd.concat([cd.systype_table for cd in commondata_list], ignore_index=True) - systypes.index += 1 - - # If the systypes are duplicated and not one of the special - # types above, then those set of systematics are correlated - # and we keep the first occurence and replace each subsequent - # occurence with a sentinel value of None - duplicates = systypes.groupby("name").groups - # Remove the duplicates that are special types - [duplicates.pop(key, None) for key in special_types] - - for cols in duplicates.values(): - # The columns that were correlated - to_combine = eff_sys_errors.loc[:, cols] - # We backwards propagate the uncertainties - to_combine.fillna(method="bfill", axis=1, inplace=True) - # Replace the eff_sys_errors column with - # the systematics that were correlated - prepared_column = to_combine.iloc[:, 0] - eff_sys_errors.loc[:, cols[0]] = prepared_column - - # The zeros are replaced with validphys.coredata.SystematicError types, we label the - # name as "SKIP" so any future logic knows that these systematic errors should be ignored - dummy_error = SystematicError(0, 0, None, "SKIP") - eff_sys_errors.fillna(dummy_error, inplace=True) - - keep_columns = (systypes["name"].isin(special_types)) | ~( - systypes.duplicated(subset="name", keep="first") - ) - keep_columns.index = eff_sys_errors.columns - eff_sys_errors = eff_sys_errors.loc[:, keep_columns] - # After slicing with keep_columns, the index - # of the systematic errors need to be reset - eff_sys_errors.columns = range(1, len(eff_sys_errors.columns) + 1) - - systype_table = systypes[keep_columns] - systype_table.index = range(1, len(eff_sys_errors.columns) + 1) - - return eff_sys_errors, 
systype_table diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 6126a14f34..03a8214da0 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -24,86 +24,91 @@ log = logging.getLogger(__name__) -def covmat_from_systematics(commondata): - """Function that takes in a :py:class:`validphys.coredata.CommonData` object - and generates the associated covariance matrix using the systematics. The logic - is best described by the now deprecated C++ code: - - .. code-block:: c++ - - auto CovMat = NNPDF::matrix(ndat, ndat); - - for (int i = 0; i < ndat; i++) - { - for (int j = 0; j < ndat; j++) - { - double sig = 0.0; - double signor = 0.0; - - // Statistical error - if (i == j) - sig += pow(stat_error[i],2); - - for (int l = 0; l < nsys; l++) - { - sysError const& isys = systematic_errors[i][l]; - sysError const& jsys = systematic_errors[j][l]; - if (isys.name != jsys.name) - throw RuntimeException("ComputeCovMat", "Inconsistent naming of systematics"); - if (isys.name == "SKIP") - continue; // Continue if systype is skipped - if ((isys.name == "THEORYCORR" || isys.name == "THEORYUNCORR") && !use_theory_errors) - continue; // Continue if systype is theoretical and use_theory_errors == false - const bool is_correlated = ( isys.name != "UNCORR" && isys.name !="THEORYUNCORR"); - if (i == j || is_correlated) - switch (isys.type) - { - case ADD: sig += isys.add *jsys.add; break; - case MULT: if (mult_errors) { signor += isys.mult*jsys.mult; break; } - else { continue; } - case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); - } - } - - // Covariance matrix entry - CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); - // Covariance matrix weight - CovMat(i, j) /= sqrt_weights[i]*sqrt_weights[j]; - } - } - - return CovMat; - } - - We see that for the diagonal elements of the covariance matrix, we must add the squared - statistical error. 
This is done by creating a diagonal matrix from the vector of statistical - errors, called ``stat_mat``, and squaring the matrix before the return line. +def split_systematics(commondata): + """Take the systematics table from a commondata and remove any systematics + which have name "SKIP". Next transform the systematics into the correct units. + Additive (ADD) systematics are left unchanged, whereas multiplicative (MULT) + systematics need to be converted from a percentage into an uncertainty + by multiplying by the central value and dividing by 100. - We now pick datapoint i and loop over all other datapoints j. For each such pair of points, - loop over all the systematics, if the l'th systematic of data point i has name ``SKIP`` we ignore - it. This is handled by scanning over all sytematic errors in the ``sys_errors`` dataframe and dropping - any columns which correspond to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe - defined below contains only the systematics without the ``SKIP`` name. + Finally the systematics are split into the different components which + are treated differently when constructing the covariance matrix. - Note that in the switch statement an ADDitive or MULTiplicative systype of systematic i is handled - by either multiplying the additive or multiplicative uncertainties respectively. We instead opt to - create a purely additive systematic table where all elements are to be understood as additive - systematics and systematics of type MULT have been correctly transformed to their additive - values. This dataframe we call ``additive_mat``. + Returns a tuple of the statistical uncertainty and 5 possible groups + of systematics uncertainties: UNCORR, CORR, THEORYUNCORR, THEORYCORR and + inter-dataset systematics - Next we notice that a systematic is correlated if its name is neither ``UNCORR`` nor ``THEORYUNCORR``. 
We - create a dataframe called ``correlated_mat`` where any column corresponding to an ``UNCORR`` or - ``THEORYUNCORR`` systematic is set to 0. + Returns + ------- + uncertainties: tuple + stat: np.array + 1-D array containing statistical uncertainties + uncorrelated: np.2darray + numpy array of uncorrelated systematics, can be empty if there + are no uncorrelated systematics. These are semantically similar + to statistical error, and only contribute the diagonal component + of the covariance matrix, because they are uncorrelated across + data points. + correlated: np.2darray + Similar to uncorrelated except they can be correlated across + data points. + theory_uncorr: np.2darray + numpy array of uncorrelated "theory" uncertainties which + are not to be confused with theory covariance uncertainties. + Instead they are, for example, uncertainties in the c-factors + theory_corr: np.2darray + same as theory_uncorr except they can be correlated across + data points. + special_corr: pd.DataFrame + a dataframe of the systematics which can be correlated across + datasets, the columns of the dataframe are the systematic names. + Each systematic of this type has a unique name, which is used + to correlated these systematics across datasets. - From here it's a matter of staring at a piece of paper for a while to realise the contribution to the - covariance matrix arising due to correlated systematics is ``correlated_mat @ additive_mat.T``. Note - that in the case where ``i == j`` is met, we double count this contribution using this method, so we set - the diagonal elements of this particular contribution to 0 by using ``np.fill_diagonal``. 
+ """ + systype = commondata.systype_table["name"].values + # Dropping systematics that have type SKIP + sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] + systype = systype[systype != "SKIP"] - Finally, the ``i == j`` case is handled by summing the square row elements of ``additive_mat``, this is - done by using the ``np.einsum`` function. + # Diagonal matrix containing the statistical uncertainty for each + # data point + stat = commondata.stat_errors.values - For more information on the generation of the covariance matrix see the `paper `_ + # Systematic uncertainties converted to proper units (additives are left + # unchanged, multiplicative uncertainties are in percentage format so get + # multiplied by central value and divided by 100) + sys_mat = sys_errors.apply( + lambda x: [ + i.add if i.sys_type == "ADD" else (i.mult * j / 100) + for i, j in zip(x, commondata.central_values) + ] + ) + # set columns for special_corr errors + sys_mat.columns = systype + + special = ["THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR"] + + special_corr = sys_mat.loc[:, ~np.isin(systype, special)] + thunc = sys_mat.loc[:, systype == "THEORYUNCORR"].values + unc = sys_mat.loc[:, systype == "UNCORR"].values + thcorr = sys_mat.loc[:, systype == "THEORYCORR"].values + corr = sys_mat.loc[:, systype == "CORR"].values + return stat, unc, corr, thunc, thcorr, special_corr + + +def covmat_from_systematics(commondata, use_theory_errors=True): + """Taking the tuple of statistical uncertainty and systematics returned by + :py:meth:`split_systematics` and construct the covariance matrix. uncorrelated + contributions from ``stat``, ``uncorrelated`` and ``theory_uncorr`` are + added in quadrature + to the diagonal of the covmat. The correlated systematics ``correlated``, + ``theory_corr`` and ``special_corr`` are multiplied by their transpose + to give contributions to both the diagonal and off diagonal components of + the covmat. 
+ + For more information on the generation of the covariance matrix see the + `paper `_ outlining the procedure, specifically equation 2 and surrounding text. Paramaters @@ -111,6 +116,8 @@ def covmat_from_systematics(commondata): commondata : validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. + use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat Returns ------- @@ -141,48 +148,63 @@ def covmat_from_systematics(commondata): [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ..., 4.14126235e-05, 4.15843357e-05, 1.43824457e-04]]) """ - systype = commondata.systype_table["name"] - # Dropping systematics that have type SKIP - sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] + stat, unc, corr, thunc, thcorr, special_corr = split_systematics(commondata) - # Diagonal matrix containing the statistical uncertainty for each - # data point - stat_mat = np.diag(commondata.stat_errors.to_numpy()) - - # Systematic uncertainties converted to additive uncertainties - additive_mat = sys_errors.apply( - lambda x: [ - i.add if i.sys_type == "ADD" else (i.mult * j / 100) - for i, j in zip(x, commondata.central_values) - ] + special_vals = special_corr.values + cov_mat = ( + np.diag( + stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors + ) + + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + + special_vals @ special_vals.T + ) ) + return cov_mat - # Matrix where all non-zero values are due to correlated systematics - correlated_mat = additive_mat.copy() - # If all systypes were SKIP then we'd have an empty df - if not correlated_mat.empty: - correlated_mat.loc[:, (systype == "UNCORR") | (systype == "THEORYUNCORR")] = 0 - correlated_mat = correlated_mat.to_numpy() - additive_mat = additive_mat.to_numpy() - else: - # Some datasets have zero systematic uncertainties. 
- # We treat this special edge case by creating - # a matrix of zeros so it can be broadcast - # with the statistical error matrix at the end - correlated_mat = additive_mat = np.zeros_like(stat_mat) +def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=True): + """Given a list of commondata, construct the full covariance matrix. - # Avoid double counting in case that the i = j element is a correlated - # systematic. We take care of this case in the einsum below - off_diagonals = correlated_mat @ additive_mat.T - np.fill_diagonal(off_diagonals, 0) + This is similar to :py:meth:`covmat_from_systematics` + except that ``special_corr`` systematics are concatenated across all datasets + before it is multiplied by its transpose to give off block-diagonal + contributions. The other systematics contribute to the block diagonal in the + same way as :py:meth:`covmat_from_systematics`. - cov_mat = ( - stat_mat ** 2 - + off_diagonals - + np.diag(np.einsum("ij, ij -> i", additive_mat, additive_mat)) - ) - return cov_mat + Paramaters + ---------- + list_of_commondata : list[validphys.coredata.CommonData] + list of CommonData objects. + use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat + + Returns + ------- + cov_mat : np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. 
+ """ + special_corrs = [] + block_diags = [] + for cd in list_of_commondata: + stat, unc, corr, thunc, thcorr, special_corr = split_systematics(cd) + special_corrs.append(special_corr) + diag_covmat = ( + np.diag( + stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors + ) + + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + ) + ) + block_diags.append(diag_covmat) + special_sys = pd.concat(special_corrs, axis=0, sort=False) + special_sys.fillna(0, inplace=True) + diag = la.block_diag(*block_diags) + return diag + special_sys.values @ special_sys.values.T def sqrt_covmat(covariance_matrix): """Function that computes the square root of the covariance matrix. diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 9266a502b8..916e9805a5 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -12,10 +12,12 @@ from validphys.api import API -from validphys.commondataparser import combine_commondata, load_commondata -from validphys.covmats import covmat_from_systematics, sqrt_covmat +from validphys.commondataparser import load_commondata +from validphys.covmats import ( + sqrt_covmat, + commondatas_covmat_from_systematics +) from validphys.loader import Loader -from validphys.tableloader import parse_exp_mat from validphys.tests.conftest import THEORYID REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") @@ -36,8 +38,7 @@ def test_covmat_from_systematics_correlated(): cds = list(map(l.check_commondata, correlated_datasets)) ld_cds = list(map(load_commondata, cds)) - combo_cd = combine_commondata(ld_cds) - covmat = covmat_from_systematics(combo_cd) + covmat = commondatas_covmat_from_systematics(ld_cds) dss = [ l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets @@ -45,7 +46,7 @@ def test_covmat_from_systematics_correlated(): exp = l.check_experiment("null", dss) ld_exp = 
exp.load() - assert np.allclose(ld_exp.get_covmat(), covmat) + np.testing.assert_allclose(ld_exp.get_covmat(), covmat) def test_covmat_from_systematics(data_config): @@ -64,9 +65,8 @@ def test_covmat_from_systematics(data_config): for cd in cds ] cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) - combo_cd = combine_commondata(cut_ld_cds) - covmat = covmat_from_systematics(combo_cd) + covmat = commondatas_covmat_from_systematics(cut_ld_cds) dss = [ l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") From 98fccc19818b5a8394236151d313abf9b76e9388 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 21 Oct 2020 14:17:03 +0100 Subject: [PATCH 2/4] Update names and docstrings of covmat construction functions --- validphys2/src/validphys/covmats.py | 139 ++++++++++-------- .../src/validphys/tests/test_covmats.py | 6 +- 2 files changed, 79 insertions(+), 66 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 03a8214da0..c2da1a8859 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -23,20 +23,29 @@ log = logging.getLogger(__name__) +INTRA_DATASET_SYS_NAME = ("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR") -def split_systematics(commondata): - """Take the systematics table from a commondata and remove any systematics - which have name "SKIP". Next transform the systematics into the correct units. - Additive (ADD) systematics are left unchanged, whereas multiplicative (MULT) - systematics need to be converted from a percentage into an uncertainty - by multiplying by the central value and dividing by 100. - Finally the systematics are split into the different components which - are treated differently when constructing the covariance matrix. 
+def split_uncertainties(commondata): + """Take the statistical uncertainty and systematics table from ``commondata`` + and split into the different different uncertainty archetypes each of + which may be handled differently by future operations, such as constructing + the covariance matrix. - Returns a tuple of the statistical uncertainty and 5 possible groups - of systematics uncertainties: UNCORR, CORR, THEORYUNCORR, THEORYCORR and - inter-dataset systematics + First we remove any systematics which have name "SKIP" because, as the name + implies, these systematics will be skipped. + + Next the systematics are transformed into the correct units: + + - Additive (ADD) systematics are left unchanged + - multiplicative (MULT) systematics need to be converted from a + percentage into an uncertainty by multiplying by the central value + and dividing by 100. + + Finally, the systematics are split into the five possible archetypes + of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), + theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and + special_correlated systematics Returns ------- @@ -52,58 +61,64 @@ def split_systematics(commondata): correlated: np.2darray Similar to uncorrelated except they can be correlated across data points. - theory_uncorr: np.2darray + theory_uncorrelated: np.2darray numpy array of uncorrelated "theory" uncertainties which are not to be confused with theory covariance uncertainties. Instead they are, for example, uncertainties in the c-factors - theory_corr: np.2darray - same as theory_uncorr except they can be correlated across + theory_correlated: np.2darray + same as theory_uncorrelated except they can be correlated across data points. - special_corr: pd.DataFrame - a dataframe of the systematics which can be correlated across + special_correlated: pd.DataFrame + Dataframe of the systematics which can be correlated across datasets, the columns of the dataframe are the systematic names. 
- Each systematic of this type has a unique name, which is used - to correlated these systematics across datasets. + Each systematic of this type has a unique name, by returning + these systematics as a dataframe with the unique names as column + headers, we can easily join up the special_correlated systematics + from multiple datasets, retaining correlations, using + ``pandas.concat``. """ - systype = commondata.systype_table["name"].values + sys_name = commondata.systype_table["name"].to_numpy() # Dropping systematics that have type SKIP - sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] - systype = systype[systype != "SKIP"] + sys_errors_df = commondata.sys_errors.loc[:, sys_name != "SKIP"] + sys_name = sys_name[sys_name != "SKIP"] # Diagonal matrix containing the statistical uncertainty for each # data point - stat = commondata.stat_errors.values + stat = commondata.stat_errors.to_numpy() - # Systematic uncertainties converted to proper units (additives are left - # unchanged, multiplicative uncertainties are in percentage format so get - # multiplied by central value and divided by 100) - sys_mat = sys_errors.apply( + # Systematic uncertainties converted to absolute uncertainties (additives + # are left unchanged, multiplicative uncertainties are in percentage format + # so get multiplied by central value and divided by 100). 
+ abs_sys_errors_df = sys_errors_df.apply( lambda x: [ i.add if i.sys_type == "ADD" else (i.mult * j / 100) for i, j in zip(x, commondata.central_values) ] ) - # set columns for special_corr errors - sys_mat.columns = systype + abs_sys_errors = abs_sys_errors_df.to_numpy() - special = ["THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR"] + # set columns for special_correlated errors + abs_sys_errors_df.columns = sys_name + special_corr = abs_sys_errors_df.loc[:, ~np.isin(sys_name, INTRA_DATASET_SYS_NAME)] - special_corr = sys_mat.loc[:, ~np.isin(systype, special)] - thunc = sys_mat.loc[:, systype == "THEORYUNCORR"].values - unc = sys_mat.loc[:, systype == "UNCORR"].values - thcorr = sys_mat.loc[:, systype == "THEORYCORR"].values - corr = sys_mat.loc[:, systype == "CORR"].values + thunc = abs_sys_errors[:, sys_name == "THEORYUNCORR"] + unc = abs_sys_errors[:, sys_name == "UNCORR"] + thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"] + corr = abs_sys_errors[:, sys_name == "CORR"] return stat, unc, corr, thunc, thcorr, special_corr def covmat_from_systematics(commondata, use_theory_errors=True): - """Taking the tuple of statistical uncertainty and systematics returned by - :py:meth:`split_systematics` and construct the covariance matrix. uncorrelated - contributions from ``stat``, ``uncorrelated`` and ``theory_uncorr`` are - added in quadrature - to the diagonal of the covmat. The correlated systematics ``correlated``, - ``theory_corr`` and ``special_corr`` are multiplied by their transpose + """Given a single ``commondata``, obtain the tuple of statistical + uncertainty and systematics from :py:meth:`split_uncertainties` and + construct the covariance matrix. + Uncorrelated contributions from: ``stat``, ``uncorrelated`` and + ``theory_uncorrelated`` are added in quadrature + to the diagonal of the covmat. 
+ + The correlated systematics: ``correlated``, + ``theory_correlated`` and ``special_correlated`` are multiplied by their transpose to give contributions to both the diagonal and off diagonal components of the covmat. @@ -148,27 +163,27 @@ def covmat_from_systematics(commondata, use_theory_errors=True): [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ..., 4.14126235e-05, 4.15843357e-05, 1.43824457e-04]]) """ - stat, unc, corr, thunc, thcorr, special_corr = split_systematics(commondata) + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(commondata) special_vals = special_corr.values - cov_mat = ( - np.diag( - stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors - ) - + ( - corr @ corr.T + - thcorr @ thcorr.T * use_theory_errors + - special_vals @ special_vals.T - ) + cov_mat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + + special_vals @ special_vals.T ) return cov_mat -def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=True): + +def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True): """Given a list of commondata, construct the full covariance matrix. This is similar to :py:meth:`covmat_from_systematics` - except that ``special_corr`` systematics are concatenated across all datasets - before it is multiplied by its transpose to give off block-diagonal + except that ``special_corr`` is concatenated across all datasets + before being multiplied by its transpose to give off block-diagonal contributions. The other systematics contribute to the block diagonal in the same way as :py:meth:`covmat_from_systematics`. 
@@ -189,23 +204,21 @@ def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=Tr special_corrs = [] block_diags = [] for cd in list_of_commondata: - stat, unc, corr, thunc, thcorr, special_corr = split_systematics(cd) + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(cd) special_corrs.append(special_corr) - diag_covmat = ( - np.diag( - stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors - ) - + ( - corr @ corr.T + - thcorr @ thcorr.T * use_theory_errors - ) - ) + diag_covmat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + (corr @ corr.T + thcorr @ thcorr.T * use_theory_errors) block_diags.append(diag_covmat) special_sys = pd.concat(special_corrs, axis=0, sort=False) + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. special_sys.fillna(0, inplace=True) diag = la.block_diag(*block_diags) return diag + special_sys.values @ special_sys.values.T + def sqrt_covmat(covariance_matrix): """Function that computes the square root of the covariance matrix. 
diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 916e9805a5..4ca16649a9 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -15,7 +15,7 @@ from validphys.commondataparser import load_commondata from validphys.covmats import ( sqrt_covmat, - commondatas_covmat_from_systematics + datasets_covmat_from_systematics ) from validphys.loader import Loader from validphys.tests.conftest import THEORYID @@ -38,7 +38,7 @@ def test_covmat_from_systematics_correlated(): cds = list(map(l.check_commondata, correlated_datasets)) ld_cds = list(map(load_commondata, cds)) - covmat = commondatas_covmat_from_systematics(ld_cds) + covmat = datasets_covmat_from_systematics(ld_cds) dss = [ l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets @@ -66,7 +66,7 @@ def test_covmat_from_systematics(data_config): ] cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) - covmat = commondatas_covmat_from_systematics(cut_ld_cds) + covmat = datasets_covmat_from_systematics(cut_ld_cds) dss = [ l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") From b601377269cb560aa319cf92c5783a18cb274435 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 23 Oct 2020 00:29:15 +0100 Subject: [PATCH 3/4] Add back the docstring from shayan editing to make sense with slight reshuffling of functions --- validphys2/src/validphys/covmats.py | 82 ++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index c2da1a8859..5dd1dc2e8b 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -27,25 +27,73 @@ def split_uncertainties(commondata): - """Take the statistical uncertainty and systematics table from ``commondata`` + """Take the statistical uncertainty and systematics table from + a 
:py:class:`validphys.coredata.CommonData` object and split into the different different uncertainty archetypes each of which may be handled differently by future operations, such as constructing - the covariance matrix. + the covariance matrix. The logic of the different archetypes + is best described by the now deprecated C++ code: + + .. code-block:: c++ + auto CovMat = NNPDF::matrix(ndat, ndat); + for (int i = 0; i < ndat; i++) + { + for (int j = 0; j < ndat; j++) + { + double sig = 0.0; + double signor = 0.0; + // Statistical error + if (i == j) + sig += pow(stat_error[i],2); + for (int l = 0; l < nsys; l++) + { + sysError const& isys = systematic_errors[i][l]; + sysError const& jsys = systematic_errors[j][l]; + if (isys.name != jsys.name) + throw RuntimeException("ComputeCovMat", "Inconsistent naming of systematics"); + if (isys.name == "SKIP") + continue; // Continue if systype is skipped + if ((isys.name == "THEORYCORR" || isys.name == "THEORYUNCORR") && !use_theory_errors) + continue; // Continue if systype is theoretical and use_theory_errors == false + const bool is_correlated = ( isys.name != "UNCORR" && isys.name !="THEORYUNCORR"); + if (i == j || is_correlated) + switch (isys.type) + { + case ADD: sig += isys.add *jsys.add; break; + case MULT: if (mult_errors) { signor += isys.mult*jsys.mult; break; } + else { continue; } + case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); + } + } + // Covariance matrix entry + CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); + // Covariance matrix weight + CovMat(i, j) /= sqrt_weights[i]*sqrt_weights[j]; + } + } - First we remove any systematics which have name "SKIP" because, as the name - implies, these systematics will be skipped. + return CovMat; + } - Next the systematics are transformed into the correct units: + if the systematic of data point i has name ``SKIP`` we ignore + it. 
This is handled by scanning over all systematic errors in the + ``sys_errors`` dataframe and dropping any columns which correspond + to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe + defined below contains only the systematics without the ``SKIP`` name. + Note that in the switch statement an ADDitive or MULTiplicative systype + is handled by either multiplying the additive or multiplicative + uncertainties respectively. We convert uncertainties so that they are all + absolute: - Additive (ADD) systematics are left unchanged - multiplicative (MULT) systematics need to be converted from a - percentage into an uncertainty by multiplying by the central value + percentage by multiplying by the central value and dividing by 100. Finally, the systematics are split into the five possible archetypes of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and - special_correlated systematics + special_correlated systematics. Returns ------- @@ -110,17 +158,22 @@ def split_uncertainties(commondata): def covmat_from_systematics(commondata, use_theory_errors=True): - """Given a single ``commondata``, obtain the tuple of statistical - uncertainty and systematics from :py:meth:`split_uncertainties` and + """Given a single :py:class:`validphys.coredata.CommonData`, obtain the + tuple of statistical uncertainty and systematics from + :py:meth:`split_uncertainties` and construct the covariance matrix. + Uncorrelated contributions from: ``stat``, ``uncorrelated`` and ``theory_uncorrelated`` are added in quadrature to the diagonal of the covmat. - The correlated systematics: ``correlated``, - ``theory_correlated`` and ``special_correlated`` are multiplied by their transpose - to give contributions to both the diagonal and off diagonal components of - the covmat. 
+ From here it's a matter of staring at a piece of paper for a while to + realise the contribution to the covariance matrix arising due to + correlated systematics is schematically ``A_correlated @ A_correlated.T``, + where A_correlated is a matrix N_dat by N_sys. The total contribution + from correlated systematics is found by adding together the result of + multiplying each correlated systematic matrix by its transpose + (``correlated``, ``theory_correlated`` and ``special_correlated``). For more information on the generation of the covariance matrix see the `paper `_ @@ -179,7 +232,8 @@ def covmat_from_systematics(commondata, use_theory_errors=True): def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True): - """Given a list of commondata, construct the full covariance matrix. + """Given a list containing :py:class:`validphys.coredata.CommonData` s, + construct the full covariance matrix. This is similar to :py:meth:`covmat_from_systematics` except that ``special_corr`` is concatenated across all datasets From d8fb0437994a15b426e329daef72ea39c1f2934d Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 23 Oct 2020 00:33:38 +0100 Subject: [PATCH 4/4] undo erroneous removal of blank lines --- validphys2/src/validphys/covmats.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 5dd1dc2e8b..0de87dc2ec 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -35,16 +35,20 @@ def split_uncertainties(commondata): is best described by the now deprecated C++ code: ..
code-block:: c++ + auto CovMat = NNPDF::matrix(ndat, ndat); + for (int i = 0; i < ndat; i++) { for (int j = 0; j < ndat; j++) { double sig = 0.0; double signor = 0.0; + // Statistical error if (i == j) sig += pow(stat_error[i],2); + for (int l = 0; l < nsys; l++) { sysError const& isys = systematic_errors[i][l]; @@ -65,6 +69,7 @@ def split_uncertainties(commondata): case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); } } + // Covariance matrix entry CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); // Covariance matrix weight