diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py
index 2a9538c2ba..7fcdb9644e 100644
--- a/validphys2/src/validphys/commondataparser.py
+++ b/validphys2/src/validphys/commondataparser.py
@@ -92,158 +92,3 @@ def parse_systypes(systypefile):
     systypetable.set_index("sys_index", inplace=True)
     return systypetable
-
-
-def combine_commondata(commondata_list):
-    """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData`
-    objects and returns an effective :py:class:`validphys.coredata.CommonData` object
-    which has taken into account the (possible) correlations between the datasets. This
-    effective ``CommonData`` can then be used to obtain a correlated covariance matrix,
-    for example.
-
-    .. note::
-        The return of this function does not have a file on disk associated with it.
-        As such the ``CommonData`` instance has a ``setname`` of ``"N/A"`` and a
-        ``processtype`` of ``"MIXED"``.
-
-    Parameters
-    ----------
-    commondata_list: sequence
-        Sequence of :py:class:`validphys.coredata.CommonData` objects
-
-    Returns
-    -------
-    comb_cd: validphys.coredata.CommonData
-        The combined ``CommonData``
-
-    Raises
-    ------
-    ValueError: if one or more of the input ``CommonData`` objects have a different
-        number of kinematic variables
-
-    Example
-    -------
-    >>> from validphys.commondataparser import combine_commondata, load_commondata
-    >>> from validphys.loader import Loader
-    >>> l = Loader()
-    >>> cd1 = l.check_commondata("ATLASLOMASSDY11EXT")
-    >>> cd2 = l.check_commondata("ATLASZHIGHMASS49FB")
-    >>> ld1, ld2 = map(load_commondata, (cd1, cd2))
-    >>> comb_cd = combine_commondata((ld1, ld2))
-    # Note that the two datasets have one pair of correlated systematics
-    # so the total number of systematics is one fewer than the sum
-    >>> comb_cd.nsys
-    18
-    >>> ld1.nsys + ld2.nsys
-    19
-    >>> from validphys.covmats import covmat_from_systematics
-    >>> covmat = covmat_from_systematics(comb_cd)
-    >>> covmat.shape
-    (19, 19)
-    >>> ld1.ndata + ld2.ndata
-    19
-    """
-    # Raise an exception if any of the CommonDatas has
-    # a different number of kinematic variables to the others
-    # TODO: handle this case instead of raising an error
-    nkins = [cd.nkin for cd in commondata_list]
-    if not all(x == nkins[0] for x in nkins):
-        raise ValueError("One or more of the datasets have a different number of kinematics.")
-
-    # Combine the systematics from each of the CommonDatas
-    systematics, systype_table = combine_commondata_systematics(commondata_list)
-
-    # The number of data points is the sum of data points per CommonData while
-    # the systematics have correlations accounted for
-    ndata, nsys = systematics.shape
-
-    header = ['process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
-    kinematics = pd.concat([i.commondata_table.loc[:, header] for i in commondata_list], ignore_index=True)
-    kinematics.index += 1
-
-    additive = systematics.applymap(lambda x: x.add)
-    multiplicative = systematics.applymap(lambda x: x.mult)
-    additive.columns = (f"sys.add.{i + 1}" for i in range(nsys))
-    multiplicative.columns = (f"sys.mult.{i + 1}" for i in range(nsys))
-
-    # Table containing alternating additive and multiplicative systematic uncertainties
-    systematics_header = np.empty((additive.columns.size + multiplicative.columns.size,), dtype=object)
-    systematics_header[0::2], systematics_header[1::2] = additive.columns, multiplicative.columns
-    systematics_table = additive.join(multiplicative)[systematics_header]
-
-    commondata_table = pd.concat([kinematics, systematics_table], axis=1)
-
-    # The effective CommonData object to return. The set has no unique name or
-    # process type and so there is no file associated with this object.
-    comb_cd = CommonData("N/A", ndata, "MIXED", 3, nsys, commondata_table, systype_table)
-
-    return comb_cd
-
-
-def combine_commondata_systematics(commondata_list):
-    """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData`
-    and returns a tuple of ``pd.DataFrame``. The first ``DataFrame`` contains
-    :py:class:`validphys.coredata.SystematicError` which has accounted for the correlations
-    between systematics while the second contains information about the systematic errors
-    themselves such as their type and name.
-
-    Parameters
-    ----------
-    commondata_list: sequence
-        Sequence of :py:class:`validphys.coredata.CommonData` objects
-
-    Returns
-    -------
-    eff_sys_errors, systype_table: (pd.DataFrame, pd.DataFrame)
-        Tuple of DataFrames containing systematic errors and information about their name and type
-    """
-    special_types = frozenset({"UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR", "SKIP"})
-
-    # Create an effective systematic error table which is block diagonal
-    # where the off-diagonal blocks are filled with zeros as sentinel values.
-    eff_sys_errors = pd.DataFrame(la.block_diag(*[i.sys_errors for i in commondata_list]))
-    # Replace 0s with np.nan so we can use df.fillna later on
-    eff_sys_errors.replace(0, np.nan, inplace=True)
-    # Make index and columns start from 1
-    eff_sys_errors.index += 1
-    eff_sys_errors.columns += 1
-
-    systypes = pd.concat([cd.systype_table for cd in commondata_list], ignore_index=True)
-    systypes.index += 1
-
-    # If the systypes are duplicated and not one of the special
-    # types above, then those set of systematics are correlated
-    # and we keep the first occurence and replace each subsequent
-    # occurence with a sentinel value of None
-    duplicates = systypes.groupby("name").groups
-    # Remove the duplicates that are special types
-    [duplicates.pop(key, None) for key in special_types]
-
-    for cols in duplicates.values():
-        # The columns that were correlated
-        to_combine = eff_sys_errors.loc[:, cols]
-        # We backwards propagate the uncertainties
-        to_combine.fillna(method="bfill", axis=1, inplace=True)
-        # Replace the eff_sys_errors column with
-        # the systematics that were correlated
-        prepared_column = to_combine.iloc[:, 0]
-        eff_sys_errors.loc[:, cols[0]] = prepared_column
-
-    # The zeros are replaced with validphys.coredata.SystematicError types, we label the
-    # name as "SKIP" so any future logic knows that these systematic errors should be ignored
-    dummy_error = SystematicError(0, 0, None, "SKIP")
-    eff_sys_errors.fillna(dummy_error, inplace=True)
-
-    keep_columns = (systypes["name"].isin(special_types)) | ~(
-        systypes.duplicated(subset="name", keep="first")
-    )
-    keep_columns.index = eff_sys_errors.columns
-    eff_sys_errors = eff_sys_errors.loc[:, keep_columns]
-    # After slicing with keep_columns, the index
-    # of the systematic errors need to be reset
-    eff_sys_errors.columns = range(1, len(eff_sys_errors.columns) + 1)
-
-    systype_table = systypes[keep_columns]
-    systype_table.index = range(1, len(eff_sys_errors.columns) + 1)
-
-    return eff_sys_errors, systype_table
diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 6126a14f34..0de87dc2ec 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -23,10 +23,15 @@
 
 log = logging.getLogger(__name__)
 
+INTRA_DATASET_SYS_NAME = ("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR")
("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR") -def covmat_from_systematics(commondata): - """Function that takes in a :py:class:`validphys.coredata.CommonData` object - and generates the associated covariance matrix using the systematics. The logic + +def split_uncertainties(commondata): + """Take the statistical uncertainty and systematics table from + a :py:class:`validphys.coredata.CommonData` object + and split into the different different uncertainty archetypes each of + which may be handled differently by future operations, such as constructing + the covariance matrix. The logic of the different archetypes is best described by the now deprecated C++ code: .. code-block:: c++ @@ -75,35 +80,108 @@ def covmat_from_systematics(commondata): return CovMat; } - We see that for the diagonal elements of the covariance matrix, we must add the squared - statistical error. This is done by creating a diagonal matrix from the vector of statistical - errors, called ``stat_mat``, and squaring the matrix before the return line. - - We now pick datapoint i and loop over all other datapoints j. For each such pair of points, - loop over all the systematics, if the l'th systematic of data point i has name ``SKIP`` we ignore - it. This is handled by scanning over all sytematic errors in the ``sys_errors`` dataframe and dropping - any columns which correspond to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe + if the systematic of data point i has name ``SKIP`` we ignore + it. This is handled by scanning over all sytematic errors in the + ``sys_errors`` dataframe and dropping any columns which correspond + to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe defined below contains only the systematics without the ``SKIP`` name. - Note that in the switch statement an ADDitive or MULTiplicative systype of systematic i is handled - by either multiplying the additive or multiplicative uncertainties respectively. We instead opt to - create a purely additive systematic table where all elements are to be understood as additive - systematics and systematics of type MULT have been correctly transformed to their additive - values. This dataframe we call ``additive_mat``. + Note that in the switch statement an ADDitive or MULTiplicative systype + is handled by either multiplying the additive or multiplicative + uncertainties respectively. We convert uncertainties so that they are all + absolute: + - Additive (ADD) systematics are left unchanged + - multiplicative (MULT) systematics need to be converted from a + percentage by multiplying by the central value + and dividing by 100. + + Finally, the systematics are split into the five possible archetypes + of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), + theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and + special_correlated systematics. - Next we notice that a systematic is correlated if its name is neither ``UNCORR`` nor ``THEORYUNCORR``. We - create a dataframe called ``correlated_mat`` where any column corresponding to an ``UNCORR`` or - ``THEORYUNCORR`` systematic is set to 0. + Returns + ------- + uncertainties: tuple + stat: np.array + 1-D array containing statistical uncertainties + uncorrelated: np.2darray + numpy array of uncorrelated systematics, can be empty if there + are no uncorrelated systematics. 
+            These are semantically similar
+            to statistical errors, and only contribute to the diagonal
+            of the covariance matrix, because they are uncorrelated across
+            data points.
+        correlated: np.ndarray
+            Similar to uncorrelated except they can be correlated across
+            data points.
+        theory_uncorrelated: np.ndarray
+            2-D array of uncorrelated "theory" uncertainties which
+            are not to be confused with theory covariance uncertainties.
+            Instead they are, for example, uncertainties in the c-factors.
+        theory_correlated: np.ndarray
+            Same as theory_uncorrelated except they can be correlated across
+            data points.
+        special_correlated: pd.DataFrame
+            Dataframe of the systematics which can be correlated across
+            datasets; the columns of the dataframe are the systematic names.
+            Each systematic of this type has a unique name. By returning
+            these systematics as a dataframe with the unique names as column
+            headers, we can easily join up the special_correlated systematics
+            from multiple datasets, retaining correlations, using
+            ``pandas.concat``.
 
-    From here it's a matter of staring at a piece of paper for a while to realise the contribution to the
-    covariance matrix arising due to correlated systematics is ``correlated_mat @ additive_mat.T``. Note
-    that in the case where ``i == j`` is met, we double count this contribution using this method, so we set
-    the diagonal elements of this particular contribution to 0 by using ``np.fill_diagonal``.
+    """
+    sys_name = commondata.systype_table["name"].to_numpy()
+    # Dropping systematics that have type SKIP
+    sys_errors_df = commondata.sys_errors.loc[:, sys_name != "SKIP"]
+    sys_name = sys_name[sys_name != "SKIP"]
 
-    Finally, the ``i == j`` case is handled by summing the square row elements of ``additive_mat``, this is
-    done by using the ``np.einsum`` function.
+    # Statistical uncertainty for each data point
+    stat = commondata.stat_errors.to_numpy()
 
-    For more information on the generation of the covariance matrix see the `paper `_
+    # Systematic uncertainties converted to absolute uncertainties (additives
+    # are left unchanged, multiplicative uncertainties are in percentage format
+    # so get multiplied by central value and divided by 100).
+    abs_sys_errors_df = sys_errors_df.apply(
+        lambda x: [
+            i.add if i.sys_type == "ADD" else (i.mult * j / 100)
+            for i, j in zip(x, commondata.central_values)
+        ]
+    )
+    abs_sys_errors = abs_sys_errors_df.to_numpy()
+
+    # Set column names so special_correlated errors can be joined across datasets
+    abs_sys_errors_df.columns = sys_name
+    special_corr = abs_sys_errors_df.loc[:, ~np.isin(sys_name, INTRA_DATASET_SYS_NAME)]
+
+    thunc = abs_sys_errors[:, sys_name == "THEORYUNCORR"]
+    unc = abs_sys_errors[:, sys_name == "UNCORR"]
+    thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"]
+    corr = abs_sys_errors[:, sys_name == "CORR"]
+    return stat, unc, corr, thunc, thcorr, special_corr
+
+
+def covmat_from_systematics(commondata, use_theory_errors=True):
+    """Given a single :py:class:`validphys.coredata.CommonData`, obtain the
+    tuple of statistical uncertainty and systematics from
+    :py:meth:`split_uncertainties` and
+    construct the covariance matrix.
+
+    Uncorrelated contributions from ``stat``, ``uncorrelated`` and
+    ``theory_uncorrelated`` are added in quadrature
+    to the diagonal of the covmat.
+
+    From here it's a matter of staring at a piece of paper for a while to
+    realise the contribution to the covariance matrix arising due to
+    correlated systematics is schematically ``A_correlated @ A_correlated.T``,
+    where ``A_correlated`` is an N_dat by N_sys matrix. The total contribution
+    from correlated systematics is found by adding together the result of
+    multiplying each correlated systematic matrix by its transpose
+    (``correlated``, ``theory_correlated`` and ``special_correlated``).
+
+    For more information on the generation of the covariance matrix see the
+    `paper `_
     outlining the procedure, specifically equation 2 and surrounding text.
 
     Paramaters
@@ -111,6 +189,8 @@ def covmat_from_systematics(commondata):
     commondata : validphys.coredata.CommonData
         CommonData which stores information about systematic errors,
        their treatment and description.
+    use_theory_errors: bool
+        Whether or not to include errors with name ``THEORY*`` in the covmat
 
     Returns
     -------
@@ -141,47 +221,61 @@ def covmat_from_systematics(commondata):
            [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ...,
             4.14126235e-05, 4.15843357e-05, 1.43824457e-04]])
     """
-    systype = commondata.systype_table["name"]
-    # Dropping systematics that have type SKIP
-    sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"]
+    stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(commondata)
+
+    special_vals = special_corr.values
+    cov_mat = np.diag(
+        stat ** 2
+        + (unc ** 2).sum(axis=1)
+        + (thunc ** 2).sum(axis=1) * use_theory_errors
+    ) + (
+        corr @ corr.T
+        + thcorr @ thcorr.T * use_theory_errors
+        + special_vals @ special_vals.T
+    )
+    return cov_mat
 
-    # Diagonal matrix containing the statistical uncertainty for each
-    # data point
-    stat_mat = np.diag(commondata.stat_errors.to_numpy())
 
-    # Systematic uncertainties converted to additive uncertainties
-    additive_mat = sys_errors.apply(
-        lambda x: [
-            i.add if i.sys_type == "ADD" else (i.mult * j / 100)
-            for i, j in zip(x, commondata.central_values)
-        ]
-    )
+def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True):
+    """Given a list of :py:class:`validphys.coredata.CommonData` objects,
+    construct the full covariance matrix.
 
-    # Matrix where all non-zero values are due to correlated systematics
-    correlated_mat = additive_mat.copy()
-    # If all systypes were SKIP then we'd have an empty df
-    if not correlated_mat.empty:
-        correlated_mat.loc[:, (systype == "UNCORR") | (systype == "THEORYUNCORR")] = 0
-        correlated_mat = correlated_mat.to_numpy()
-        additive_mat = additive_mat.to_numpy()
-    else:
-        # Some datasets have zero systematic uncertainties.
-        # We treat this special edge case by creating
-        # a matrix of zeros so it can be broadcast
-        # with the statistical error matrix at the end
-        correlated_mat = additive_mat = np.zeros_like(stat_mat)
-
-    # Avoid double counting in case that the i = j element is a correlated
-    # systematic. We take care of this case in the einsum below
-    off_diagonals = correlated_mat @ additive_mat.T
-    np.fill_diagonal(off_diagonals, 0)
-
-    cov_mat = (
-        stat_mat ** 2
-        + off_diagonals
-        + np.diag(np.einsum("ij, ij -> i", additive_mat, additive_mat))
-    )
-    return cov_mat
+    This is similar to :py:meth:`covmat_from_systematics`
+    except that ``special_corr`` is concatenated across all datasets
+    before being multiplied by its transpose to give the off-block-diagonal
+    contributions. The other systematics contribute to the block diagonal in the
+    same way as :py:meth:`covmat_from_systematics`.
+
+    Parameters
+    ----------
+    list_of_commondata : list[validphys.coredata.CommonData]
+        List of CommonData objects.
+    use_theory_errors: bool
+        Whether or not to include errors with name ``THEORY*`` in the covmat
+
+    Returns
+    -------
+    cov_mat : np.array
+        Numpy array which is N_dat x N_dat (where N_dat is the number of data
+        points after cuts) containing uncertainty and correlation information.
+
+    """
+    special_corrs = []
+    block_diags = []
+    for cd in list_of_commondata:
+        stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(cd)
+        special_corrs.append(special_corr)
+        diag_covmat = np.diag(
+            stat ** 2
+            + (unc ** 2).sum(axis=1)
+            + (thunc ** 2).sum(axis=1) * use_theory_errors
+        ) + (corr @ corr.T + thcorr @ thcorr.T * use_theory_errors)
+        block_diags.append(diag_covmat)
+    special_sys = pd.concat(special_corrs, axis=0, sort=False)
+    # Non-overlapping systematics are set to NaN by concat; fill with 0 instead.
+    special_sys.fillna(0, inplace=True)
+    diag = la.block_diag(*block_diags)
+    return diag + special_sys.values @ special_sys.values.T
 
 
 def sqrt_covmat(covariance_matrix):
diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py
index 9266a502b8..4ca16649a9 100644
--- a/validphys2/src/validphys/tests/test_covmats.py
+++ b/validphys2/src/validphys/tests/test_covmats.py
@@ -12,10 +12,12 @@
 
 from validphys.api import API
-from validphys.commondataparser import combine_commondata, load_commondata
-from validphys.covmats import covmat_from_systematics, sqrt_covmat
+from validphys.commondataparser import load_commondata
+from validphys.covmats import (
+    sqrt_covmat,
+    datasets_covmat_from_systematics
+)
 from validphys.loader import Loader
-from validphys.tableloader import parse_exp_mat
 from validphys.tests.conftest import THEORYID
 
 REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions")
@@ -36,8 +38,7 @@ def test_covmat_from_systematics_correlated():
     cds = list(map(l.check_commondata, correlated_datasets))
     ld_cds = list(map(load_commondata, cds))
 
-    combo_cd = combine_commondata(ld_cds)
-    covmat = covmat_from_systematics(combo_cd)
+    covmat = datasets_covmat_from_systematics(ld_cds)
 
     dss = [
         l.check_dataset(i, theoryid=THEORYID, cuts=None)
         for i in correlated_datasets
@@ -45,7 +46,7 @@
     ]
     exp = l.check_experiment("null", dss)
     ld_exp = exp.load()
-    assert np.allclose(ld_exp.get_covmat(), covmat)
+    np.testing.assert_allclose(ld_exp.get_covmat(), covmat)
 
 
 def test_covmat_from_systematics(data_config):
@@ -64,9 +65,8 @@ def test_covmat_from_systematics(data_config):
         for cd in cds
    ]
     cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts)))
-    combo_cd = combine_commondata(cut_ld_cds)
 
-    covmat = covmat_from_systematics(combo_cd)
+    covmat = datasets_covmat_from_systematics(cut_ld_cds)
 
     dss = [
         l.check_dataset(i.name, theoryid=THEORYID, cuts="internal")
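
Usage sketch: how the new ``datasets_covmat_from_systematics`` entry point
replaces the removed ``combine_commondata`` + ``covmat_from_systematics``
pipeline. The two ATLAS dataset names are reused from the removed
``combine_commondata`` docstring and are illustrative only; the assertion
checks shapes, not values.

    from validphys.commondataparser import load_commondata
    from validphys.covmats import (
        covmat_from_systematics,
        datasets_covmat_from_systematics,
    )
    from validphys.loader import Loader

    l = Loader()
    # Two datasets that share a pair of correlated systematics
    cds = map(l.check_commondata, ("ATLASLOMASSDY11EXT", "ATLASZHIGHMASS49FB"))
    ld1, ld2 = map(load_commondata, cds)

    # Per-dataset covariance matrices: the block-diagonal pieces only
    block1 = covmat_from_systematics(ld1)
    block2 = covmat_from_systematics(ld2)

    # Full covariance matrix: block diagonal plus off-block-diagonal
    # contributions from the concatenated special_correlated systematics
    total = datasets_covmat_from_systematics([ld1, ld2])
    assert total.shape == (ld1.ndata + ld2.ndata, ld1.ndata + ld2.ndata)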