237 changes: 173 additions & 64 deletions validphys2/src/validphys/covmats.py
@@ -1,6 +1,7 @@
"""Module for handling logic and manipulation of covariance and correlation
matrices on different levels of abstraction
"""
from collections import namedtuple
import logging

import numpy as np
@@ -23,10 +24,19 @@

log = logging.getLogger(__name__)

INTRA_DATASET_SYS_NAME = ("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR")

def covmat_from_systematics(commondata):
"""Function that takes in a :py:class:`validphys.coredata.CommonData` object
and generates the associated covariance matrix using the systematics. The logic
Uncertainties = namedtuple(
    "Uncertainties", ("stat", "unc", "corr", "thunc", "thcorr", "special")
)


def split_uncertainties(commondata):
"""Take the statistical uncertainty and systematics table from
a :py:class:`validphys.coredata.CommonData` object
Contributor:

Could you please add this as a documented parameter? The reason is that I don't know how to read and didn't see it on the first pass (or a type annotation, it would have some value here).

Contributor:

If I run something like

In [22]: cd = l.check_commondata("NMC")

In [23]: lcd = load_commondata(cd)

In [24]: %timeit split_uncertainties(lcd)
293 ms ± 3.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

which is a bit much (or maybe I just need a new laptop). Profiling this a bit and seeing e.g. if some kind of group-by operation (possibly supplemented by indexing the original coredata dataframe by the systype) would be more efficient than the repeated use of things like

    thunc = abs_sys_errors[:, sys_name == "THEORYUNCORR"]
    unc = abs_sys_errors[:, sys_name == "UNCORR"]
    thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"]
    corr = abs_sys_errors[:, sys_name == "CORR"]

would definitely be in the nice-to-have category.

Contributor Author:

Perhaps we could use an ``isin``; let me take a look. BTW, what did you use to profile the function?

Contributor:

Just the %timeit magic of IPython.

Contributor:

Hmm, the awkward thing with this indexing is dealing with the cases when the key isn't present. The groupby seems promising, i.e.

isspecial = lambda x: x if x in INTRA_DATASET_SYS_NAME else "SPECIAL"
split_dict = {group: df for group, df in
    abs_sys_errors_df.groupby(by=isspecial, axis=1)}

But perhaps the covmat can be constructed directly from the groupby rather than having this split uncertainty function?

I also wonder if this is a little slow:

    abs_sys_errors_df = sys_errors_df.apply(
        lambda x: [
            i.add if i.sys_type == "ADD" else (i.mult * j / 100)
            for i, j in zip(x, commondata.central_values)
        ]
    )

Although I don't see much alternative with the current way we store the errors. I do half wonder whether storing the uncertainties as two dataframes (one for mult, one for add) with sysnames as the column index, where the mult dataframe holds just the percentages (as raw numbers) and the additive one holds the absolute values, would make slightly more sense in the context of what we do with the systematics?

The current method of storage is quite fancy, but the raw data we want to access ends up nested quite deep.

Contributor Author:

Ah apologies, the posts crossed. Ok, I guess there is no way to avoid this? So we could speed up the df.apply then, since it seems to be the bottleneck

Contributor Author (@siranipour, Nov 4, 2020):

So an alternative to the df.apply is the following, but it doesn't give all that big a saving:

    sys_type = commondata.systype_table["type"]
    mult_errors = sys_type == "MULT"
    mult_sys_errors = sys_errors_df.loc[:, mult_errors].applymap(lambda x: x.mult)
    converted_mult_sys_errors = mult_sys_errors.multiply(commondata.central_values, axis=0) / 100
    abs_sys_errors_df = sys_errors_df.applymap(lambda x: x.add)
    abs_sys_errors_df.loc[:, mult_errors] = converted_mult_sys_errors

lmk what you think

Edit: in fact, if anything, it's slower....

Contributor (@wilsonmr, Nov 4, 2020):

Hmm, well, returning to my previous comment, I wonder if a slight refactoring of how the systematics are stored would benefit us here? AFAIK ``applymap`` is a ``for column in columns: map(func, column)``, so it's not super far away from a nested for loop, which I think we want to avoid in python. When I think about the systematic file, it's unclear to me why we want every element to be an object, because all elements in the same column have the same systematic name and type. When we load the data we have to manually convert it into this format only to have to unpack it like this, which seems a little suboptimal.

In the end there is this historic doubling of information in the commondata files; however, changing that would involve changing buildmaster, which I don't think anybody wants to do. With that in mind, I think that storing just the multiplicative columns for MULT uncertainties and the additive (or absolute) uncertainties for the ADD uncertainties as dataframes, with the sysname as column index, would mean we could then do something like:

mat = np.diag(commondata.stat_errors.to_numpy() ** 2)
is_special = lambda x: x if x in INTRA_DATASET_SYS_NAME else "SPECIAL"
converted_mult = commondata.mult * central_values[:, np.newaxis] / 100
for abs_sys_df in (converted_mult, commondata.add):
    for sys_name, sys_table in abs_sys_df.groupby(by=is_special, axis=1):
        if sys_name == "UNCORR":
            mat += np.diag((sys_table.values ** 2).sum(axis=1))
    ...

Which I think would be quite a bit faster than what we have, because we convert the mult uncertainties in a vectorised way and we pick out the different cases using groupby. BTW we don't even have to drop SKIP here, we just don't add it to the mat; in some sense this is more similar to the C++ code, but we avoid explicit nested loops (and let numpy handle it).

Contributor:

I should note that ``INTRA_DATASET_SYS_NAME`` would need to include ``SKIP`` in that example, or else those systematics would erroneously be treated as experimental uncertainties.

Contributor:

Another note: the ``sys_errors`` property of ``coredata.CommonData`` calls the function which uses a nested for loop to construct the complicated format, so I wonder whether it's really the indexing, or ``sys_errors`` taking ages to construct.

and split into the different uncertainty archetypes, each of
which may be handled differently by future operations, such as constructing
the covariance matrix. The logic of the different archetypes
is best described by the now deprecated C++ code:

.. code-block:: c++
@@ -75,42 +85,127 @@ def covmat_from_systematics(commondata):
return CovMat;
}

We see that for the diagonal elements of the covariance matrix, we must add the squared
statistical error. This is done by creating a diagonal matrix from the vector of statistical
errors, called ``stat_mat``, and squaring the matrix before the return line.

We now pick datapoint i and loop over all other datapoints j. For each such pair of points,
loop over all the systematics, if the l'th systematic of data point i has name ``SKIP`` we ignore
it. This is handled by scanning over all systematic errors in the ``sys_errors`` dataframe and dropping
any columns which correspond to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe
if the systematic of data point i has name ``SKIP`` we ignore
it. This is handled by scanning over all systematic errors in the
``sys_errors`` dataframe and dropping any columns which correspond
to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe
defined below contains only the systematics without the ``SKIP`` name.

Note that in the switch statement an ADDitive or MULTiplicative systype of systematic i is handled
by either multiplying the additive or multiplicative uncertainties respectively. We instead opt to
create a purely additive systematic table where all elements are to be understood as additive
systematics and systematics of type MULT have been correctly transformed to their additive
values. This dataframe we call ``additive_mat``.
Note that in the switch statement an ADDitive or MULTiplicative systype
is handled by either multiplying the additive or multiplicative
uncertainties respectively. We convert uncertainties so that they are all
absolute:
- Additive (ADD) systematics are left unchanged
- Multiplicative (MULT) systematics need to be converted from a
  percentage by multiplying by the central value and dividing by 100.

Finally, the systematics are split into the five possible archetypes
of systematic uncertainties: uncorrelated (UNCORR), correlated (CORR),
theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and
special_correlated systematics.


Parameters
----------
commondata: :py:class:`validphys.coredata.CommonData`
A commondata object containing information on the data central values, statistical and
systematic uncertainties, and information on their correlations.

Returns
-------
uncertainties: :py:class:`validphys.covmats.Uncertainties`
A ``namedtuple`` with the following attributes

stat: np.array
1-D array containing statistical uncertainties
unc: np.2darray
numpy array of uncorrelated systematics, can be empty if there
are no uncorrelated systematics. These are semantically similar
to statistical errors, and only contribute to the diagonal component
of the covariance matrix, because they are uncorrelated across
data points.
corr: np.2darray
Similar to uncorrelated except they can be correlated across
data points.
thunc: np.2darray
numpy array of uncorrelated "theory" uncertainties which
are not to be confused with theory covariance uncertainties.
Instead they are, for example, uncertainties in the c-factors.
thcorr: np.2darray
same as theory_uncorrelated except they can be correlated across
data points.
special: pd.DataFrame
Dataframe of the systematics which can be correlated across
datasets, the columns of the dataframe are the systematic names.
Each systematic of this type has a unique name; by returning
these systematics as a dataframe with the unique names as column
headers, we can easily join up the special_correlated systematics
from multiple datasets, retaining correlations, using
``pandas.concat``.
"""
    sys_name = commondata.systype_table["name"].to_numpy()
    # Dropping systematics whose name is SKIP
    sys_errors_df = commondata.sys_errors.loc[:, sys_name != "SKIP"]
    sys_name = sys_name[sys_name != "SKIP"]

    # 1-D array containing the statistical uncertainty for each
    # data point
    stat = commondata.stat_errors.to_numpy()

    # Systematic uncertainties converted to absolute uncertainties (additives
    # are left unchanged, multiplicative uncertainties are in percentage format
    # so get multiplied by the central value and divided by 100).
    abs_sys_errors_df = sys_errors_df.apply(
        lambda x: [
            i.add if i.sys_type == "ADD" else (i.mult * j / 100)
            for i, j in zip(x, commondata.central_values)
        ]
    )
    abs_sys_errors = abs_sys_errors_df.to_numpy()

    # Set columns for special_correlated errors
    abs_sys_errors_df.columns = sys_name
    special_corr = abs_sys_errors_df.loc[:, ~np.isin(sys_name, INTRA_DATASET_SYS_NAME)]

    thunc = abs_sys_errors[:, sys_name == "THEORYUNCORR"]
    unc = abs_sys_errors[:, sys_name == "UNCORR"]
    thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"]
    corr = abs_sys_errors[:, sys_name == "CORR"]

    uncertainties = Uncertainties(stat, unc, corr, thunc, thcorr, special_corr)
    return uncertainties

Next we notice that a systematic is correlated if its name is neither ``UNCORR`` nor ``THEORYUNCORR``. We
create a dataframe called ``correlated_mat`` where any column corresponding to an ``UNCORR`` or
``THEORYUNCORR`` systematic is set to 0.

From here it's a matter of staring at a piece of paper for a while to realise the contribution to the
covariance matrix arising due to correlated systematics is ``correlated_mat @ additive_mat.T``. Note
that in the case where ``i == j`` is met, we double count this contribution using this method, so we set
the diagonal elements of this particular contribution to 0 by using ``np.fill_diagonal``.
def covmat_from_systematics(commondata, use_theory_errors=True):
"""Given a single :py:class:`validphys.coredata.CommonData`, obtain the
tuple of statistical uncertainty and systematics from
:py:meth:`split_uncertainties` and
construct the covariance matrix.

Finally, the ``i == j`` case is handled by summing the square row elements of ``additive_mat``, this is
done by using the ``np.einsum`` function.
Uncorrelated contributions from ``stat``, ``uncorrelated`` and
``theory_uncorrelated`` are added in quadrature
to the diagonal of the covmat.

For more information on the generation of the covariance matrix see the `paper <https://arxiv.org/pdf/hep-ph/0501067.pdf>`_
From here it's a matter of staring at a piece of paper for a while to
realise the contribution to the covariance matrix arising due to
correlated systematics is schematically ``A_correlated @ A_correlated.T``,
where ``A_correlated`` is an N_dat by N_sys matrix. The total contribution
from correlated systematics is found by adding together the result of
multiplying each correlated systematic matrix by its transpose
(``correlated``, ``theory_correlated`` and ``special_correlated``).

For more information on the generation of the covariance matrix see the
`paper <https://arxiv.org/pdf/hep-ph/0501067.pdf>`_
outlining the procedure, specifically equation 2 and surrounding text.

Parameters
----------
commondata : validphys.coredata.CommonData
CommonData which stores information about systematic errors,
their treatment and description.
use_theory_errors: bool
Whether or not to include errors with name ``THEORY*`` in the covmat

Returns
-------
@@ -141,47 +236,61 @@ def covmat_from_systematics(commondata):
[3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ...,
4.14126235e-05, 4.15843357e-05, 1.43824457e-04]])
"""
    systype = commondata.systype_table["name"]
    # Dropping systematics that have type SKIP
    sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"]
    stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(commondata)

    special_vals = special_corr.values
    cov_mat = np.diag(
        stat ** 2
        + (unc ** 2).sum(axis=1)
        + (thunc ** 2).sum(axis=1) * use_theory_errors
    ) + (
        corr @ corr.T
        + thcorr @ thcorr.T * use_theory_errors
        + special_vals @ special_vals.T
    )
    return cov_mat

    # Diagonal matrix containing the statistical uncertainty for each
    # data point
    stat_mat = np.diag(commondata.stat_errors.to_numpy())

    # Systematic uncertainties converted to additive uncertainties
    additive_mat = sys_errors.apply(
        lambda x: [
            i.add if i.sys_type == "ADD" else (i.mult * j / 100)
            for i, j in zip(x, commondata.central_values)
        ]
    )
def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True):
"""Given a list containing :py:class:`validphys.coredata.CommonData` s,
construct the full covariance matrix.

    # Matrix where all non-zero values are due to correlated systematics
    correlated_mat = additive_mat.copy()
    # If all systypes were SKIP then we'd have an empty df
    if not correlated_mat.empty:
        correlated_mat.loc[:, (systype == "UNCORR") | (systype == "THEORYUNCORR")] = 0
        correlated_mat = correlated_mat.to_numpy()
        additive_mat = additive_mat.to_numpy()
    else:
        # Some datasets have zero systematic uncertainties.
        # We treat this special edge case by creating
        # a matrix of zeros so it can be broadcast
        # with the statistical error matrix at the end
        correlated_mat = additive_mat = np.zeros_like(stat_mat)

    # Avoid double counting in case that the i = j element is a correlated
    # systematic. We take care of this case in the einsum below
    off_diagonals = correlated_mat @ additive_mat.T
    np.fill_diagonal(off_diagonals, 0)

    cov_mat = (
        stat_mat ** 2
        + off_diagonals
        + np.diag(np.einsum("ij, ij -> i", additive_mat, additive_mat))
    )
    return cov_mat
This is similar to :py:meth:`covmat_from_systematics`
except that ``special_corr`` is concatenated across all datasets
before being multiplied by its transpose to give off-block-diagonal
contributions. The other systematics contribute to the block diagonal in the
same way as :py:meth:`covmat_from_systematics`.

Parameters
----------
list_of_commondata : list[validphys.coredata.CommonData]
list of CommonData objects.
use_theory_errors: bool
Whether or not to include errors with name ``THEORY*`` in the covmat

Returns
-------
cov_mat : np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
containing uncertainty and correlation information.

"""
    special_corrs = []
    block_diags = []
    for cd in list_of_commondata:
        stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(cd)
        special_corrs.append(special_corr)
        diag_covmat = np.diag(
            stat ** 2
            + (unc ** 2).sum(axis=1)
            + (thunc ** 2).sum(axis=1) * use_theory_errors
        ) + (corr @ corr.T + thcorr @ thcorr.T * use_theory_errors)
        block_diags.append(diag_covmat)
    special_sys = pd.concat(special_corrs, axis=0, sort=False)
    # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
    special_sys.fillna(0, inplace=True)
    diag = la.block_diag(*block_diags)
    return diag + special_sys.values @ special_sys.values.T


def sqrt_covmat(covariance_matrix):
29 changes: 29 additions & 0 deletions validphys2/src/validphys/tests/conftest.py
@@ -34,6 +34,16 @@ def tmp(tmpdir):
'datasets': [{'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}]}
]

# Experiments which have non-trivial correlations between their datasets
CORR_DATA = [
    {'dataset': 'ATLASWZRAP36PB', 'cfac': ['QCD']},
    {'dataset': 'ATLASZHIGHMASS49FB', 'cfac': ['QCD']},
    {'dataset': 'ATLASLOMASSDY11EXT', 'cfac': ['QCD']},
    {'dataset': 'ATLASWZRAP11', 'frac': 0.5, 'cfac': ['QCD']},
    {'dataset': 'CMSZDIFF12', 'cfac': ('QCD', 'NRM'), 'sys': 10},
    {'dataset': 'CMSJETS11', 'frac': 0.5, 'sys': 10},
]

SINGLE_EXP = [
    {
        'experiment': 'pseudo experiment',
Expand Down Expand Up @@ -69,6 +79,12 @@ def tmp(tmpdir):
def data_config():
    return base_config

@pytest.fixture(scope='module')
def data_internal_cuts_config(data_config):
    config_dict = dict(data_config)
    config_dict.update(use_cuts='internal')
    return config_dict

@pytest.fixture(scope='module')
def data_witht0_config():
    config_dict = dict(
@@ -83,6 +99,19 @@ def data_singleexp_witht0_config(data_witht0_config):
    config_dict.update({'experiments': SINGLE_EXP})
    return config_dict

@pytest.fixture(scope='module')
def data_with_correlations_config():
    corr_dict = dict(base_config)
    corr_dict.pop("experiments")
    corr_dict.update(dataset_inputs=CORR_DATA)
    return corr_dict

@pytest.fixture(scope='module')
def data_with_correlations_internal_cuts_config(data_with_correlations_config):
    config_dict = dict(data_with_correlations_config)
    config_dict.update(use_cuts='internal')
    return config_dict

@pytest.fixture(scope='module')
def weighted_data_witht0_config(data_witht0_config):
    config_dict = dict(data_witht0_config)
48 changes: 45 additions & 3 deletions validphys2/src/validphys/tests/test_covmats.py
@@ -3,15 +3,57 @@

Tests related to the computation of the covariance matrix and its derivatives
"""
import random

import pytest

import numpy as np
import random


from validphys.api import API
from validphys.commondataparser import load_commondata
from validphys.covmats import (
    sqrt_covmat,
    datasets_covmat_from_systematics
)
from validphys.loader import Loader
from validphys.tests.conftest import THEORYID
from validphys.api import API
from validphys.covmats import sqrt_covmat


def test_covmat_from_systematics_correlated(data_with_correlations_config):
    """Test the covariance matrix generation from a set of correlated datasets
    given their systematic errors.
    """
    data = API.data(**data_with_correlations_config)
    cds = [ds.commondata for ds in data.datasets]

    ld_cds = list(map(load_commondata, cds))

    covmat = datasets_covmat_from_systematics(ld_cds)

    cpp_covmat = API.groups_covmat(**data_with_correlations_config)

    np.testing.assert_allclose(cpp_covmat, covmat)


def test_covmat_from_systematics(data_internal_cuts_config):
    """Test that the python computation of the covmat for a collection of
    datasets matches the C++ computation. Note that the datasets are cut
    using the internal rules, but the datasets are not correlated.
    """
    data = API.data(**data_internal_cuts_config)
    cds = [ds.commondata for ds in data.datasets]

    ld_cds = list(map(load_commondata, cds))

    internal_cuts = [ds.cuts for ds in data.datasets]
    cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts)))

    covmat = datasets_covmat_from_systematics(cut_ld_cds)

    cpp_covmat = API.groups_covmat(**data_internal_cuts_config)

    np.testing.assert_allclose(cpp_covmat, covmat)


def test_cpp_sqrtcovmat():