From 3b44fc85df9cac3a7de7780fdffe965a05226f5d Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 14 Oct 2020 19:08:12 +0100 Subject: [PATCH 1/4] propose way of constructing covmat of list of commondata objects, without creating fake commondata object of multiple datasets --- validphys2/src/validphys/commondataparser.py | 155 ----------- validphys2/src/validphys/covmats.py | 242 ++++++++++-------- .../src/validphys/tests/test_covmats.py | 16 +- 3 files changed, 140 insertions(+), 273 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 2a9538c2ba..7fcdb9644e 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -92,158 +92,3 @@ def parse_systypes(systypefile): systypetable.set_index("sys_index", inplace=True) return systypetable - - -def combine_commondata(commondata_list): - """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` - objects and returns an effective :py:class:`validphys.coredata.CommonData` object - which has taken into account the (possible) correlations between the datasets. This - effective ``CommonData`` can then be used to obtain a correlated covariance matrix, - for example. - - .. note:: - The return of this function does not have a file on disk associated with it. - As such the ``CommonData`` instance has a ``setname`` of ``"N/A"`` and a - ``processtype`` of ``"MIXED"``. 
- - Parameters - ---------- - commondata_list: sequence - Sequence of :py:class:`validphys.coredata.CommonData` objects - - Returns - ------- - comb_cd: validphys.coredata.CommonData - The combined ``CommonData`` - - Raises - ------ - ValueError: if one or more of the input ``CommonData`` objects have a different - number of kinematic variables - - Example - ------- - >>> from validphys.commondataparser import combine_commondata, load_commondata - >>> from validphys.loader import Loader - >>> l = Loader() - >>> cd1 = l.check_commondata("ATLASLOMASSDY11EXT") - >>> cd2 = l.check_commondata("ATLASZHIGHMASS49FB") - >>> ld1, ld2 = map(load_commondata, (cd1, cd2)) - >>> comb_cd = combine_commondata((ld1, ld2)) - # Note that the two datasets have one pair of correlated systematics - # so the total number of systematics is one fewer than the sum - >>> comb_cd.nsys - 18 - >>> ld1.nsys + ld2.nsys - 19 - >>> from validphys.covmats import covmat_from_systematics - >>> covmat = covmat_from_systematics(comb_cd) - >>> covmat.shape - (19, 19) - >>> ld1.ndata + ld2.ndata - 19 - """ - # Raise an exception if any of the CommonDatas has - # a different number of kinematic variables to the others - # TODO: handle this case instead of raising an error - nkins = [cd.nkin for cd in commondata_list] - if not all(x == nkins[0] for x in nkins): - raise ValueError("One or more of the datasets have a different number of kinematics.") - - # Combine the systematics from each of the CommonDatas - systematics, systype_table = combine_commondata_systematics(commondata_list) - - # The number of data points is the sum of data points per CommonData while - # the systematics have correlations accounted for - ndata, nsys = systematics.shape - - header = ['process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - kinematics = pd.concat([i.commondata_table.loc[:, header] for i in commondata_list], ignore_index=True) - kinematics.index += 1 - - additive = systematics.applymap(lambda x: x.add) - multiplicative = 
systematics.applymap(lambda x: x.mult) - additive.columns = (f"sys.add.{i + 1}" for i in range(nsys)) - multiplicative.columns = (f"sys.mult.{i + 1}" for i in range(nsys)) - - # Table containing alternating additive and multiplicative systematic uncertainties - systematics_header = np.empty((additive.columns.size + multiplicative.columns.size,), dtype=object) - systematics_header[0::2], systematics_header[1::2] = additive.columns, multiplicative.columns - systematics_table = additive.join(multiplicative)[systematics_header] - - commondata_table = pd.concat([kinematics, systematics_table], axis=1) - - # The effective CommonData object to return. The set has no unique name or - # process type and so there is no file associated with this object. - comb_cd = CommonData("N/A", ndata, "MIXED", 3, nsys, commondata_table, systype_table) - - return comb_cd - - -def combine_commondata_systematics(commondata_list): - """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` - and returns a tuple of ``pd.DataFrame``. The first ``DataFrame`` contains - :py:class:`validphys.coredata.SystematicError` which has accounted for the correlations - between systematics while the second contains information about the systematic errors - themselves such as their type and name. - - Parameters - ---------- - commondata_list: sequence - Sequence of :py:class:`validphys.coredata.CommonData` objects - - Returns - ------- - eff_sys_errors, systype_table: (pd.DataFrame, pd.DataFrame) - Tuple of DataFrames containing systematic errors and information about their name and type - """ - special_types = frozenset({"UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR", "SKIP"}) - - # Create an effective systematic error table which is block diagonal - # where the off-diagonal blocks are filled with zeros as sentinel values. 
- eff_sys_errors = pd.DataFrame(la.block_diag(*[i.sys_errors for i in commondata_list])) - # Replace 0s with np.nan so we can use df.fillna later on - eff_sys_errors.replace(0, np.nan, inplace=True) - # Make index and columns start from 1 - eff_sys_errors.index += 1 - eff_sys_errors.columns += 1 - - systypes = pd.concat([cd.systype_table for cd in commondata_list], ignore_index=True) - systypes.index += 1 - - # If the systypes are duplicated and not one of the special - # types above, then those set of systematics are correlated - # and we keep the first occurence and replace each subsequent - # occurence with a sentinel value of None - duplicates = systypes.groupby("name").groups - # Remove the duplicates that are special types - [duplicates.pop(key, None) for key in special_types] - - for cols in duplicates.values(): - # The columns that were correlated - to_combine = eff_sys_errors.loc[:, cols] - # We backwards propagate the uncertainties - to_combine.fillna(method="bfill", axis=1, inplace=True) - # Replace the eff_sys_errors column with - # the systematics that were correlated - prepared_column = to_combine.iloc[:, 0] - eff_sys_errors.loc[:, cols[0]] = prepared_column - - # The zeros are replaced with validphys.coredata.SystematicError types, we label the - # name as "SKIP" so any future logic knows that these systematic errors should be ignored - dummy_error = SystematicError(0, 0, None, "SKIP") - eff_sys_errors.fillna(dummy_error, inplace=True) - - keep_columns = (systypes["name"].isin(special_types)) | ~( - systypes.duplicated(subset="name", keep="first") - ) - keep_columns.index = eff_sys_errors.columns - eff_sys_errors = eff_sys_errors.loc[:, keep_columns] - # After slicing with keep_columns, the index - # of the systematic errors need to be reset - eff_sys_errors.columns = range(1, len(eff_sys_errors.columns) + 1) - - systype_table = systypes[keep_columns] - systype_table.index = range(1, len(eff_sys_errors.columns) + 1) - - return eff_sys_errors, 
systype_table diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 6126a14f34..03a8214da0 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -24,86 +24,91 @@ log = logging.getLogger(__name__) -def covmat_from_systematics(commondata): - """Function that takes in a :py:class:`validphys.coredata.CommonData` object - and generates the associated covariance matrix using the systematics. The logic - is best described by the now deprecated C++ code: - - .. code-block:: c++ - - auto CovMat = NNPDF::matrix(ndat, ndat); - - for (int i = 0; i < ndat; i++) - { - for (int j = 0; j < ndat; j++) - { - double sig = 0.0; - double signor = 0.0; - - // Statistical error - if (i == j) - sig += pow(stat_error[i],2); - - for (int l = 0; l < nsys; l++) - { - sysError const& isys = systematic_errors[i][l]; - sysError const& jsys = systematic_errors[j][l]; - if (isys.name != jsys.name) - throw RuntimeException("ComputeCovMat", "Inconsistent naming of systematics"); - if (isys.name == "SKIP") - continue; // Continue if systype is skipped - if ((isys.name == "THEORYCORR" || isys.name == "THEORYUNCORR") && !use_theory_errors) - continue; // Continue if systype is theoretical and use_theory_errors == false - const bool is_correlated = ( isys.name != "UNCORR" && isys.name !="THEORYUNCORR"); - if (i == j || is_correlated) - switch (isys.type) - { - case ADD: sig += isys.add *jsys.add; break; - case MULT: if (mult_errors) { signor += isys.mult*jsys.mult; break; } - else { continue; } - case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); - } - } - - // Covariance matrix entry - CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); - // Covariance matrix weight - CovMat(i, j) /= sqrt_weights[i]*sqrt_weights[j]; - } - } - - return CovMat; - } - - We see that for the diagonal elements of the covariance matrix, we must add the squared - statistical error. 
This is done by creating a diagonal matrix from the vector of statistical - errors, called ``stat_mat``, and squaring the matrix before the return line. +def split_systematics(commondata): + """Take the systematics table from a commondata and remove any systematics + which have name "SKIP". Next transform the systematics into the correct units. + Additive (ADD) systematics are left unchanged, whereas multiplicative (MULT) + systematics need to be converted from a percentage into an uncertainty + by multiplying by the central value and dividing by 100. - We now pick datapoint i and loop over all other datapoints j. For each such pair of points, - loop over all the systematics, if the l'th systematic of data point i has name ``SKIP`` we ignore - it. This is handled by scanning over all sytematic errors in the ``sys_errors`` dataframe and dropping - any columns which correspond to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe - defined below contains only the systematics without the ``SKIP`` name. + Finally the systematics are split into the different components which + are treated differently when constructing the covariance matrix. - Note that in the switch statement an ADDitive or MULTiplicative systype of systematic i is handled - by either multiplying the additive or multiplicative uncertainties respectively. We instead opt to - create a purely additive systematic table where all elements are to be understood as additive - systematics and systematics of type MULT have been correctly transformed to their additive - values. This dataframe we call ``additive_mat``. + Returns a tuple of the statistical uncertainty and 5 possible groups + of systematics uncertainties: UNCORR, CORR, THEORYUNCORR, THEORYCORR and + inter-dataset systematics - Next we notice that a systematic is correlated if its name is neither ``UNCORR`` nor ``THEORYUNCORR``. 
We - create a dataframe called ``correlated_mat`` where any column corresponding to an ``UNCORR`` or - ``THEORYUNCORR`` systematic is set to 0. + Returns + ------- + uncertainties: tuple + stat: np.array + 1-D array containing statistical uncertainties + uncorrelated: np.2darray + numpy array of uncorrelated systematics, can be empty if there + are no uncorrelated systematics. These are semantically similar + to statistical error, and only contribute the diagonal component + of the covariance matrix, because they are uncorrelated across + data points. + correlated: np.2darray + Similar to uncorrelated except they can be correlated across + data points. + theory_uncorr: np.2darray + numpy array of uncorrelated "theory" uncertainties which + are not to be confused with theory covariance uncertainties. + Instead they are, for example, uncertainties in the c-factors + theory_corr: np.2darray + same as theory_uncorr except they can be correlated across + data points. + special_corr: pd.DataFrame + a dataframe of the systematics which can be correlated across + datasets, the columns of the dataframe are the systematic names. + Each systematic of this type has a unique name, which is used + to correlated these systematics across datasets. - From here it's a matter of staring at a piece of paper for a while to realise the contribution to the - covariance matrix arising due to correlated systematics is ``correlated_mat @ additive_mat.T``. Note - that in the case where ``i == j`` is met, we double count this contribution using this method, so we set - the diagonal elements of this particular contribution to 0 by using ``np.fill_diagonal``. 
+ """ + systype = commondata.systype_table["name"].values + # Dropping systematics that have type SKIP + sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] + systype = systype[systype != "SKIP"] - Finally, the ``i == j`` case is handled by summing the square row elements of ``additive_mat``, this is - done by using the ``np.einsum`` function. + # Diagonal matrix containing the statistical uncertainty for each + # data point + stat = commondata.stat_errors.values - For more information on the generation of the covariance matrix see the `paper `_ + # Systematic uncertainties converted to proper units (additives are left + # unchanged, multiplicative uncertainties are in percentage format so get + # multiplied by central value and divided by 100) + sys_mat = sys_errors.apply( + lambda x: [ + i.add if i.sys_type == "ADD" else (i.mult * j / 100) + for i, j in zip(x, commondata.central_values) + ] + ) + # set columns for special_corr errors + sys_mat.columns = systype + + special = ["THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR"] + + special_corr = sys_mat.loc[:, ~np.isin(systype, special)] + thunc = sys_mat.loc[:, systype == "THEORYUNCORR"].values + unc = sys_mat.loc[:, systype == "UNCORR"].values + thcorr = sys_mat.loc[:, systype == "THEORYCORR"].values + corr = sys_mat.loc[:, systype == "CORR"].values + return stat, unc, corr, thunc, thcorr, special_corr + + +def covmat_from_systematics(commondata, use_theory_errors=True): + """Taking the tuple of statistical uncertainty and systematics returned by + :py:meth:`split_systematics` and construct the covariance matrix. uncorrelated + contributions from ``stat``, ``uncorrelated`` and ``theory_uncorr`` are + added in quadrature + to the diagonal of the covmat. The correlated systematics ``correlated``, + ``theory_corr`` and ``special_corr`` are multiplied by their transpose + to give contributions to both the diagonal and off diagonal components of + the covmat. 
+ + For more information on the generation of the covariance matrix see the + `paper `_ outlining the procedure, specifically equation 2 and surrounding text. Paramaters @@ -111,6 +116,8 @@ def covmat_from_systematics(commondata): commondata : validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. + use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat Returns ------- @@ -141,48 +148,63 @@ def covmat_from_systematics(commondata): [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ..., 4.14126235e-05, 4.15843357e-05, 1.43824457e-04]]) """ - systype = commondata.systype_table["name"] - # Dropping systematics that have type SKIP - sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] + stat, unc, corr, thunc, thcorr, special_corr = split_systematics(commondata) - # Diagonal matrix containing the statistical uncertainty for each - # data point - stat_mat = np.diag(commondata.stat_errors.to_numpy()) - - # Systematic uncertainties converted to additive uncertainties - additive_mat = sys_errors.apply( - lambda x: [ - i.add if i.sys_type == "ADD" else (i.mult * j / 100) - for i, j in zip(x, commondata.central_values) - ] + special_vals = special_corr.values + cov_mat = ( + np.diag( + stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors + ) + + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + + special_vals @ special_vals.T + ) ) + return cov_mat - # Matrix where all non-zero values are due to correlated systematics - correlated_mat = additive_mat.copy() - # If all systypes were SKIP then we'd have an empty df - if not correlated_mat.empty: - correlated_mat.loc[:, (systype == "UNCORR") | (systype == "THEORYUNCORR")] = 0 - correlated_mat = correlated_mat.to_numpy() - additive_mat = additive_mat.to_numpy() - else: - # Some datasets have zero systematic uncertainties. 
- # We treat this special edge case by creating - # a matrix of zeros so it can be broadcast - # with the statistical error matrix at the end - correlated_mat = additive_mat = np.zeros_like(stat_mat) +def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=True): + """Given a list of commondata, construct the full covariance matrix. - # Avoid double counting in case that the i = j element is a correlated - # systematic. We take care of this case in the einsum below - off_diagonals = correlated_mat @ additive_mat.T - np.fill_diagonal(off_diagonals, 0) + This is similar to :py:meth:`covmat_from_systematics` + except that ``special_corr`` systematics are concatenated across all datasets + before it is multiplied by its transpose to give off block-diagonal + contributions. The other systematics contribute to the block diagonal in the + same way as :py:meth:`covmat_from_systematics`. - cov_mat = ( - stat_mat ** 2 - + off_diagonals - + np.diag(np.einsum("ij, ij -> i", additive_mat, additive_mat)) - ) - return cov_mat + Paramaters + ---------- + list_of_commondata : list[validphys.coredata.CommonData] + list of CommonData objects. + use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat + + Returns + ------- + cov_mat : np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. 
+ """ + special_corrs = [] + block_diags = [] + for cd in list_of_commondata: + stat, unc, corr, thunc, thcorr, special_corr = split_systematics(cd) + special_corrs.append(special_corr) + diag_covmat = ( + np.diag( + stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors + ) + + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + ) + ) + block_diags.append(diag_covmat) + special_sys = pd.concat(special_corrs, axis=0, sort=False) + special_sys.fillna(0, inplace=True) + diag = la.block_diag(*block_diags) + return diag + special_sys.values @ special_sys.values.T def sqrt_covmat(covariance_matrix): """Function that computes the square root of the covariance matrix. diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 9266a502b8..916e9805a5 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -12,10 +12,12 @@ from validphys.api import API -from validphys.commondataparser import combine_commondata, load_commondata -from validphys.covmats import covmat_from_systematics, sqrt_covmat +from validphys.commondataparser import load_commondata +from validphys.covmats import ( + sqrt_covmat, + commondatas_covmat_from_systematics +) from validphys.loader import Loader -from validphys.tableloader import parse_exp_mat from validphys.tests.conftest import THEORYID REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") @@ -36,8 +38,7 @@ def test_covmat_from_systematics_correlated(): cds = list(map(l.check_commondata, correlated_datasets)) ld_cds = list(map(load_commondata, cds)) - combo_cd = combine_commondata(ld_cds) - covmat = covmat_from_systematics(combo_cd) + covmat = commondatas_covmat_from_systematics(ld_cds) dss = [ l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets @@ -45,7 +46,7 @@ def test_covmat_from_systematics_correlated(): exp = l.check_experiment("null", dss) ld_exp = 
exp.load() - assert np.allclose(ld_exp.get_covmat(), covmat) + np.testing.assert_allclose(ld_exp.get_covmat(), covmat) def test_covmat_from_systematics(data_config): @@ -64,9 +65,8 @@ def test_covmat_from_systematics(data_config): for cd in cds ] cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) - combo_cd = combine_commondata(cut_ld_cds) - covmat = covmat_from_systematics(combo_cd) + covmat = commondatas_covmat_from_systematics(cut_ld_cds) dss = [ l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") From 98fccc19818b5a8394236151d313abf9b76e9388 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 21 Oct 2020 14:17:03 +0100 Subject: [PATCH 2/4] Update names and docstrings of covmat construction functions --- validphys2/src/validphys/covmats.py | 139 ++++++++++-------- .../src/validphys/tests/test_covmats.py | 6 +- 2 files changed, 79 insertions(+), 66 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 03a8214da0..c2da1a8859 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -23,20 +23,29 @@ log = logging.getLogger(__name__) +INTRA_DATASET_SYS_NAME = ("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR") -def split_systematics(commondata): - """Take the systematics table from a commondata and remove any systematics - which have name "SKIP". Next transform the systematics into the correct units. - Additive (ADD) systematics are left unchanged, whereas multiplicative (MULT) - systematics need to be converted from a percentage into an uncertainty - by multiplying by the central value and dividing by 100. - Finally the systematics are split into the different components which - are treated differently when constructing the covariance matrix. 
+def split_uncertainties(commondata): + """Take the statistical uncertainty and systematics table from ``commondata`` + and split into the different different uncertainty archetypes each of + which may be handled differently by future operations, such as constructing + the covariance matrix. - Returns a tuple of the statistical uncertainty and 5 possible groups - of systematics uncertainties: UNCORR, CORR, THEORYUNCORR, THEORYCORR and - inter-dataset systematics + First we remove any systematics which have name "SKIP" because, as the name + implies, these systematics will be skipped. + + Next the systematics are transformed into the correct units: + + - Additive (ADD) systematics are left unchanged + - multiplicative (MULT) systematics need to be converted from a + percentage into an uncertainty by multiplying by the central value + and dividing by 100. + + Finally, the systematics are split into the five possible archetypes + of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), + theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and + special_correlated systematics Returns ------- @@ -52,58 +61,64 @@ def split_systematics(commondata): correlated: np.2darray Similar to uncorrelated except they can be correlated across data points. - theory_uncorr: np.2darray + theory_uncorrelated: np.2darray numpy array of uncorrelated "theory" uncertainties which are not to be confused with theory covariance uncertainties. Instead they are, for example, uncertainties in the c-factors - theory_corr: np.2darray - same as theory_uncorr except they can be correlated across + theory_correlated: np.2darray + same as theory_uncorrelated except they can be correlated across data points. - special_corr: pd.DataFrame - a dataframe of the systematics which can be correlated across + special_correlated: pd.DataFrame + Dataframe of the systematics which can be correlated across datasets, the columns of the dataframe are the systematic names. 
- Each systematic of this type has a unique name, which is used - to correlated these systematics across datasets. + Each systematic of this type has a unique name, by returning + these systematics as a dataframe with the unique names as column + headers, we can easily join up the special_correlated systematics + from multiple datasets, retaining correlations, using + ``pandas.concat``. """ - systype = commondata.systype_table["name"].values + sys_name = commondata.systype_table["name"].to_numpy() # Dropping systematics that have type SKIP - sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] - systype = systype[systype != "SKIP"] + sys_errors_df = commondata.sys_errors.loc[:, sys_name != "SKIP"] + sys_name = sys_name[sys_name != "SKIP"] # Diagonal matrix containing the statistical uncertainty for each # data point - stat = commondata.stat_errors.values + stat = commondata.stat_errors.to_numpy() - # Systematic uncertainties converted to proper units (additives are left - # unchanged, multiplicative uncertainties are in percentage format so get - # multiplied by central value and divided by 100) - sys_mat = sys_errors.apply( + # Systematic uncertainties converted to absolute uncertainties (additives + # are left unchanged, multiplicative uncertainties are in percentage format + # so get multiplied by central value and divided by 100). 
+ abs_sys_errors_df = sys_errors_df.apply( lambda x: [ i.add if i.sys_type == "ADD" else (i.mult * j / 100) for i, j in zip(x, commondata.central_values) ] ) - # set columns for special_corr errors - sys_mat.columns = systype + abs_sys_errors = abs_sys_errors_df.to_numpy() - special = ["THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR"] + # set columns for special_correlated errors + abs_sys_errors_df.columns = sys_name + special_corr = abs_sys_errors_df.loc[:, ~np.isin(sys_name, INTRA_DATASET_SYS_NAME)] - special_corr = sys_mat.loc[:, ~np.isin(systype, special)] - thunc = sys_mat.loc[:, systype == "THEORYUNCORR"].values - unc = sys_mat.loc[:, systype == "UNCORR"].values - thcorr = sys_mat.loc[:, systype == "THEORYCORR"].values - corr = sys_mat.loc[:, systype == "CORR"].values + thunc = abs_sys_errors[:, sys_name == "THEORYUNCORR"] + unc = abs_sys_errors[:, sys_name == "UNCORR"] + thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"] + corr = abs_sys_errors[:, sys_name == "CORR"] return stat, unc, corr, thunc, thcorr, special_corr def covmat_from_systematics(commondata, use_theory_errors=True): - """Taking the tuple of statistical uncertainty and systematics returned by - :py:meth:`split_systematics` and construct the covariance matrix. uncorrelated - contributions from ``stat``, ``uncorrelated`` and ``theory_uncorr`` are - added in quadrature - to the diagonal of the covmat. The correlated systematics ``correlated``, - ``theory_corr`` and ``special_corr`` are multiplied by their transpose + """Given a single ``commondata``, obtain the tuple of statistical + uncertainty and systematics from :py:meth:`split_uncertainties` and + construct the covariance matrix. + Uncorrelated contributions from: ``stat``, ``uncorrelated`` and + ``theory_uncorrelated`` are added in quadrature + to the diagonal of the covmat. 
+ + The correlated systematics: ``correlated``, + ``theory_correlated`` and ``special_correlated`` are multiplied by their transpose to give contributions to both the diagonal and off diagonal components of the covmat. @@ -148,27 +163,27 @@ def covmat_from_systematics(commondata, use_theory_errors=True): [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ..., 4.14126235e-05, 4.15843357e-05, 1.43824457e-04]]) """ - stat, unc, corr, thunc, thcorr, special_corr = split_systematics(commondata) + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(commondata) special_vals = special_corr.values - cov_mat = ( - np.diag( - stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors - ) - + ( - corr @ corr.T + - thcorr @ thcorr.T * use_theory_errors + - special_vals @ special_vals.T - ) + cov_mat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + + special_vals @ special_vals.T ) return cov_mat -def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=True): + +def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True): """Given a list of commondata, construct the full covariance matrix. This is similar to :py:meth:`covmat_from_systematics` - except that ``special_corr`` systematics are concatenated across all datasets - before it is multiplied by its transpose to give off block-diagonal + except that ``special_corr`` is concatenated across all datasets + before being multiplied by its transpose to give off block-diagonal contributions. The other systematics contribute to the block diagonal in the same way as :py:meth:`covmat_from_systematics`. 
@@ -189,23 +204,21 @@ def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=Tr special_corrs = [] block_diags = [] for cd in list_of_commondata: - stat, unc, corr, thunc, thcorr, special_corr = split_systematics(cd) + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(cd) special_corrs.append(special_corr) - diag_covmat = ( - np.diag( - stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors - ) - + ( - corr @ corr.T + - thcorr @ thcorr.T * use_theory_errors - ) - ) + diag_covmat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + (corr @ corr.T + thcorr @ thcorr.T * use_theory_errors) block_diags.append(diag_covmat) special_sys = pd.concat(special_corrs, axis=0, sort=False) + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. special_sys.fillna(0, inplace=True) diag = la.block_diag(*block_diags) return diag + special_sys.values @ special_sys.values.T + def sqrt_covmat(covariance_matrix): """Function that computes the square root of the covariance matrix. 
diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 916e9805a5..4ca16649a9 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -15,7 +15,7 @@ from validphys.commondataparser import load_commondata from validphys.covmats import ( sqrt_covmat, - commondatas_covmat_from_systematics + datasets_covmat_from_systematics ) from validphys.loader import Loader from validphys.tests.conftest import THEORYID @@ -38,7 +38,7 @@ def test_covmat_from_systematics_correlated(): cds = list(map(l.check_commondata, correlated_datasets)) ld_cds = list(map(load_commondata, cds)) - covmat = commondatas_covmat_from_systematics(ld_cds) + covmat = datasets_covmat_from_systematics(ld_cds) dss = [ l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets @@ -66,7 +66,7 @@ def test_covmat_from_systematics(data_config): ] cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) - covmat = commondatas_covmat_from_systematics(cut_ld_cds) + covmat = datasets_covmat_from_systematics(cut_ld_cds) dss = [ l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") From b601377269cb560aa319cf92c5783a18cb274435 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 23 Oct 2020 00:29:15 +0100 Subject: [PATCH 3/4] Add back the docstring from shayan editing to make sense with slight reshuffling of functions --- validphys2/src/validphys/covmats.py | 82 ++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index c2da1a8859..5dd1dc2e8b 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -27,25 +27,73 @@ def split_uncertainties(commondata): - """Take the statistical uncertainty and systematics table from ``commondata`` + """Take the statistical uncertainty and systematics table from + a 
:py:class:`validphys.coredata.CommonData` object and split into the different different uncertainty archetypes each of which may be handled differently by future operations, such as constructing - the covariance matrix. + the covariance matrix. The logic of the different archetypes + is best described by the now deprecated C++ code: + + .. code-block:: c++ + auto CovMat = NNPDF::matrix(ndat, ndat); + for (int i = 0; i < ndat; i++) + { + for (int j = 0; j < ndat; j++) + { + double sig = 0.0; + double signor = 0.0; + // Statistical error + if (i == j) + sig += pow(stat_error[i],2); + for (int l = 0; l < nsys; l++) + { + sysError const& isys = systematic_errors[i][l]; + sysError const& jsys = systematic_errors[j][l]; + if (isys.name != jsys.name) + throw RuntimeException("ComputeCovMat", "Inconsistent naming of systematics"); + if (isys.name == "SKIP") + continue; // Continue if systype is skipped + if ((isys.name == "THEORYCORR" || isys.name == "THEORYUNCORR") && !use_theory_errors) + continue; // Continue if systype is theoretical and use_theory_errors == false + const bool is_correlated = ( isys.name != "UNCORR" && isys.name !="THEORYUNCORR"); + if (i == j || is_correlated) + switch (isys.type) + { + case ADD: sig += isys.add *jsys.add; break; + case MULT: if (mult_errors) { signor += isys.mult*jsys.mult; break; } + else { continue; } + case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); + } + } + // Covariance matrix entry + CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); + // Covariance matrix weight + CovMat(i, j) /= sqrt_weights[i]*sqrt_weights[j]; + } + } - First we remove any systematics which have name "SKIP" because, as the name - implies, these systematics will be skipped. + return CovMat; + } - Next the systematics are transformed into the correct units: + if the systematic of data point i has name ``SKIP`` we ignore + it. 
This is handled by scanning over all systematic errors in the + ``sys_errors`` dataframe and dropping any columns which correspond + to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe + defined below contains only the systematics without the ``SKIP`` name. + Note that in the switch statement an ADDitive or MULTiplicative systype + is handled by either multiplying the additive or multiplicative + uncertainties respectively. We convert uncertainties so that they are all + absolute: - Additive (ADD) systematics are left unchanged - multiplicative (MULT) systematics need to be converted from a - percentage into an uncertainty by multiplying by the central value + percentage by multiplying by the central value and dividing by 100. Finally, the systematics are split into the five possible archetypes of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and - special_correlated systematics + special_correlated systematics. Returns ------- @@ -110,17 +158,22 @@ def split_uncertainties(commondata): def covmat_from_systematics(commondata, use_theory_errors=True): - """Given a single ``commondata``, obtain the tuple of statistical - uncertainty and systematics from :py:meth:`split_uncertainties` and + """Given a single :py:class:`validphys.coredata.CommonData`, obtain the + tuple of statistical uncertainty and systematics from + :py:meth:`split_uncertainties` and construct the covariance matrix. + Uncorrelated contributions from: ``stat``, ``uncorrelated`` and ``theory_uncorrelated`` are added in quadrature to the diagonal of the covmat. - The correlated systematics: ``correlated``, - ``theory_correlated`` and ``special_correlated`` are multiplied by their transpose - to give contributions to both the diagonal and off diagonal components of - the covmat. 
+ From here it's a matter of staring at a piece of paper for a while to + realise the contribution to the covariance matrix arising due to + correlated systematics is schematically ``A_correlated @ A_correlated.T``, + where A_correlated is a matrix N_dat by N_sys. The total contribution + from correlated systematics is found by adding together the result of + multiplying each correlated systematic matrix by its transpose + (``correlated``, ``theory_correlated`` and ``special_correlated``). For more information on the generation of the covariance matrix see the `paper `_ @@ -179,7 +232,8 @@ def covmat_from_systematics(commondata, use_theory_errors=True): def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True): - """Given a list of commondata, construct the full covariance matrix. + """Given a list containing :py:class:`validphys.coredata.CommonData` s, + construct the full covariance matrix. This is similar to :py:meth:`covmat_from_systematics` except that ``special_corr`` is concatenated across all datasets From d8fb0437994a15b426e329daef72ea39c1f2934d Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 23 Oct 2020 00:33:38 +0100 Subject: [PATCH 4/4] undo erroneous removal of blank lines --- validphys2/src/validphys/covmats.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 5dd1dc2e8b..0de87dc2ec 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -35,16 +35,20 @@ def split_uncertainties(commondata): is best described by the now deprecated C++ code: ..
code-block:: c++ + auto CovMat = NNPDF::matrix(ndat, ndat); + for (int i = 0; i < ndat; i++) { for (int j = 0; j < ndat; j++) { double sig = 0.0; double signor = 0.0; + // Statistical error if (i == j) sig += pow(stat_error[i],2); + for (int l = 0; l < nsys; l++) { sysError const& isys = systematic_errors[i][l]; @@ -65,6 +69,7 @@ def split_uncertainties(commondata): case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); } } + // Covariance matrix entry CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); // Covariance matrix weight