From 653791c1ee15501c910a4692f1bdbc30a72e1bdb Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 7 Oct 2020 17:20:06 +0100 Subject: [PATCH 01/15] Combining list of commondatas And accounting for correlations --- validphys2/src/validphys/commondataparser.py | 84 +++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index c1f95bb07b..a1641bd847 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -7,11 +7,14 @@ serves as a proof of concept. """ from operator import attrgetter +import toolz +import numpy as np import pandas as pd +import scipy.linalg as la from validphys.core import peek_commondata_metadata -from validphys.coredata import CommonData +from validphys.coredata import CommonData, SystematicError def load_commondata(spec): """ @@ -90,3 +93,82 @@ def parse_systypes(systypefile): systypetable.set_index("sys_index", inplace=True) return systypetable + + +def combine_commondata(commondata_list): + # TODO: raise exception if nkin don't match + # Combine the systematics from each of the CommonDatas + systematics, systype_table = combine_commondata_systematics(commondata_list) + + # The number of data points is the sum of data points per CommonData while + # the systematics have correlations accounted for + ndata, nsys = systematics.shape + + header = ['process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] + kinematics = pd.concat([i.commondata_table.loc[:, header] for i in commondata_list], ignore_index=True) + kinematics.index += 1 + + additive = systematics.applymap(lambda x: x.add) + multiplicative = systematics.applymap(lambda x: x.mult) + additive.columns = (f"sys.add.{i + 1}" for i in range(nsys)) + multiplicative.columns = (f"sys.mult.{i + 1}" for i in range(nsys)) + + # Table containing alternating additive and multiplicative systematic uncertainties + systematics_table = 
pd.concat([additive, multiplicative], axis=1)[list(toolz.interleave([additive, multiplicative]))] + + commondata_table = pd.concat([kinematics, systematics_table], axis=1) + + # The effective CommonData object to return. The set has no unique name or + # process type and so there is no file associated with this object. + cd = CommonData("N/A", ndata, "MIXED", 3, nsys, commondata_table, systype_table) + + return cd + + +def combine_commondata_systematics(commondata_list): + special_types = frozenset({"UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR", "SKIP"}) + + # Create an effective systematic error table which is block diagonal + # where the off-diagonal blocks are filled with zeros as sentinel values. + eff_sys_errors = pd.DataFrame(la.block_diag(*[i.sys_errors for i in commondata_list])) + # Replace 0s with np.nan so we can use df.fillna later on + eff_sys_errors.replace(0, np.nan, inplace=True) + # Make index and columns start from 1 + eff_sys_errors.index += 1 + eff_sys_errors.columns += 1 + + systypes = pd.concat([cd.systype_table for cd in commondata_list], ignore_index=True) + systypes.index += 1 + + # If the systypes are duplicated and not one of the special + # types above, then those set of systematics are correlated + # and we keep the first occurence and replace each subsequent + # occurence with a sentinel value of None + duplicates = systypes.groupby("name").groups + # Remove the duplicates that are special types + [duplicates.pop(key, None) for key in special_types] + + for cols in duplicates.values(): + # The columns that were correlated + to_combine = eff_sys_errors.loc[:, cols] + # We backwards propagate the uncertainties + to_combine.fillna(method="bfill", axis=1, inplace=True) + # Replace the eff_sys_errors column with + # the systematics that were correlated + prepared_column = to_combine.iloc[:, 0] + eff_sys_errors.loc[:, cols[0]] = prepared_column + + # The zeros are replaced with validphys.coredata.SystematicError types, we label the + # name 
as "SKIP" so any future logic knows that these systematic errors should be ignored + dummy_error = SystematicError(0, 0, None, "SKIP") + eff_sys_errors.fillna(dummy_error, inplace=True) + + keep_columns = (systypes["name"].isin(special_types)) | ~( + systypes.duplicated(subset="name", keep="first") + ) + keep_columns.index = eff_sys_errors.columns + eff_sys_errors = eff_sys_errors.loc[:, keep_columns] + + systype_table = systypes[keep_columns] + + return eff_sys_errors, systype_table From a1412f451a2ade92f9d770a0b5a9352c381b6040 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 8 Oct 2020 15:26:28 +0100 Subject: [PATCH 02/15] Removing toolz dependency --- validphys2/src/validphys/commondataparser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index a1641bd847..24df556cda 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -7,7 +7,6 @@ serves as a proof of concept. 
""" from operator import attrgetter -import toolz import numpy as np import pandas as pd @@ -114,7 +113,9 @@ def combine_commondata(commondata_list): multiplicative.columns = (f"sys.mult.{i + 1}" for i in range(nsys)) # Table containing alternating additive and multiplicative systematic uncertainties - systematics_table = pd.concat([additive, multiplicative], axis=1)[list(toolz.interleave([additive, multiplicative]))] + systematics_header = np.empty((additive.columns.size + multiplicative.columns.size,), dtype=object) + systematics_header[0::2], systematics_header[1::2] = additive.columns, multiplicative.columns + systematics_table = additive.join(multiplicative)[systematics_header] commondata_table = pd.concat([kinematics, systematics_table], axis=1) From c69a9aad826ecd71f4622fd9b059d9d22f026780 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 8 Oct 2020 15:50:45 +0100 Subject: [PATCH 03/15] Resetting labels for systematics after correlating --- validphys2/src/validphys/commondataparser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 24df556cda..2ba580f710 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -169,7 +169,11 @@ def combine_commondata_systematics(commondata_list): ) keep_columns.index = eff_sys_errors.columns eff_sys_errors = eff_sys_errors.loc[:, keep_columns] + # After slicing with keep_columns, the index + # of the systematic errors need to be reset + eff_sys_errors.columns = range(1, len(eff_sys_errors.columns) + 1) systype_table = systypes[keep_columns] + systype_table.index = range(1, len(eff_sys_errors.columns) + 1) return eff_sys_errors, systype_table From 2c78e7b206fb1090f9d9445decf01c122fe627c7 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 9 Oct 2020 10:42:21 +0100 Subject: [PATCH 04/15] Raise exception If the number of kins are not equal then raise ValueError --- 
validphys2/src/validphys/commondataparser.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 2ba580f710..5e3266ee0d 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -95,7 +95,13 @@ def parse_systypes(systypefile): def combine_commondata(commondata_list): - # TODO: raise exception if nkin don't match + # Raise an exception if any of the CommonDatas has + # a different number of kinematic variables to the others + # TODO: handle this case instead of raising an error + nkins = [cd.nkin for cd in commondata_list] + if not all(x == nkins[0] for x in nkins): + raise ValueError("One or more of the datasets have a different number of kinematics.") + # Combine the systematics from each of the CommonDatas systematics, systype_table = combine_commondata_systematics(commondata_list) From 1221d9386a7dcb07da425d7fa98676023d1e9828 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 9 Oct 2020 12:35:28 +0100 Subject: [PATCH 05/15] Adding docstrings --- validphys2/src/validphys/commondataparser.py | 68 +++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 5e3266ee0d..2a9538c2ba 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -95,6 +95,54 @@ def parse_systypes(systypefile): def combine_commondata(commondata_list): + """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` + objects and returns an effective :py:class:`validphys.coredata.CommonData` object + which has taken into account the (possible) correlations between the datasets. This + effective ``CommonData`` can then be used to obtain a correlated covariance matrix, + for example. + + .. 
note:: + The return of this function does not have a file on disk associated with it. + As such the ``CommonData`` instance has a ``setname`` of ``"N/A"`` and a + ``processtype`` of ``"MIXED"``. + + Parameters + ---------- + commondata_list: sequence + Sequence of :py:class:`validphys.coredata.CommonData` objects + + Returns + ------- + comb_cd: validphys.coredata.CommonData + The combined ``CommonData`` + + Raises + ------ + ValueError: if one or more of the input ``CommonData`` objects have a different + number of kinematic variables + + Example + ------- + >>> from validphys.commondataparser import combine_commondata, load_commondata + >>> from validphys.loader import Loader + >>> l = Loader() + >>> cd1 = l.check_commondata("ATLASLOMASSDY11EXT") + >>> cd2 = l.check_commondata("ATLASZHIGHMASS49FB") + >>> ld1, ld2 = map(load_commondata, (cd1, cd2)) + >>> comb_cd = combine_commondata((ld1, ld2)) + # Note that the two datasets have one pair of correlated systematics + # so the total number of systematics is one fewer than the sum + >>> comb_cd.nsys + 18 + >>> ld1.nsys + ld2.nsys + 19 + >>> from validphys.covmats import covmat_from_systematics + >>> covmat = covmat_from_systematics(comb_cd) + >>> covmat.shape + (19, 19) + >>> ld1.ndata + ld2.ndata + 19 + """ # Raise an exception if any of the CommonDatas has # a different number of kinematic variables to the others # TODO: handle this case instead of raising an error @@ -127,12 +175,28 @@ def combine_commondata(commondata_list): # The effective CommonData object to return. The set has no unique name or # process type and so there is no file associated with this object. 
- cd = CommonData("N/A", ndata, "MIXED", 3, nsys, commondata_table, systype_table) + comb_cd = CommonData("N/A", ndata, "MIXED", 3, nsys, commondata_table, systype_table) - return cd + return comb_cd def combine_commondata_systematics(commondata_list): + """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` + and returns a tuple of ``pd.DataFrame``. The first ``DataFrame`` contains + :py:class:`validphys.coredata.SystematicError` which has accounted for the correlations + between systematics while the second contains information about the systematic errors + themselves such as their type and name. + + Parameters + ---------- + commondata_list: sequence + Sequence of :py:class:`validphys.coredata.CommonData` objects + + Returns + ------- + eff_sys_errors, systype_table: (pd.DataFrame, pd.DataFrame) + Tuple of DataFrames containing systematic errors and information about their name and type + """ special_types = frozenset({"UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR", "SKIP"}) # Create an effective systematic error table which is block diagonal From ddac5efdf6b2aa5cd7b5a38904b31e7a4365f2d3 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 9 Oct 2020 17:00:07 +0100 Subject: [PATCH 06/15] Adding CI tests --- .../src/validphys/tests/test_covmats.py | 70 ++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 942eabc871..9266a502b8 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -3,15 +3,81 @@ Tests related to the computation of the covariance matrix and its derivatives """ +import pathlib + import pytest import numpy as np import random + +from validphys.api import API +from validphys.commondataparser import combine_commondata, load_commondata +from validphys.covmats import covmat_from_systematics, sqrt_covmat from validphys.loader import Loader 
+from validphys.tableloader import parse_exp_mat from validphys.tests.conftest import THEORYID -from validphys.api import API -from validphys.covmats import sqrt_covmat + +REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") + + +def test_covmat_from_systematics_correlated(): + """Test the covariance matrix generation from a set of correlated datasets + given their systematic errors + """ + l = Loader() + correlated_datasets = [ + "ATLASWZRAP36PB", + "ATLASZHIGHMASS49FB", + "ATLASLOMASSDY11EXT", + "ATLASWZRAP11", + ] + + cds = list(map(l.check_commondata, correlated_datasets)) + ld_cds = list(map(load_commondata, cds)) + + combo_cd = combine_commondata(ld_cds) + covmat = covmat_from_systematics(combo_cd) + + dss = [ + l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets + ] + exp = l.check_experiment("null", dss) + ld_exp = exp.load() + + assert np.allclose(ld_exp.get_covmat(), covmat) + + +def test_covmat_from_systematics(data_config): + """Test which checks the python computation of the covmat relating to a + collection of datasets matches that of the C++ computation. Note that the + datasets are cut using the internal rules, but the datasets are not correlated. 
+ """ + l = Loader() + exps = API.experiments(**data_config) + + cds = [l.check_commondata(i.name) for exp in exps for i in exp.datasets] + ld_cds = list(map(load_commondata, cds)) + + internal_cuts = [ + l.check_internal_cuts(cd, API.rules(theoryid=THEORYID, use_cuts="internal")) + for cd in cds + ] + cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) + combo_cd = combine_commondata(cut_ld_cds) + + covmat = covmat_from_systematics(combo_cd) + + dss = [ + l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") + for exp in exps + for i in exp.datasets + ] + exp = l.check_experiment("null", dss) + ld = exp.load() + cpp_covmat = ld.get_covmat() + + assert np.allclose(cpp_covmat, covmat) def test_cpp_sqrtcovmat(): From 3b44fc85df9cac3a7de7780fdffe965a05226f5d Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 14 Oct 2020 19:08:12 +0100 Subject: [PATCH 07/15] propose way of constructing covmat of list of commondata objects, without creating fake commondata object of multiple datasets --- validphys2/src/validphys/commondataparser.py | 155 ----------- validphys2/src/validphys/covmats.py | 242 ++++++++++-------- .../src/validphys/tests/test_covmats.py | 16 +- 3 files changed, 140 insertions(+), 273 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 2a9538c2ba..7fcdb9644e 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -92,158 +92,3 @@ def parse_systypes(systypefile): systypetable.set_index("sys_index", inplace=True) return systypetable - - -def combine_commondata(commondata_list): - """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` - objects and returns an effective :py:class:`validphys.coredata.CommonData` object - which has taken into account the (possible) correlations between the datasets. 
This - effective ``CommonData`` can then be used to obtain a correlated covariance matrix, - for example. - - .. note:: - The return of this function does not have a file on disk associated with it. - As such the ``CommonData`` instance has a ``setname`` of ``"N/A"`` and a - ``processtype`` of ``"MIXED"``. - - Parameters - ---------- - commondata_list: sequence - Sequence of :py:class:`validphys.coredata.CommonData` objects - - Returns - ------- - comb_cd: validphys.coredata.CommonData - The combined ``CommonData`` - - Raises - ------ - ValueError: if one or more of the input ``CommonData`` objects have a different - number of kinematic variables - - Example - ------- - >>> from validphys.commondataparser import combine_commondata, load_commondata - >>> from validphys.loader import Loader - >>> l = Loader() - >>> cd1 = l.check_commondata("ATLASLOMASSDY11EXT") - >>> cd2 = l.check_commondata("ATLASZHIGHMASS49FB") - >>> ld1, ld2 = map(load_commondata, (cd1, cd2)) - >>> comb_cd = combine_commondata((ld1, ld2)) - # Note that the two datasets have one pair of correlated systematics - # so the total number of systematics is one fewer than the sum - >>> comb_cd.nsys - 18 - >>> ld1.nsys + ld2.nsys - 19 - >>> from validphys.covmats import covmat_from_systematics - >>> covmat = covmat_from_systematics(comb_cd) - >>> covmat.shape - (19, 19) - >>> ld1.ndata + ld2.ndata - 19 - """ - # Raise an exception if any of the CommonDatas has - # a different number of kinematic variables to the others - # TODO: handle this case instead of raising an error - nkins = [cd.nkin for cd in commondata_list] - if not all(x == nkins[0] for x in nkins): - raise ValueError("One or more of the datasets have a different number of kinematics.") - - # Combine the systematics from each of the CommonDatas - systematics, systype_table = combine_commondata_systematics(commondata_list) - - # The number of data points is the sum of data points per CommonData while - # the systematics have correlations 
accounted for - ndata, nsys = systematics.shape - - header = ['process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - kinematics = pd.concat([i.commondata_table.loc[:, header] for i in commondata_list], ignore_index=True) - kinematics.index += 1 - - additive = systematics.applymap(lambda x: x.add) - multiplicative = systematics.applymap(lambda x: x.mult) - additive.columns = (f"sys.add.{i + 1}" for i in range(nsys)) - multiplicative.columns = (f"sys.mult.{i + 1}" for i in range(nsys)) - - # Table containing alternating additive and multiplicative systematic uncertainties - systematics_header = np.empty((additive.columns.size + multiplicative.columns.size,), dtype=object) - systematics_header[0::2], systematics_header[1::2] = additive.columns, multiplicative.columns - systematics_table = additive.join(multiplicative)[systematics_header] - - commondata_table = pd.concat([kinematics, systematics_table], axis=1) - - # The effective CommonData object to return. The set has no unique name or - # process type and so there is no file associated with this object. - comb_cd = CommonData("N/A", ndata, "MIXED", 3, nsys, commondata_table, systype_table) - - return comb_cd - - -def combine_commondata_systematics(commondata_list): - """Function that takes in a sequence of :py:class:`validphys.coredata.CommonData` - and returns a tuple of ``pd.DataFrame``. The first ``DataFrame`` contains - :py:class:`validphys.coredata.SystematicError` which has accounted for the correlations - between systematics while the second contains information about the systematic errors - themselves such as their type and name. 
- - Parameters - ---------- - commondata_list: sequence - Sequence of :py:class:`validphys.coredata.CommonData` objects - - Returns - ------- - eff_sys_errors, systype_table: (pd.DataFrame, pd.DataFrame) - Tuple of DataFrames containing systematic errors and information about their name and type - """ - special_types = frozenset({"UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR", "SKIP"}) - - # Create an effective systematic error table which is block diagonal - # where the off-diagonal blocks are filled with zeros as sentinel values. - eff_sys_errors = pd.DataFrame(la.block_diag(*[i.sys_errors for i in commondata_list])) - # Replace 0s with np.nan so we can use df.fillna later on - eff_sys_errors.replace(0, np.nan, inplace=True) - # Make index and columns start from 1 - eff_sys_errors.index += 1 - eff_sys_errors.columns += 1 - - systypes = pd.concat([cd.systype_table for cd in commondata_list], ignore_index=True) - systypes.index += 1 - - # If the systypes are duplicated and not one of the special - # types above, then those set of systematics are correlated - # and we keep the first occurence and replace each subsequent - # occurence with a sentinel value of None - duplicates = systypes.groupby("name").groups - # Remove the duplicates that are special types - [duplicates.pop(key, None) for key in special_types] - - for cols in duplicates.values(): - # The columns that were correlated - to_combine = eff_sys_errors.loc[:, cols] - # We backwards propagate the uncertainties - to_combine.fillna(method="bfill", axis=1, inplace=True) - # Replace the eff_sys_errors column with - # the systematics that were correlated - prepared_column = to_combine.iloc[:, 0] - eff_sys_errors.loc[:, cols[0]] = prepared_column - - # The zeros are replaced with validphys.coredata.SystematicError types, we label the - # name as "SKIP" so any future logic knows that these systematic errors should be ignored - dummy_error = SystematicError(0, 0, None, "SKIP") - eff_sys_errors.fillna(dummy_error, 
inplace=True) - - keep_columns = (systypes["name"].isin(special_types)) | ~( - systypes.duplicated(subset="name", keep="first") - ) - keep_columns.index = eff_sys_errors.columns - eff_sys_errors = eff_sys_errors.loc[:, keep_columns] - # After slicing with keep_columns, the index - # of the systematic errors need to be reset - eff_sys_errors.columns = range(1, len(eff_sys_errors.columns) + 1) - - systype_table = systypes[keep_columns] - systype_table.index = range(1, len(eff_sys_errors.columns) + 1) - - return eff_sys_errors, systype_table diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 6126a14f34..03a8214da0 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -24,86 +24,91 @@ log = logging.getLogger(__name__) -def covmat_from_systematics(commondata): - """Function that takes in a :py:class:`validphys.coredata.CommonData` object - and generates the associated covariance matrix using the systematics. The logic - is best described by the now deprecated C++ code: - - .. 
code-block:: c++ - - auto CovMat = NNPDF::matrix(ndat, ndat); - - for (int i = 0; i < ndat; i++) - { - for (int j = 0; j < ndat; j++) - { - double sig = 0.0; - double signor = 0.0; - - // Statistical error - if (i == j) - sig += pow(stat_error[i],2); - - for (int l = 0; l < nsys; l++) - { - sysError const& isys = systematic_errors[i][l]; - sysError const& jsys = systematic_errors[j][l]; - if (isys.name != jsys.name) - throw RuntimeException("ComputeCovMat", "Inconsistent naming of systematics"); - if (isys.name == "SKIP") - continue; // Continue if systype is skipped - if ((isys.name == "THEORYCORR" || isys.name == "THEORYUNCORR") && !use_theory_errors) - continue; // Continue if systype is theoretical and use_theory_errors == false - const bool is_correlated = ( isys.name != "UNCORR" && isys.name !="THEORYUNCORR"); - if (i == j || is_correlated) - switch (isys.type) - { - case ADD: sig += isys.add *jsys.add; break; - case MULT: if (mult_errors) { signor += isys.mult*jsys.mult; break; } - else { continue; } - case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); - } - } - - // Covariance matrix entry - CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); - // Covariance matrix weight - CovMat(i, j) /= sqrt_weights[i]*sqrt_weights[j]; - } - } - - return CovMat; - } - - We see that for the diagonal elements of the covariance matrix, we must add the squared - statistical error. This is done by creating a diagonal matrix from the vector of statistical - errors, called ``stat_mat``, and squaring the matrix before the return line. +def split_systematics(commondata): + """Take the systematics table from a commondata and remove any systematics + which have name "SKIP". Next transform the systematics into the correct units. 
+ Additive (ADD) systematics are left unchanged, whereas multiplicative (MULT) + systematics need to be converted from a percentage into an uncertainty + by multiplying by the central value and dividing by 100. - We now pick datapoint i and loop over all other datapoints j. For each such pair of points, - loop over all the systematics, if the l'th systematic of data point i has name ``SKIP`` we ignore - it. This is handled by scanning over all sytematic errors in the ``sys_errors`` dataframe and dropping - any columns which correspond to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe - defined below contains only the systematics without the ``SKIP`` name. + Finally the systematics are split into the different components which + are treated differently when constructing the covariance matrix. - Note that in the switch statement an ADDitive or MULTiplicative systype of systematic i is handled - by either multiplying the additive or multiplicative uncertainties respectively. We instead opt to - create a purely additive systematic table where all elements are to be understood as additive - systematics and systematics of type MULT have been correctly transformed to their additive - values. This dataframe we call ``additive_mat``. + Returns a tuple of the statistical uncertainty and 5 possible groups + of systematics uncertainties: UNCORR, CORR, THEORYUNCORR, THEORYCORR and + inter-dataset systematics - Next we notice that a systematic is correlated if its name is neither ``UNCORR`` nor ``THEORYUNCORR``. We - create a dataframe called ``correlated_mat`` where any column corresponding to an ``UNCORR`` or - ``THEORYUNCORR`` systematic is set to 0. + Returns + ------- + uncertainties: tuple + stat: np.array + 1-D array containing statistical uncertainties + uncorrelated: np.2darray + numpy array of uncorrelated systematics, can be empty if there + are no uncorrelated systematics. 
These are semantically similar + to statistical error, and only contribute the diagonal component + of the covariance matrix, because they are uncorrelated across + data points. + correlated: np.2darray + Similar to uncorrelated except they can be correlated across + data points. + theory_uncorr: np.2darray + numpy array of uncorrelated "theory" uncertainties which + are not to be confused with theory covariance uncertainties. + Instead they are, for example, uncertainties in the c-factors + theory_corr: np.2darray + same as theory_uncorr except they can be correlated across + data points. + special_corr: pd.DataFrame + a dataframe of the systematics which can be correlated across + datasets, the columns of the dataframe are the systematic names. + Each systematic of this type has a unique name, which is used + to correlated these systematics across datasets. - From here it's a matter of staring at a piece of paper for a while to realise the contribution to the - covariance matrix arising due to correlated systematics is ``correlated_mat @ additive_mat.T``. Note - that in the case where ``i == j`` is met, we double count this contribution using this method, so we set - the diagonal elements of this particular contribution to 0 by using ``np.fill_diagonal``. + """ + systype = commondata.systype_table["name"].values + # Dropping systematics that have type SKIP + sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] + systype = systype[systype != "SKIP"] - Finally, the ``i == j`` case is handled by summing the square row elements of ``additive_mat``, this is - done by using the ``np.einsum`` function. 
+ # Diagonal matrix containing the statistical uncertainty for each + # data point + stat = commondata.stat_errors.values - For more information on the generation of the covariance matrix see the `paper `_ + # Systematic uncertainties converted to proper units (additives are left + # unchanged, multiplicative uncertainties are in percentage format so get + # multiplied by central value and divided by 100) + sys_mat = sys_errors.apply( + lambda x: [ + i.add if i.sys_type == "ADD" else (i.mult * j / 100) + for i, j in zip(x, commondata.central_values) + ] + ) + # set columns for special_corr errors + sys_mat.columns = systype + + special = ["THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR"] + + special_corr = sys_mat.loc[:, ~np.isin(systype, special)] + thunc = sys_mat.loc[:, systype == "THEORYUNCORR"].values + unc = sys_mat.loc[:, systype == "UNCORR"].values + thcorr = sys_mat.loc[:, systype == "THEORYCORR"].values + corr = sys_mat.loc[:, systype == "CORR"].values + return stat, unc, corr, thunc, thcorr, special_corr + + +def covmat_from_systematics(commondata, use_theory_errors=True): + """Taking the tuple of statistical uncertainty and systematics returned by + :py:meth:`split_systematics` and construct the covariance matrix. uncorrelated + contributions from ``stat``, ``uncorrelated`` and ``theory_uncorr`` are + added in quadrature + to the diagonal of the covmat. The correlated systematics ``correlated``, + ``theory_corr`` and ``special_corr`` are multiplied by their transpose + to give contributions to both the diagonal and off diagonal components of + the covmat. + + For more information on the generation of the covariance matrix see the + `paper `_ outlining the procedure, specifically equation 2 and surrounding text. Paramaters @@ -111,6 +116,8 @@ def covmat_from_systematics(commondata): commondata : validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. 
+ use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat Returns ------- @@ -141,48 +148,63 @@ def covmat_from_systematics(commondata): [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ..., 4.14126235e-05, 4.15843357e-05, 1.43824457e-04]]) """ - systype = commondata.systype_table["name"] - # Dropping systematics that have type SKIP - sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] + stat, unc, corr, thunc, thcorr, special_corr = split_systematics(commondata) - # Diagonal matrix containing the statistical uncertainty for each - # data point - stat_mat = np.diag(commondata.stat_errors.to_numpy()) - - # Systematic uncertainties converted to additive uncertainties - additive_mat = sys_errors.apply( - lambda x: [ - i.add if i.sys_type == "ADD" else (i.mult * j / 100) - for i, j in zip(x, commondata.central_values) - ] + special_vals = special_corr.values + cov_mat = ( + np.diag( + stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors + ) + + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + + special_vals @ special_vals.T + ) ) + return cov_mat - # Matrix where all non-zero values are due to correlated systematics - correlated_mat = additive_mat.copy() - # If all systypes were SKIP then we'd have an empty df - if not correlated_mat.empty: - correlated_mat.loc[:, (systype == "UNCORR") | (systype == "THEORYUNCORR")] = 0 - correlated_mat = correlated_mat.to_numpy() - additive_mat = additive_mat.to_numpy() - else: - # Some datasets have zero systematic uncertainties. - # We treat this special edge case by creating - # a matrix of zeros so it can be broadcast - # with the statistical error matrix at the end - correlated_mat = additive_mat = np.zeros_like(stat_mat) +def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=True): + """Given a list of commondata, construct the full covariance matrix. 
- # Avoid double counting in case that the i = j element is a correlated - # systematic. We take care of this case in the einsum below - off_diagonals = correlated_mat @ additive_mat.T - np.fill_diagonal(off_diagonals, 0) + This is similar to :py:meth:`covmat_from_systematics` + except that ``special_corr`` systematics are concatenated across all datasets + before it is multiplied by its transpose to give off block-diagonal + contributions. The other systematics contribute to the block diagonal in the + same way as :py:meth:`covmat_from_systematics`. - cov_mat = ( - stat_mat ** 2 - + off_diagonals - + np.diag(np.einsum("ij, ij -> i", additive_mat, additive_mat)) - ) - return cov_mat + Paramaters + ---------- + list_of_commondata : list[validphys.coredata.CommonData] + list of CommonData objects. + use_theory_errors: bool + Whether or not to include errors with name ``THEORY*` in the covmat + + Returns + ------- + cov_mat : np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. + """ + special_corrs = [] + block_diags = [] + for cd in list_of_commondata: + stat, unc, corr, thunc, thcorr, special_corr = split_systematics(cd) + special_corrs.append(special_corr) + diag_covmat = ( + np.diag( + stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors + ) + + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + ) + ) + block_diags.append(diag_covmat) + special_sys = pd.concat(special_corrs, axis=0, sort=False) + special_sys.fillna(0, inplace=True) + diag = la.block_diag(*block_diags) + return diag + special_sys.values @ special_sys.values.T def sqrt_covmat(covariance_matrix): """Function that computes the square root of the covariance matrix. 
diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 9266a502b8..916e9805a5 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -12,10 +12,12 @@ from validphys.api import API -from validphys.commondataparser import combine_commondata, load_commondata -from validphys.covmats import covmat_from_systematics, sqrt_covmat +from validphys.commondataparser import load_commondata +from validphys.covmats import ( + sqrt_covmat, + commondatas_covmat_from_systematics +) from validphys.loader import Loader -from validphys.tableloader import parse_exp_mat from validphys.tests.conftest import THEORYID REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") @@ -36,8 +38,7 @@ def test_covmat_from_systematics_correlated(): cds = list(map(l.check_commondata, correlated_datasets)) ld_cds = list(map(load_commondata, cds)) - combo_cd = combine_commondata(ld_cds) - covmat = covmat_from_systematics(combo_cd) + covmat = commondatas_covmat_from_systematics(ld_cds) dss = [ l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets @@ -45,7 +46,7 @@ def test_covmat_from_systematics_correlated(): exp = l.check_experiment("null", dss) ld_exp = exp.load() - assert np.allclose(ld_exp.get_covmat(), covmat) + np.testing.assert_allclose(ld_exp.get_covmat(), covmat) def test_covmat_from_systematics(data_config): @@ -64,9 +65,8 @@ def test_covmat_from_systematics(data_config): for cd in cds ] cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) - combo_cd = combine_commondata(cut_ld_cds) - covmat = covmat_from_systematics(combo_cd) + covmat = commondatas_covmat_from_systematics(cut_ld_cds) dss = [ l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") From 98fccc19818b5a8394236151d313abf9b76e9388 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 21 Oct 2020 14:17:03 +0100 Subject: [PATCH 08/15] Update names and 
docstrings of covmat construction functions --- validphys2/src/validphys/covmats.py | 139 ++++++++++-------- .../src/validphys/tests/test_covmats.py | 6 +- 2 files changed, 79 insertions(+), 66 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 03a8214da0..c2da1a8859 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -23,20 +23,29 @@ log = logging.getLogger(__name__) +INTRA_DATASET_SYS_NAME = ("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR") -def split_systematics(commondata): - """Take the systematics table from a commondata and remove any systematics - which have name "SKIP". Next transform the systematics into the correct units. - Additive (ADD) systematics are left unchanged, whereas multiplicative (MULT) - systematics need to be converted from a percentage into an uncertainty - by multiplying by the central value and dividing by 100. - Finally the systematics are split into the different components which - are treated differently when constructing the covariance matrix. +def split_uncertainties(commondata): + """Take the statistical uncertainty and systematics table from ``commondata`` + and split into the different different uncertainty archetypes each of + which may be handled differently by future operations, such as constructing + the covariance matrix. - Returns a tuple of the statistical uncertainty and 5 possible groups - of systematics uncertainties: UNCORR, CORR, THEORYUNCORR, THEORYCORR and - inter-dataset systematics + First we remove any systematics which have name "SKIP" because, as the name + implies, these systematics will be skipped. + + Next the systematics are transformed into the correct units: + + - Additive (ADD) systematics are left unchanged + - multiplicative (MULT) systematics need to be converted from a + percentage into an uncertainty by multiplying by the central value + and dividing by 100. 
+ + Finally, the systematics are split into the five possible archetypes + of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), + theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and + special_correlated systematics Returns ------- @@ -52,58 +61,64 @@ def split_systematics(commondata): correlated: np.2darray Similar to uncorrelated except they can be correlated across data points. - theory_uncorr: np.2darray + theory_uncorrelated: np.2darray numpy array of uncorrelated "theory" uncertainties which are not to be confused with theory covariance uncertainties. Instead they are, for example, uncertainties in the c-factors - theory_corr: np.2darray - same as theory_uncorr except they can be correlated across + theory_correlated: np.2darray + same as theory_uncorrelated except they can be correlated across data points. - special_corr: pd.DataFrame - a dataframe of the systematics which can be correlated across + special_correlated: pd.DataFrame + Dataframe of the systematics which can be correlated across datasets, the columns of the dataframe are the systematic names. - Each systematic of this type has a unique name, which is used - to correlated these systematics across datasets. + Each systematic of this type has a unique name, by returning + these systematics as a dataframe with the unique names as column + headers, we can easily join up the special_correlated systematics + from multiple datasets, retaining correlations, using + ``pandas.concat``. 
""" - systype = commondata.systype_table["name"].values + sys_name = commondata.systype_table["name"].to_numpy() # Dropping systematics that have type SKIP - sys_errors = commondata.sys_errors.loc[:, systype != "SKIP"] - systype = systype[systype != "SKIP"] + sys_errors_df = commondata.sys_errors.loc[:, sys_name != "SKIP"] + sys_name = sys_name[sys_name != "SKIP"] # Diagonal matrix containing the statistical uncertainty for each # data point - stat = commondata.stat_errors.values + stat = commondata.stat_errors.to_numpy() - # Systematic uncertainties converted to proper units (additives are left - # unchanged, multiplicative uncertainties are in percentage format so get - # multiplied by central value and divided by 100) - sys_mat = sys_errors.apply( + # Systematic uncertainties converted to absolute uncertainties (additives + # are left unchanged, multiplicative uncertainties are in percentage format + # so get multiplied by central value and divided by 100). + abs_sys_errors_df = sys_errors_df.apply( lambda x: [ i.add if i.sys_type == "ADD" else (i.mult * j / 100) for i, j in zip(x, commondata.central_values) ] ) - # set columns for special_corr errors - sys_mat.columns = systype + abs_sys_errors = abs_sys_errors_df.to_numpy() - special = ["THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR"] + # set columns for special_correlated errors + abs_sys_errors_df.columns = sys_name + special_corr = abs_sys_errors_df.loc[:, ~np.isin(sys_name, INTRA_DATASET_SYS_NAME)] - special_corr = sys_mat.loc[:, ~np.isin(systype, special)] - thunc = sys_mat.loc[:, systype == "THEORYUNCORR"].values - unc = sys_mat.loc[:, systype == "UNCORR"].values - thcorr = sys_mat.loc[:, systype == "THEORYCORR"].values - corr = sys_mat.loc[:, systype == "CORR"].values + thunc = abs_sys_errors[:, sys_name == "THEORYUNCORR"] + unc = abs_sys_errors[:, sys_name == "UNCORR"] + thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"] + corr = abs_sys_errors[:, sys_name == "CORR"] return stat, unc, corr, thunc, 
thcorr, special_corr def covmat_from_systematics(commondata, use_theory_errors=True): - """Taking the tuple of statistical uncertainty and systematics returned by - :py:meth:`split_systematics` and construct the covariance matrix. uncorrelated - contributions from ``stat``, ``uncorrelated`` and ``theory_uncorr`` are - added in quadrature - to the diagonal of the covmat. The correlated systematics ``correlated``, - ``theory_corr`` and ``special_corr`` are multiplied by their transpose + """Given a single ``commondata``, obtain the tuple of statistical + uncertainty and systematics from :py:meth:`split_uncertainties` and + construct the covariance matrix. + Uncorrelated contributions from: ``stat``, ``uncorrelated`` and + ``theory_uncorrelated`` are added in quadrature + to the diagonal of the covmat. + + The correlated systematics: ``correlated``, + ``theory_correlated`` and ``special_correlated`` are multiplied by their transpose to give contributions to both the diagonal and off diagonal components of the covmat. 
@@ -148,27 +163,27 @@ def covmat_from_systematics(commondata, use_theory_errors=True): [3.46727332e-05, 3.45492831e-05, 2.56283708e-05, ..., 4.14126235e-05, 4.15843357e-05, 1.43824457e-04]]) """ - stat, unc, corr, thunc, thcorr, special_corr = split_systematics(commondata) + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(commondata) special_vals = special_corr.values - cov_mat = ( - np.diag( - stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors - ) - + ( - corr @ corr.T + - thcorr @ thcorr.T * use_theory_errors + - special_vals @ special_vals.T - ) + cov_mat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + ( + corr @ corr.T + + thcorr @ thcorr.T * use_theory_errors + + special_vals @ special_vals.T ) return cov_mat -def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=True): + +def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True): """Given a list of commondata, construct the full covariance matrix. This is similar to :py:meth:`covmat_from_systematics` - except that ``special_corr`` systematics are concatenated across all datasets - before it is multiplied by its transpose to give off block-diagonal + except that ``special_corr`` is concatenated across all datasets + before being multiplied by its transpose to give off block-diagonal contributions. The other systematics contribute to the block diagonal in the same way as :py:meth:`covmat_from_systematics`. 
@@ -189,23 +204,21 @@ def commondatas_covmat_from_systematics(list_of_commondata, use_theory_errors=Tr special_corrs = [] block_diags = [] for cd in list_of_commondata: - stat, unc, corr, thunc, thcorr, special_corr = split_systematics(cd) + stat, unc, corr, thunc, thcorr, special_corr = split_uncertainties(cd) special_corrs.append(special_corr) - diag_covmat = ( - np.diag( - stat ** 2 + (unc**2).sum(axis=1) + (thunc**2).sum(axis=1)* use_theory_errors - ) - + ( - corr @ corr.T + - thcorr @ thcorr.T * use_theory_errors - ) - ) + diag_covmat = np.diag( + stat ** 2 + + (unc ** 2).sum(axis=1) + + (thunc ** 2).sum(axis=1) * use_theory_errors + ) + (corr @ corr.T + thcorr @ thcorr.T * use_theory_errors) block_diags.append(diag_covmat) special_sys = pd.concat(special_corrs, axis=0, sort=False) + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. special_sys.fillna(0, inplace=True) diag = la.block_diag(*block_diags) return diag + special_sys.values @ special_sys.values.T + def sqrt_covmat(covariance_matrix): """Function that computes the square root of the covariance matrix. 
diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 916e9805a5..4ca16649a9 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -15,7 +15,7 @@ from validphys.commondataparser import load_commondata from validphys.covmats import ( sqrt_covmat, - commondatas_covmat_from_systematics + datasets_covmat_from_systematics ) from validphys.loader import Loader from validphys.tests.conftest import THEORYID @@ -38,7 +38,7 @@ def test_covmat_from_systematics_correlated(): cds = list(map(l.check_commondata, correlated_datasets)) ld_cds = list(map(load_commondata, cds)) - covmat = commondatas_covmat_from_systematics(ld_cds) + covmat = datasets_covmat_from_systematics(ld_cds) dss = [ l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets @@ -66,7 +66,7 @@ def test_covmat_from_systematics(data_config): ] cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) - covmat = commondatas_covmat_from_systematics(cut_ld_cds) + covmat = datasets_covmat_from_systematics(cut_ld_cds) dss = [ l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") From b601377269cb560aa319cf92c5783a18cb274435 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 23 Oct 2020 00:29:15 +0100 Subject: [PATCH 09/15] Add back the docstring from shayan editing to make sense with slight reshuffling of functions --- validphys2/src/validphys/covmats.py | 82 ++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index c2da1a8859..5dd1dc2e8b 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -27,25 +27,73 @@ def split_uncertainties(commondata): - """Take the statistical uncertainty and systematics table from ``commondata`` + """Take the statistical uncertainty and systematics table from + a 
:py:class:`validphys.coredata.CommonData` object and split into the different different uncertainty archetypes each of which may be handled differently by future operations, such as constructing - the covariance matrix. + the covariance matrix. The logic of the different archetypes + is best described by the now deprecated C++ code: + + .. code-block:: c++ + auto CovMat = NNPDF::matrix(ndat, ndat); + for (int i = 0; i < ndat; i++) + { + for (int j = 0; j < ndat; j++) + { + double sig = 0.0; + double signor = 0.0; + // Statistical error + if (i == j) + sig += pow(stat_error[i],2); + for (int l = 0; l < nsys; l++) + { + sysError const& isys = systematic_errors[i][l]; + sysError const& jsys = systematic_errors[j][l]; + if (isys.name != jsys.name) + throw RuntimeException("ComputeCovMat", "Inconsistent naming of systematics"); + if (isys.name == "SKIP") + continue; // Continue if systype is skipped + if ((isys.name == "THEORYCORR" || isys.name == "THEORYUNCORR") && !use_theory_errors) + continue; // Continue if systype is theoretical and use_theory_errors == false + const bool is_correlated = ( isys.name != "UNCORR" && isys.name !="THEORYUNCORR"); + if (i == j || is_correlated) + switch (isys.type) + { + case ADD: sig += isys.add *jsys.add; break; + case MULT: if (mult_errors) { signor += isys.mult*jsys.mult; break; } + else { continue; } + case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); + } + } + // Covariance matrix entry + CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); + // Covariance matrix weight + CovMat(i, j) /= sqrt_weights[i]*sqrt_weights[j]; + } + } - First we remove any systematics which have name "SKIP" because, as the name - implies, these systematics will be skipped. + return CovMat; + } - Next the systematics are transformed into the correct units: + if the systematic of data point i has name ``SKIP`` we ignore + it. 
This is handled by scanning over all systematic errors in the + ``sys_errors`` dataframe and dropping any columns which correspond + to a systematic error name of ``SKIP``, thus the ``sys_errors`` dataframe + defined below contains only the systematics without the ``SKIP`` name. + Note that in the switch statement an ADDitive or MULTiplicative systype + is handled by either multiplying the additive or multiplicative + uncertainties respectively. We convert uncertainties so that they are all + absolute: - Additive (ADD) systematics are left unchanged - multiplicative (MULT) systematics need to be converted from a - percentage into an uncertainty by multiplying by the central value + percentage by multiplying by the central value and dividing by 100. Finally, the systematics are split into the five possible archetypes of systematics uncertainties: uncorrelated (UNCORR), correlated (CORR), theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and - special_correlated systematics + special_correlated systematics. Returns ------- @@ -110,17 +158,22 @@ def split_uncertainties(commondata): def covmat_from_systematics(commondata, use_theory_errors=True): - """Given a single ``commondata``, obtain the tuple of statistical - uncertainty and systematics from :py:meth:`split_uncertainties` and + """Given a single :py:class:`validphys.coredata.CommonData`, obtain the + tuple of statistical uncertainty and systematics from + :py:meth:`split_uncertainties` and construct the covariance matrix. + Uncorrelated contributions from: ``stat``, ``uncorrelated`` and ``theory_uncorrelated`` are added in quadrature to the diagonal of the covmat. - The correlated systematics: ``correlated``, - ``theory_correlated`` and ``special_correlated`` are multiplied by their transpose to give contributions to both the diagonal and off diagonal components of the covmat. 
+ From here it's a matter of staring at a piece of paper for a while to + realise the contribution to the covariance matrix arising due to + correlated systematics is schematically ``A_correlated @ A_correlated.T``, + where A_correlated is a matrix N_dat by N_sys. The total contribution + from correlated systematics is found by adding together the result of + multiplying each correlated systematic matrix by its transpose + (``correlated``, ``theory_correlated`` and ``special_correlated``). For more information on the generation of the covariance matrix see the `paper `_ @@ -179,7 +232,8 @@ def covmat_from_systematics(commondata, use_theory_errors=True): def datasets_covmat_from_systematics(list_of_commondata, use_theory_errors=True): - """Given a list of commondata, construct the full covariance matrix. + """Given a list containing :py:class:`validphys.coredata.CommonData` s, + construct the full covariance matrix. This is similar to :py:meth:`covmat_from_systematics` except that ``special_corr`` is concatenated across all datasets From d8fb0437994a15b426e329daef72ea39c1f2934d Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 23 Oct 2020 00:33:38 +0100 Subject: [PATCH 10/15] undo erroneous removal of blank lines --- validphys2/src/validphys/covmats.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 5dd1dc2e8b..0de87dc2ec 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -35,16 +35,20 @@ def split_uncertainties(commondata): is best described by the now deprecated C++ code: .. 
code-block:: c++ + auto CovMat = NNPDF::matrix(ndat, ndat); + for (int i = 0; i < ndat; i++) { for (int j = 0; j < ndat; j++) { double sig = 0.0; double signor = 0.0; + // Statistical error if (i == j) sig += pow(stat_error[i],2); + for (int l = 0; l < nsys; l++) { sysError const& isys = systematic_errors[i][l]; @@ -65,6 +69,7 @@ def split_uncertainties(commondata): case UNSET: throw RuntimeException("ComputeCovMat", "UNSET systype encountered"); } } + // Covariance matrix entry CovMat(i, j) = (sig + signor*central_values[i]*central_values[j]*1e-4); // Covariance matrix weight From 589074cfe6a24af6e7df1184eba11accc187726e Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 29 Oct 2020 11:59:43 +0000 Subject: [PATCH 11/15] Removing unused imports --- validphys2/src/validphys/commondataparser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 7fcdb9644e..c1f95bb07b 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -8,12 +8,10 @@ """ from operator import attrgetter -import numpy as np import pandas as pd -import scipy.linalg as la from validphys.core import peek_commondata_metadata -from validphys.coredata import CommonData, SystematicError +from validphys.coredata import CommonData def load_commondata(spec): """ From 1b67c309d28cefc19180da732906f9aff8ea5759 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 29 Oct 2020 14:05:27 +0000 Subject: [PATCH 12/15] Adding new fixtures for cut data and correlated data Making tests use these fixtures --- validphys2/src/validphys/tests/conftest.py | 38 ++++++++++++++ .../src/validphys/tests/test_covmats.py | 50 +++++-------------- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/validphys2/src/validphys/tests/conftest.py b/validphys2/src/validphys/tests/conftest.py index 3f3e2d9e8d..af42c65f87 100644 --- 
a/validphys2/src/validphys/tests/conftest.py +++ b/validphys2/src/validphys/tests/conftest.py @@ -34,6 +34,26 @@ def tmp(tmpdir): 'datasets': [{'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}]} ] +# Experiments which have non trivial correlations between their datasets +CORR_EXPERIMENTS = [ + { + 'experiment': 'ATLAS', + 'datasets': [ + {'dataset': 'ATLASWZRAP36PB', 'cfac': ['QCD']}, + {'dataset': 'ATLASZHIGHMASS49FB', 'cfac': ['QCD']}, + {'dataset': 'ATLASLOMASSDY11EXT', 'cfac': ['QCD']}, + {'dataset': 'ATLASWZRAP11', 'frac': 0.5, 'cfac': ['QCD']}, + ], + }, + { + 'experiment': 'CMS', + 'datasets': [ + {'dataset': 'CMSZDIFF12', 'cfac': ('QCD', 'NRM'), 'sys': 10}, + {'dataset': 'CMSJETS11', 'frac': 0.5, 'sys': 10}, + ], + }, +] + SINGLE_EXP = [ { 'experiment': 'pseudo experiment', @@ -69,6 +89,12 @@ def tmp(tmpdir): def data_config(): return base_config +@pytest.fixture(scope='module') +def data_internal_cuts_config(data_config): + config_dict = dict(data_config) + config_dict.update(use_cuts='internal') + return config_dict + @pytest.fixture(scope='module') def data_witht0_config(): config_dict = dict( @@ -83,6 +109,18 @@ def data_singleexp_witht0_config(data_witht0_config): config_dict.update({'experiments': SINGLE_EXP}) return config_dict +@pytest.fixture(scope='module') +def data_with_correlations_config(): + corr_dict = dict(base_config) + corr_dict.update(experiments=CORR_EXPERIMENTS) + return corr_dict + +@pytest.fixture(scope='module') +def data_with_correlations_internal_cuts_config(data_with_correlations_config): + config_dict = dict(data_with_correlations_config) + config_dict.update(use_cuts='internal') + return config_dict + @pytest.fixture(scope='module') def weighted_data_witht0_config(data_witht0_config): config_dict = dict(data_witht0_config) diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index 4ca16649a9..b46cc20fa0 100644 --- a/validphys2/src/validphys/tests/test_covmats.py 
+++ b/validphys2/src/validphys/tests/test_covmats.py @@ -3,12 +3,11 @@ Tests related to the computation of the covariance matrix and its derivatives """ -import pathlib +import random import pytest import numpy as np -import random from validphys.api import API @@ -20,64 +19,41 @@ from validphys.loader import Loader from validphys.tests.conftest import THEORYID -REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") - -def test_covmat_from_systematics_correlated(): +def test_covmat_from_systematics_correlated(data_with_correlations_config): """Test the covariance matrix generation from a set of correlated datasets given their systematic errors """ - l = Loader() - correlated_datasets = [ - "ATLASWZRAP36PB", - "ATLASZHIGHMASS49FB", - "ATLASLOMASSDY11EXT", - "ATLASWZRAP11", - ] - - cds = list(map(l.check_commondata, correlated_datasets)) + data = API.data(**data_with_correlations_config) + cds = [ds.commondata for ds in data.datasets] + ld_cds = list(map(load_commondata, cds)) covmat = datasets_covmat_from_systematics(ld_cds) - dss = [ - l.check_dataset(i, theoryid=THEORYID, cuts=None) for i in correlated_datasets - ] - exp = l.check_experiment("null", dss) - ld_exp = exp.load() + cpp_covmat = API.groups_covmat(**data_with_correlations_config) - np.testing.assert_allclose(ld_exp.get_covmat(), covmat) + np.testing.assert_allclose(cpp_covmat, covmat) -def test_covmat_from_systematics(data_config): +def test_covmat_from_systematics(data_internal_cuts_config): """Test which checks the python computation of the covmat relating to a collection of datasets matches that of the C++ computation. Note that the datasets are cut using the internal rules, but the datasets are not correlated. 
""" - l = Loader() - exps = API.experiments(**data_config) + data = API.data(**data_internal_cuts_config) + cds = [ds.commondata for ds in data.datasets] - cds = [l.check_commondata(i.name) for exp in exps for i in exp.datasets] ld_cds = list(map(load_commondata, cds)) - internal_cuts = [ - l.check_internal_cuts(cd, API.rules(theoryid=THEORYID, use_cuts="internal")) - for cd in cds - ] + internal_cuts = [ds.cuts for ds in data.datasets] cut_ld_cds = list(map(lambda x: x[0].with_cuts(x[1]), zip(ld_cds, internal_cuts))) covmat = datasets_covmat_from_systematics(cut_ld_cds) - dss = [ - l.check_dataset(i.name, theoryid=THEORYID, cuts="internal") - for exp in exps - for i in exp.datasets - ] - exp = l.check_experiment("null", dss) - ld = exp.load() - cpp_covmat = ld.get_covmat() + cpp_covmat = API.groups_covmat(**data_internal_cuts_config) - assert np.allclose(cpp_covmat, covmat) + np.testing.assert_allclose(cpp_covmat, covmat) def test_cpp_sqrtcovmat(): From e58f4438ed3da1eacb56a4ab16a9b983d4d17186 Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Tue, 3 Nov 2020 17:36:58 +0000 Subject: [PATCH 13/15] Apply suggestions from code review Changing `conftest` datasets to use the data keyword Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/tests/conftest.py | 27 ++++++++-------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/validphys2/src/validphys/tests/conftest.py b/validphys2/src/validphys/tests/conftest.py index af42c65f87..46683785a0 100644 --- a/validphys2/src/validphys/tests/conftest.py +++ b/validphys2/src/validphys/tests/conftest.py @@ -35,23 +35,13 @@ def tmp(tmpdir): ] # Experiments which have non trivial correlations between their datasets -CORR_EXPERIMENTS = [ - { - 'experiment': 'ATLAS', - 'datasets': [ - {'dataset': 'ATLASWZRAP36PB', 'cfac': ['QCD']}, - {'dataset': 'ATLASZHIGHMASS49FB', 'cfac': ['QCD']}, - {'dataset': 'ATLASLOMASSDY11EXT', 
'cfac': ['QCD']}, - {'dataset': 'ATLASWZRAP11', 'frac': 0.5, 'cfac': ['QCD']}, - ], - }, - { - 'experiment': 'CMS', - 'datasets': [ - {'dataset': 'CMSZDIFF12', 'cfac': ('QCD', 'NRM'), 'sys': 10}, - {'dataset': 'CMSJETS11', 'frac': 0.5, 'sys': 10}, - ], - }, +CORR_DATA = [ + {'dataset': 'ATLASWZRAP36PB', 'cfac': ['QCD']}, + {'dataset': 'ATLASZHIGHMASS49FB', 'cfac': ['QCD']}, + {'dataset': 'ATLASLOMASSDY11EXT', 'cfac': ['QCD']}, + {'dataset': 'ATLASWZRAP11', 'frac': 0.5, 'cfac': ['QCD']}, + {'dataset': 'CMSZDIFF12', 'cfac': ('QCD', 'NRM'), 'sys': 10}, + {'dataset': 'CMSJETS11', 'frac': 0.5, 'sys': 10}, ] SINGLE_EXP = [ @@ -112,7 +102,8 @@ def data_singleexp_witht0_config(data_witht0_config): @pytest.fixture(scope='module') def data_with_correlations_config(): corr_dict = dict(base_config) - corr_dict.update(experiments=CORR_EXPERIMENTS) + corr_dict.pop("experiments") + corr_dict.update(dataset_inputs=CORR_DATA) return corr_dict @pytest.fixture(scope='module') From 780bcaf409cd26b027b498c42004465a669668f0 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 4 Nov 2020 10:56:04 +0000 Subject: [PATCH 14/15] Adding Parameters header to split_uncertainties function --- validphys2/src/validphys/covmats.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 0de87dc2ec..1172725a76 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -100,6 +100,13 @@ def split_uncertainties(commondata): theory_uncorrelated (THEORYUNCORR), theory_correlated (THEORYCORR) and special_correlated systematics. + + Parameters + ---------- + commondata: :py:class:`validphys.coredata.CommonData` + A commondata object containing information on the data central values, statistical and + systematic uncertainties, and information on their correlations. 
+ Returns ------- uncertainties: tuple From 1c458438db4148ecf4aefd14c1ca98d5e7929e3a Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 4 Nov 2020 12:27:23 +0000 Subject: [PATCH 15/15] Making return type namedtuple Updating docstring to reflect this --- validphys2/src/validphys/covmats.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 1172725a76..385c2921e4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -1,6 +1,7 @@ """Module for handling logic and manipulation of covariance and correlation matrices on different levels of abstraction """ +from collections import namedtuple import logging import numpy as np @@ -25,6 +26,10 @@ INTRA_DATASET_SYS_NAME = ("THEORYUNCORR", "UNCORR", "THEORYCORR", "CORR") +Uncertainties = namedtuple( + "Uncertainties", ("stat", "unc", "corr", "thunc", "thcorr", "special") +) + def split_uncertainties(commondata): """Take the statistical uncertainty and systematics table from @@ -109,26 +114,28 @@ def split_uncertainties(commondata): Returns ------- - uncertainties: tuple + uncertainties: :py:class:`validphys.covmats.Uncertainties` + A ``namedtuple`` with the following attributes + stat: np.array 1-D array containing statistical uncertainties - uncorrelated: np.2darray + unc: np.2darray numpy array of uncorrelated systematics, can be empty if there are no uncorrelated systematics. These are semantically similar to statistical error, and only contribute the diagonal component of the covariance matrix, because they are uncorrelated across data points. - correlated: np.2darray + corr: np.2darray Similar to uncorrelated except they can be correlated across data points. - theory_uncorrelated: np.2darray + thunc: np.2darray numpy array of uncorrelated "theory" uncertainties which are not to be confused with theory covariance uncertainties. 
Instead they are, for example, uncertainties in the c-factors - theory_correlated: np.2darray + thcorr: np.2darray same as theory_uncorrelated except they can be correlated across data points. - special_correlated: pd.DataFrame + special: pd.DataFrame Dataframe of the systematics which can be correlated across datasets, the columns of the dataframe are the systematic names. Each systematic of this type has a unique name, by returning @@ -136,7 +143,6 @@ def split_uncertainties(commondata): headers, we can easily join up the special_correlated systematics from multiple datasets, retaining correlations, using ``pandas.concat``. - """ sys_name = commondata.systype_table["name"].to_numpy() # Dropping systematics that have type SKIP @@ -166,7 +172,9 @@ def split_uncertainties(commondata): unc = abs_sys_errors[:, sys_name == "UNCORR"] thcorr = abs_sys_errors[:, sys_name == "THEORYCORR"] corr = abs_sys_errors[:, sys_name == "CORR"] - return stat, unc, corr, thunc, thcorr, special_corr + + uncertainties = Uncertainties(stat, unc, corr, thunc, thcorr, special_corr) + return uncertainties def covmat_from_systematics(commondata, use_theory_errors=True):