Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
4fbe489
copying code from PR476 for data loading
May 5, 2020
d9b632b
altering load to use new function
May 5, 2020
6c0351c
Changing dataset -> name in config
May 5, 2020
a840f2c
Importing pandas
May 5, 2020
2833664
data container for commondata
May 13, 2020
08ba1d4
moving coredata.py to correct loc
May 13, 2020
e31a47d
adding commondata parser script
May 13, 2020
971c535
move load_data to commondataparser
May 13, 2020
e99bcb3
remove pandas import from core
May 13, 2020
3c702fb
adding CommonDataInfo class
May 13, 2020
3aa32e1
reverting back to old behaviour in core
May 14, 2020
dd90cae
populating CommonData object
May 15, 2020
01ff097
removing space in core
May 15, 2020
7b38cba
separating structure into parse_commondata and load_commondata
May 15, 2020
0756b3b
changing structure of CommonData object
May 15, 2020
2dac35c
Update core.py
RosalynLP May 15, 2020
64a9645
searching for setname in file name
May 15, 2020
a1d4ad5
Merge branch 'python-commondata-parser' of https://github.com/NNPDF/n…
May 15, 2020
e457a6c
adding test for commondata parser
May 15, 2020
fa32bd7
changing name to dataset in config
RosalynLP May 20, 2020
1ac7492
reverting to old config behaviour
RosalynLP May 20, 2020
26710ee
adding class for SystypeData
May 25, 2020
eda4a5d
adding systypeinfo object
May 25, 2020
b3ea44e
parse systype files as well
May 25, 2020
039b168
removing info objects
May 25, 2020
45749a0
adding error message
May 25, 2020
05b87a9
test for systype loading
May 25, 2020
d869c33
fixing bug in string
May 25, 2020
d6005fc
renaming namedtuple
May 25, 2020
04a75b7
choosing sys_index as the index for systype table
May 25, 2020
1c22b8f
updating test
May 25, 2020
a72813e
Format with black
voisey May 27, 2020
21b41ed
Remove unnecessary imports
voisey May 27, 2020
8bcd766
amending review comments
May 27, 2020
84088a5
merge
May 27, 2020
5652209
Merge branch 'master' into python-commondata-parser
voisey May 27, 2020
583e5d8
Make comments and docstrings consistent
voisey May 27, 2020
4e1b6d3
Removing explanation of raises in docstrings
May 27, 2020
7e7758b
Removing blank lines
May 27, 2020
0284526
Adding some more properties
May 27, 2020
c50394a
Fix name of argument given to CommonData
voisey May 27, 2020
c3a40f3
Using pathlib to handle setname parsing
May 27, 2020
d666c2b
getting process type from 1st entry
May 27, 2020
21a9dfe
merging CommonData and SystypeData objects
May 28, 2020
fa84a06
Correcting typos
May 28, 2020
f0e8a03
Fixing tests
May 28, 2020
c2312b3
Adding systype table to CommonData class
May 28, 2020
1195554
Adding __repr__ and __str__ methods for pretty printing the systemati…
May 28, 2020
d922ae0
Adding a __repr__ method
Jun 6, 2020
0e2bf01
changing comments
May 29, 2020
a92b92e
Removing blank lines
May 29, 2020
c11746e
loading empty systematics as empty dataframe
May 29, 2020
5495638
comment explaining try/except for sys load
May 29, 2020
a3bd404
remove dropna line
Jun 1, 2020
78e2010
test for ds with no systematics
Jun 1, 2020
84ed62e
separating parsing of systype files
Jun 1, 2020
c89c33e
checking table info against metadata from peek_commondata_metadata
Jun 1, 2020
c5a39ec
Adding a with_cuts method
Jun 1, 2020
e79bd2d
Adding tests for with_cuts method
Jun 5, 2020
ad1ab6b
Correcting consistency check
Jun 6, 2020
11c4b5d
Incrementing cuts by 1
Jun 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions validphys2/src/validphys/commondataparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
This module implements parsers for commondata and systype files into useful
datastructures, contained in the :py:mod:`validphys.coredata` module, which are
not backed by C++ managed memory, and so they can be easily pickled and
interfaced with common Python libraries. The integration of these objects into
the codebase is currently work in progress, and at the moment this module
serves as a proof of concept.
"""
from operator import attrgetter

import pandas as pd

from validphys.core import peek_commondata_metadata
from validphys.coredata import CommonData

def load_commondata(spec):
    """Load the data corresponding to a CommonDataSpec object.

    Parameters
    ----------
    spec : validphys.core.CommonDataSpec
        Specification object holding the paths to the commondata file
        (``spec.datafile``) and the systype file (``spec.sysfile``).

    Returns
    -------
    commondata : validphys.coredata.CommonData
        An object containing the parsed commondata and systype tables.
    """
    commondatafile = spec.datafile
    # Commondata files follow the naming convention DATA_<SETNAME>.dat,
    # so strip the 5-character "DATA_" prefix and the ".dat" suffix to
    # recover the set name.
    setname = commondatafile.name[5:-4]  # DATA prefix and .dat suffix
    systypefile = spec.sysfile

    commondata = parse_commondata(commondatafile, systypefile, setname)

    return commondata


def parse_commondata(commondatafile, systypefile, setname):
    """Parse a commondata file and a systype file into a CommonData.

    Parameters
    ----------
    commondatafile : file or path to file
        The commondata file to be parsed.
    systypefile : file or path to file
        The systype file to be parsed.
    setname : str
        Name of the dataset, checked for consistency against the metadata
        embedded in the commondata file.

    Returns
    -------
    commondata : CommonData
        An object containing the data and information from the commondata
        and systype files.
    """
    # First parse commondata file
    commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None)
    # Remove NaNs
    # TODO: replace commondata files with bad formatting
    # Build header
    commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
    # Every systematic contributes two columns (additive and multiplicative),
    # so the remaining columns after the fixed header give nsys.
    nsys = (commondatatable.shape[1] - len(commondataheader)) // 2
    for i in range(nsys):
        commondataheader += [f"sys.add.{i+1}", f"sys.mult.{i+1}"]
    commondatatable.columns = commondataheader
    commondatatable.set_index("entry", inplace=True)
    ndata = len(commondatatable)
    # Process type is taken from the first data point (commondata indexing
    # starts at 1).
    commondataproc = commondatatable["process"][1]
    # Check for consistency with commondata metadata
    cdmetadata = peek_commondata_metadata(commondatafile)
    if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata):
        raise ValueError(
            "Commondata table information does not match metadata: "
            f"got (name, nsys, ndata)=({setname}, {nsys}, {ndata}), "
            f"metadata says ({cdmetadata.name}, {cdmetadata.nsys}, {cdmetadata.ndata})"
        )

    # Now parse the systype file
    systypetable = parse_systypes(systypefile)

    # Populate CommonData object
    return CommonData(
        setname=setname,
        ndata=ndata,
        commondataproc=commondataproc,
        nkin=3,
        nsys=nsys,
        commondata_table=commondatatable,
        systype_table=systypetable,
    )

def parse_systypes(systypefile):
    """Parse a systype file into a pandas DataFrame.

    Parameters
    ----------
    systypefile : file or path to file
        The systype file to be parsed.

    Returns
    -------
    systypetable : pd.DataFrame
        Dataframe indexed by ``sys_index`` with one row per systematic and
        columns ``type`` and ``name``. For datasets with no systematics an
        empty dataframe with the same structure is returned.
    """
    systypeheader = ["sys_index", "type", "name"]
    try:
        systypetable = pd.read_csv(
            systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
        )
        systypetable.dropna(axis='columns', inplace=True)
    # Some datasets e.g. CMSWCHARMRAT have no systematics
    except pd.errors.EmptyDataError:
        systypetable = pd.DataFrame(columns=systypeheader)

    systypetable.set_index("sys_index", inplace=True)

    return systypetable
125 changes: 124 additions & 1 deletion validphys2/src/validphys/coredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class FKTableData:
xgrid : array, shape (nx)
The points in x at which the PDFs should be evaluated.

sigma : DataFrame
sigma : pd.DataFrame
For hadronic data, the columns are the indexes in the ``NfxNf`` list of
possible flavour combinations of two PDFs. The MultiIndex contains
three keys, the data index, an index into ``xgrid`` for the first PDF
Expand Down Expand Up @@ -117,3 +117,126 @@ class CFactorData:
description: str
central_value: np.array
uncertainty: np.array


@dataclasses.dataclass(eq=False)
class SystematicError:
    """A single systematic uncertainty on one data point, pairing the
    additive and multiplicative values with the systematic's type and name
    taken from the systype table."""
    add: float
    mult: float
    sys_type: str  # e.g. ADD
    name: str  # e.g. UNCORR

    def __repr__(self):
        # BUG FIX: the second string fragment was missing the `f` prefix, so
        # the literal text "{self.sys_type}, name={self.name}" was emitted
        # instead of the field values (and the comma had no trailing space).
        return (
            f"{self.__class__.__name__}(add={self.add}, mult={self.mult}, "
            f"sys_type={self.sys_type}, name={self.name})"
        )


@dataclasses.dataclass(eq=False)
class CommonData:
    """
    Data contained in Commondata files, relevant cuts applied.

    Parameters
    ----------

    setname : str
        Name of the dataset

    ndata : int
        Number of data points

    commondataproc : str
        Process type, one of 21 options

    nkin : int
        Number of kinematics specified

    nsys : int
        Number of systematics

    commondata_table : pd.DataFrame
        Pandas dataframe containing the commondata

    systype_table : pd.DataFrame
        Pandas dataframe containing the systype index
        for each systematic alongside the uncertainty
        type (ADD/MULT/RAND) and name
        (CORR/UNCORR/THEORYCORR/SKIP)
    """
    setname: str
    ndata: int
    commondataproc: str
    nkin: int
    nsys: int
    commondata_table: pd.DataFrame
    systype_table: pd.DataFrame

    def with_cuts(self, cuts):
        """Return a CommonData object where an integer mask has been
        applied, keeping only data points which pass cuts.

        Note if the first data point passes cuts, the first entry
        of ``cuts`` should be ``0``.

        Parameters
        ----------
        cuts : list or validphys.core.Cuts or None
            Zero-based indices of the data points to keep. ``None`` applies
            no cuts and returns ``self`` unchanged.

        Returns
        -------
        CommonData
            A copy of ``self`` with ``ndata`` and ``commondata_table``
            restricted to the data points passing the cuts.

        Raises
        ------
        ValueError
            If ``cuts`` carries a dataset name that does not match this
            commondata's ``setname``.
        """
        # Ensure that the cuts we're applying applies to this dataset
        # only check, however, if the cuts is of type :py:class:`validphys.core.Cuts`
        if hasattr(cuts, 'name') and self.setname != cuts.name:
            raise ValueError(f"The cuts provided are for {cuts.name} which does not apply "
                             f"to this commondata file: {self.setname}")

        if hasattr(cuts, 'load'):
            cuts = cuts.load()
        if cuts is None:
            return self

        # We must shift the cuts up by 1 since a cut of 0 implies the first data point
        # while commondata indexing starts at 1.
        cuts = [cut + 1 for cut in cuts]

        newndata = len(cuts)
        new_commondata_table = self.commondata_table.loc[cuts]
        return dataclasses.replace(
            self, ndata=newndata, commondata_table=new_commondata_table
        )

    @property
    def central_values(self):
        """Central data values, as the ``data`` column of the commondata table."""
        return self.commondata_table["data"]

    @property
    def stat_errors(self):
        """Statistical uncertainties, as the ``stat`` column of the commondata table."""
        return self.commondata_table["stat"]

    @property
    def sys_errors(self):
        """A dataframe of :py:class:`SystematicError` objects, one row per
        data point and one column per systematic, combining the additive and
        multiplicative columns with the type/name from the systype table."""
        sys_table = self.commondata_table.drop(
            columns=["process", "kin1", "kin2", "kin3", "data", "stat"]
        )
        table = [
            [
                SystematicError(
                    add=sys_table[f"sys.add.{j}"][i],
                    mult=sys_table[f"sys.mult.{j}"][i],
                    sys_type=self.systype_table["type"][j],
                    name=self.systype_table["name"][j],
                )
                for j in self.systype_table.index
            ]
            for i in self.commondata_table.index
        ]
        return pd.DataFrame(
            table,
            columns=[f"sys.{i}" for i in self.systype_table.index],
            index=self.commondata_table.index,
        )
59 changes: 59 additions & 0 deletions validphys2/src/validphys/tests/test_commondataparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest
import pandas as pd

from validphys.api import API
from validphys.commondataparser import load_commondata
from validphys.loader import FallbackLoader as Loader


def test_basic_commondata_loading():
    """Check commondata and systype parsing for a standard dataset and for
    a dataset with no systematics."""
    loader = Loader()
    cd = loader.check_commondata(setname="H1HERAF2B")
    res = load_commondata(cd)
    # Test commondata loading
    assert res.ndata == 12
    assert isinstance(res.commondata_table, pd.DataFrame)
    # Test systype loading
    assert res.nsys == 25
    assert isinstance(res.systype_table, pd.DataFrame)
    # Test a dataset with no systematics
    emptysyscd = loader.check_commondata(setname="CMSWCHARMRAT")
    emptysysres = load_commondata(emptysyscd)
    assert emptysysres.nsys == 0
    # `.empty` is already a bool; comparing with `is True` was redundant
    assert emptysysres.systype_table.empty


def test_commondata_with_cuts():
    """Check that ``CommonData.with_cuts`` behaves consistently for fit
    cuts, internal cuts, preloaded (raw list) cuts and no cuts, and that
    cuts belonging to a different dataset are rejected."""
    loader = Loader()
    setname = "NMC"

    cd = loader.check_commondata(setname=setname)
    loaded_cd = load_commondata(cd)

    fit_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname=setname)
    internal_cuts = loader.check_internal_cuts(
        cd, API.rules(theoryid=162, use_cuts="internal")
    )

    loaded_cd_fit_cuts = loaded_cd.with_cuts(fit_cuts)
    # We must do these - 1 subtractions due to the fact that cuts indexing
    # starts at 0 while commondata indexing starts at 1
    assert all(loaded_cd_fit_cuts.commondata_table.index - 1 == fit_cuts.load())
    assert all(loaded_cd_fit_cuts.sys_errors.index - 1 == fit_cuts.load())

    loaded_cd_internal_cuts = loaded_cd.with_cuts(internal_cuts)
    assert all(loaded_cd_internal_cuts.commondata_table.index - 1 == internal_cuts.load())

    loaded_cd_nocuts = loaded_cd.with_cuts(None)
    assert all(loaded_cd_nocuts.commondata_table.index == range(1, cd.ndata + 1))

    # BUG FIX: pass the preloaded (raw) cuts rather than the Cuts object,
    # otherwise the list-input code path of with_cuts is never exercised.
    preloaded_fit_cuts = fit_cuts.load()
    loaded_cd_preloaded_cuts = loaded_cd.with_cuts(preloaded_fit_cuts)
    assert all(loaded_cd_preloaded_cuts.commondata_table.index - 1 == preloaded_fit_cuts)

    assert all(loaded_cd.with_cuts([1, 2, 3]).commondata_table.index - 1 == [1, 2, 3])

    # Check that giving cuts for another dataset raises the correct ValueError exception
    bad_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname="NMCPD")
    with pytest.raises(ValueError):
        loaded_cd.with_cuts(bad_cuts)