def load_commondata(spec):
    """Load the data corresponding to a CommonDataSpec object.

    Returns an instance of CommonData.
    """
    datafile = spec.datafile
    # The set name is recovered from the file name by stripping the
    # "DATA_" prefix (5 chars) and the ".dat" suffix (4 chars).
    setname = datafile.name[5:-4]
    return parse_commondata(datafile, spec.sysfile, setname)


def parse_commondata(commondatafile, systypefile, setname):
    """Parse a commondata file and a systype file into a CommonData.

    Parameters
    ----------
    commondatafile : file or path to file
    systypefile : file or path to file

    Returns
    -------
    commondata : CommonData
        An object containing the data and information from the commondata
        and systype files.
    """
    # Parse the commondata table itself; the first line is metadata.
    # TODO: replace commondata files with bad formatting
    data_table = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None)

    # The leading columns are fixed; every systematic then contributes an
    # additive and a multiplicative column, in that order.
    fixed_columns = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
    nsys = (data_table.shape[1] - len(fixed_columns)) // 2
    sys_columns = [
        f"sys.{kind}.{i}" for i in range(1, nsys + 1) for kind in ("add", "mult")
    ]
    data_table.columns = fixed_columns + sys_columns
    data_table = data_table.set_index("entry")

    ndata = len(data_table)
    # Commondata entries are 1-indexed, so label 1 is the first data point.
    commondataproc = data_table["process"].loc[1]

    # Cross-check the parsed table against the commondata metadata header.
    metadata = peek_commondata_metadata(commondatafile)
    if (metadata.name, metadata.nsys, metadata.ndata) != (setname, nsys, ndata):
        raise ValueError("Commondata table information does not match metadata")

    systype_table = parse_systypes(systypefile)

    return CommonData(
        setname=setname,
        ndata=ndata,
        commondataproc=commondataproc,
        nkin=3,
        nsys=nsys,
        commondata_table=data_table,
        systype_table=systype_table,
    )


def parse_systypes(systypefile):
    """Parse a systype file and return a pandas DataFrame indexed by sys_index."""
    header = ["sys_index", "type", "name"]
    try:
        table = pd.read_csv(
            systypefile, sep=r"\s+", names=header, skiprows=1, header=None
        )
        # Badly formatted files can produce spurious all-NaN columns; drop them.
        table = table.dropna(axis="columns")
    except pd.errors.EmptyDataError:
        # Some datasets, e.g. CMSWCHARMRAT, have no systematics at all.
        table = pd.DataFrame(columns=header)

    return table.set_index("sys_index")
@dataclasses.dataclass(eq=False)
class SystematicError:
    """A single systematic uncertainty attached to one data point.

    Combines the additive and multiplicative values from the commondata
    table with the treatment type and correlation name from the systype
    table.
    """
    add: float       # additive component of the uncertainty
    mult: float      # multiplicative component of the uncertainty
    sys_type: str    # treatment type, e.g. ADD
    name: str        # correlation name, e.g. UNCORR

    def __repr__(self):
        # Both halves of the message must be f-strings; previously the second
        # literal lacked the f-prefix and printed the placeholders verbatim.
        return (
            f"{self.__class__.__name__}(add={self.add}, mult={self.mult}, "
            f"sys_type={self.sys_type}, name={self.name})"
        )
@dataclasses.dataclass(eq=False)
class CommonData:
    """
    Data contained in Commondata files, relevant cuts applied.

    Parameters
    ----------
    setname : str
        Name of the dataset

    ndata : int
        Number of data points

    commondataproc : str
        Process type, one of 21 options

    nkin : int
        Number of kinematics specified

    kinematics : list of str with length nkin
        Kinematic variables kin1, kin2, kin3 ...

    nsys : int
        Number of systematics

    sysid : list of str with length nsys
        ID for systematic

    commondata_table : pd.DataFrame
        Pandas dataframe containing the commondata

    systype_table : pd.DataFrame
        Pandas dataframe containing the systype index
        for each systematic alongside the uncertainty
        type (ADD/MULT/RAND) and name
        (CORR/UNCORR/THEORYCORR/SKIP)
    """
    setname: str
    ndata: int
    commondataproc: str
    nkin: int
    nsys: int
    commondata_table: pd.DataFrame
    systype_table: pd.DataFrame

    def with_cuts(self, cuts):
        """Return a new CommonData keeping only data points which pass cuts.

        Note if the first data point passes cuts, the first entry of ``cuts``
        should be ``0``.

        Parameters
        ----------
        cuts : list or validphys.core.Cuts or None
            Zero-based indices of the data points to keep; ``None`` keeps all.

        Raises
        ------
        ValueError
            If the cuts carry a ``name`` that does not match this dataset.
        """
        # Ensure that the cuts we're applying apply to this dataset; only
        # checkable when cuts is of type :py:class:`validphys.core.Cuts`.
        if hasattr(cuts, 'name') and self.setname != cuts.name:
            raise ValueError(f"The cuts provided are for {cuts.name} which does not apply "
                             f"to this commondata file: {self.setname}")

        if hasattr(cuts, 'load'):
            cuts = cuts.load()
        if cuts is None:
            return self

        # Shift the cuts up by 1 since a cut of 0 implies the first data point
        # while commondata indexing starts at 1.
        cuts = [c + 1 for c in cuts]

        newndata = len(cuts)
        new_commondata_table = self.commondata_table.loc[cuts]
        return dataclasses.replace(
            self, ndata=newndata, commondata_table=new_commondata_table
        )

    @property
    def central_values(self):
        """Central data values (the ``data`` column)."""
        return self.commondata_table["data"]

    @property
    def stat_errors(self):
        """Statistical uncertainties (the ``stat`` column)."""
        return self.commondata_table["stat"]

    @property
    def sys_errors(self):
        """DataFrame of SystematicError objects, one column per systematic."""
        sys_table = self.commondata_table.drop(
            columns=["process", "kin1", "kin2", "kin3", "data", "stat"]
        )
        table = [
            [
                SystematicError(
                    add=sys_table[f"sys.add.{j}"][i],
                    mult=sys_table[f"sys.mult.{j}"][i],
                    sys_type=self.systype_table["type"][j],
                    name=self.systype_table["name"][j],
                )
                for j in self.systype_table.index
            ]
            for i in self.commondata_table.index
        ]
        return pd.DataFrame(
            table,
            columns=[f"sys.{i}" for i in self.systype_table.index],
            index=self.commondata_table.index,
        )
def test_basic_commondata_loading():
    """Check that a commondata set and its systypes parse into DataFrames."""
    loader = Loader()
    # Ordinary dataset: metadata counters and table types must match.
    spec = loader.check_commondata(setname="H1HERAF2B")
    loaded = load_commondata(spec)
    assert loaded.ndata == 12
    assert isinstance(loaded.commondata_table, pd.DataFrame)
    assert loaded.nsys == 25
    assert isinstance(loaded.systype_table, pd.DataFrame)
    # A dataset without systematics must produce an empty systype table.
    empty_spec = loader.check_commondata(setname="CMSWCHARMRAT")
    empty_res = load_commondata(empty_spec)
    assert empty_res.nsys == 0
    assert empty_res.systype_table.empty is True
def test_commondata_with_cuts():
    """Check fit, internal, preloaded, list and mismatched cuts on NMC."""
    loader = Loader()
    setname = "NMC"

    spec = loader.check_commondata(setname=setname)
    loaded = load_commondata(spec)

    fit_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname=setname)
    internal_cuts = loader.check_internal_cuts(
        spec, API.rules(theoryid=162, use_cuts="internal")
    )

    # The -1 shifts below account for cuts indexing starting at 0 while
    # commondata indexing starts at 1.
    with_fit_cuts = loaded.with_cuts(fit_cuts)
    assert all(with_fit_cuts.commondata_table.index - 1 == fit_cuts.load())
    assert all(with_fit_cuts.sys_errors.index - 1 == fit_cuts.load())

    with_internal_cuts = loaded.with_cuts(internal_cuts)
    assert all(with_internal_cuts.commondata_table.index - 1 == internal_cuts.load())

    # No cuts: the full 1-based index must survive untouched.
    no_cuts = loaded.with_cuts(None)
    assert all(no_cuts.commondata_table.index == range(1, spec.ndata + 1))

    # Preloaded cuts must behave identically to the spec object.
    preloaded = fit_cuts.load()
    with_preloaded = loaded.with_cuts(fit_cuts)
    assert all(with_preloaded.commondata_table.index - 1 == preloaded)

    # Plain integer-list cuts are accepted as well.
    assert all(loaded.with_cuts([1, 2, 3]).commondata_table.index - 1 == [1, 2, 3])

    # Cuts belonging to a different dataset must raise ValueError.
    bad_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname="NMCPD")
    with pytest.raises(ValueError):
        loaded.with_cuts(bad_cuts)