-
Notifications
You must be signed in to change notification settings - Fork 14
[WIP]: Python commondata parser #769
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4fbe489
d9b632b
6c0351c
a840f2c
2833664
08ba1d4
e31a47d
971c535
e99bcb3
3c702fb
3aa32e1
dd90cae
01ff097
7b38cba
0756b3b
2dac35c
64a9645
a1d4ad5
e457a6c
fa32bd7
1ac7492
26710ee
eda4a5d
b3ea44e
039b168
45749a0
05b87a9
d869c33
d6005fc
04a75b7
1c22b8f
a72813e
21b41ed
8bcd766
84088a5
5652209
583e5d8
4e1b6d3
7e7758b
0284526
c50394a
c3a40f3
d666c2b
21a9dfe
fa84a06
f0e8a03
c2312b3
1195554
d922ae0
0e2bf01
a92b92e
c11746e
5495638
a3bd404
78e2010
84ed62e
c89c33e
c5a39ec
e79bd2d
ad1ab6b
11c4b5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| """ | ||
| This module implements parsers for commondata and systype files into useful | ||
| datastructures, contained in the :py:mod:`validphys.coredata` module, which are | ||
| not backed by C++ managed memory, and so they can be easily pickled and | ||
| interfaces with common Python libraries. The integration of these objects into | ||
| the codebase is currently work in progress, and at the moment this module | ||
| serves as a proof of concept. | ||
| """ | ||
| from operator import attrgetter | ||
|
|
||
| import pandas as pd | ||
|
|
||
| from validphys.core import peek_commondata_metadata | ||
| from validphys.coredata import CommonData | ||
|
|
||
def load_commondata(spec):
    """Load the data corresponding to a CommonDataSpec object.

    Parameters
    ----------
    spec : validphys.core.CommonDataSpec
        Specification carrying the path of the commondata file
        (``spec.datafile``) and of the systype file (``spec.sysfile``).

    Returns
    -------
    commondata : validphys.coredata.CommonData
        An object containing the data and information from the commondata
        and systype files, as produced by :py:func:`parse_commondata`.
    """
    commondatafile = spec.datafile
    # Derive the set name from the file name: strip the 5-character
    # "DATA_" prefix and the 4-character ".dat" suffix.
    setname = commondatafile.name[5:-4]
    systypefile = spec.sysfile

    return parse_commondata(commondatafile, systypefile, setname)
|
|
||
|
|
||
def parse_commondata(commondatafile, systypefile, setname):
    """Parse a commondata file and a systype file into a CommonData.

    Parameters
    ----------
    commondatafile : file or path to file
        Whitespace-separated table of data points, kinematics and
        uncertainties, with a one-line header.
    systypefile : file or path to file
        Table describing the type and name of each systematic.
    setname : str
        Name of the dataset; checked for consistency against the
        commondata metadata.

    Returns
    -------
    commondata : CommonData
        An object containing the data and information from the commondata
        and systype files.

    Raises
    ------
    ValueError
        If the parsed table disagrees with the file metadata (set name,
        number of systematics or number of data points).
    """
    # First parse commondata file
    commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None)
    # Remove NaNs
    # TODO: replace commondata files with bad formatting
    # Build header: fixed columns followed by one additive and one
    # multiplicative column per systematic.
    commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
    nsys = (commondatatable.shape[1] - len(commondataheader)) // 2
    for i in range(nsys):
        commondataheader += [f"sys.add.{i+1}", f"sys.mult.{i+1}"]
    commondatatable.columns = commondataheader
    commondatatable.set_index("entry", inplace=True)
    ndata = len(commondatatable)
    # All rows carry the same process type; take it positionally from the
    # first row (safer than a label lookup, which assumes the entry index
    # starts at 1).
    commondataproc = commondatatable["process"].iloc[0]
    # Check for consistency with commondata metadata
    cdmetadata = peek_commondata_metadata(commondatafile)
    if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata):
        raise ValueError("Commondata table information does not match metadata")

    # Now parse the systype file
    systypetable = parse_systypes(systypefile)

    # Populate CommonData object
    return CommonData(
        setname=setname,
        ndata=ndata,
        commondataproc=commondataproc,
        nkin=3,
        nsys=nsys,
        commondata_table=commondatatable,
        systype_table=systypetable,
    )
|
|
||
def parse_systypes(systypefile):
    """Parse a systype file into a pandas DataFrame.

    Parameters
    ----------
    systypefile : file or path to file
        Whitespace-separated table with a one-line header, listing the
        systematic index, its type and its name.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``sys_index`` with columns ``type`` and ``name``;
        empty (but with the same columns) when the file has no entries.
    """
    systypeheader = ["sys_index", "type", "name"]
    try:
        table = pd.read_csv(
            systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
        )
        # Drop spurious all-NaN columns produced by badly formatted files.
        table.dropna(axis='columns', inplace=True)
    except pd.errors.EmptyDataError:
        # Some datasets e.g. CMSWCHARMRAT have no systematics
        table = pd.DataFrame(columns=systypeheader)

    return table.set_index("sys_index")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,7 @@ class FKTableData: | |
| xgrid : array, shape (nx) | ||
| The points in x at which the PDFs should be evaluated. | ||
|
|
||
| sigma : DataFrame | ||
| sigma : pd.DataFrame | ||
| For hadronic data, the columns are the indexes in the ``NfxNf`` list of | ||
| possible flavour combinations of two PDFs. The MultiIndex contains | ||
| three keys, the data index, an index into ``xgrid`` for the first PDF | ||
|
|
@@ -117,3 +117,126 @@ class CFactorData: | |
| description: str | ||
| central_value: np.array | ||
| uncertainty: np.array | ||
|
|
||
|
|
||
@dataclasses.dataclass(eq=False)
class SystematicError:
    """A single systematic uncertainty on one data point."""
    add: float     # additive uncertainty
    mult: float    # multiplicative uncertainty
    sys_type: str  # e.g. ADD
    name: str      # e.g. UNCORR

    def __repr__(self):
        # Custom repr: unlike the dataclass-generated one, string fields are
        # printed unquoted, which keeps tables of systematics readable.
        # NOTE: the second line was previously a plain (non-f) string, so the
        # placeholders were printed literally; both halves are f-strings now.
        return (f"{self.__class__.__name__}(add={self.add}, mult={self.mult}, "
                f"sys_type={self.sys_type}, name={self.name})")
|
|
||
|
|
||
@dataclasses.dataclass(eq=False)
class CommonData:
    """
    Data contained in Commondata files, relevant cuts applied.

    Parameters
    ----------
    setname : str
        Name of the dataset

    ndata : int
        Number of data points

    commondataproc : str
        Process type, one of 21 options

    nkin : int
        Number of kinematics specified

    nsys : int
        Number of systematics

    commondata_table : pd.DataFrame
        Pandas dataframe containing the commondata

    systype_table : pd.DataFrame
        Pandas dataframe containing the systype index
        for each systematic alongside the uncertainty
        type (ADD/MULT/RAND) and name
        (CORR/UNCORR/THEORYCORR/SKIP)
    """
    setname: str
    ndata: int
    commondataproc: str
    nkin: int
    nsys: int
    commondata_table: pd.DataFrame
    systype_table: pd.DataFrame

    def with_cuts(self, cuts):
        """Return a new CommonData keeping only data points which pass cuts.

        Cuts are zero-indexed into the data: if the first data point passes
        cuts, the first entry of ``cuts`` should be ``0``.

        Parameters
        ----------
        cuts : list or validphys.core.Cuts or None
            Zero-based positions of the data points to keep. ``None``
            applies no cuts and returns ``self`` unchanged.

        Returns
        -------
        CommonData
            A copy of ``self`` with ``ndata`` and ``commondata_table``
            restricted to the selected points.

        Raises
        ------
        ValueError
            If ``cuts`` carries a dataset ``name`` different from
            ``self.setname``.
        """
        # Ensure that the cuts we're applying apply to this dataset; this is
        # only checkable when cuts is a :py:class:`validphys.core.Cuts`,
        # which carries a ``name`` attribute.
        if hasattr(cuts, 'name') and self.setname != cuts.name:
            raise ValueError(f"The cuts provided are for {cuts.name} which does not apply "
                             f"to this commondata file: {self.setname}")

        if hasattr(cuts, 'load'):
            cuts = cuts.load()
        if cuts is None:
            return self

        # We must shift the cuts up by 1 since a cut of 0 implies the first
        # data point while commondata indexing starts at 1.
        cuts = [c + 1 for c in cuts]

        newndata = len(cuts)
        new_commondata_table = self.commondata_table.loc[cuts]
        return dataclasses.replace(
            self, ndata=newndata, commondata_table=new_commondata_table
        )

    @property
    def central_values(self):
        """Central data values, indexed by entry."""
        return self.commondata_table["data"]

    @property
    def stat_errors(self):
        """Statistical uncertainties, indexed by entry."""
        return self.commondata_table["stat"]

    @property
    def sys_errors(self):
        """DataFrame of :py:class:`SystematicError` objects, one row per
        data point and one ``sys.<j>`` column per systematic."""
        sys_table = self.commondata_table.drop(
            columns=["process", "kin1", "kin2", "kin3", "data", "stat"]
        )
        table = [
            [
                SystematicError(
                    add=sys_table[f"sys.add.{j}"][i],
                    mult=sys_table[f"sys.mult.{j}"][i],
                    sys_type=self.systype_table["type"][j],
                    name=self.systype_table["name"][j],
                )
                for j in self.systype_table.index
            ]
            for i in self.commondata_table.index
        ]
        return pd.DataFrame(
            table,
            columns=[f"sys.{i}" for i in self.systype_table.index],
            index=self.commondata_table.index,
        )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| import pytest | ||
| import pandas as pd | ||
|
|
||
| from validphys.api import API | ||
| from validphys.commondataparser import load_commondata | ||
| from validphys.loader import FallbackLoader as Loader | ||
|
|
||
|
|
||
def test_basic_commondata_loading():
    """Check that commondata and systype files load into dataframes,
    including for a dataset with zero systematics."""
    loader = Loader()
    spec = loader.check_commondata(setname="H1HERAF2B")
    loaded = load_commondata(spec)
    # Commondata table
    assert loaded.ndata == 12
    assert isinstance(loaded.commondata_table, pd.DataFrame)
    # Systype table
    assert loaded.nsys == 25
    assert isinstance(loaded.systype_table, pd.DataFrame)
    # A dataset with no systematics still loads, with an empty systype table
    nosys_spec = loader.check_commondata(setname="CMSWCHARMRAT")
    nosys_loaded = load_commondata(nosys_spec)
    assert nosys_loaded.nsys == 0
    assert nosys_loaded.systype_table.empty is True
|
|
||
|
|
||
def test_commondata_with_cuts():
    """Check that ``with_cuts`` restricts a loaded commondata consistently
    for fit cuts, internal cuts, preloaded cuts, plain lists and None."""
    loader = Loader()
    setname = "NMC"

    spec = loader.check_commondata(setname=setname)
    loaded_cd = load_commondata(spec)

    fit_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname=setname)
    internal_cuts = loader.check_internal_cuts(
        spec, API.rules(theoryid=162, use_cuts="internal")
    )

    # Commondata entries are 1-indexed while cuts are 0-indexed, hence the
    # - 1 shift in the comparisons below.
    with_fit_cuts = loaded_cd.with_cuts(fit_cuts)
    assert all(with_fit_cuts.commondata_table.index - 1 == fit_cuts.load())
    assert all(with_fit_cuts.sys_errors.index - 1 == fit_cuts.load())

    with_internal_cuts = loaded_cd.with_cuts(internal_cuts)
    assert all(with_internal_cuts.commondata_table.index - 1 == internal_cuts.load())

    with_no_cuts = loaded_cd.with_cuts(None)
    assert all(with_no_cuts.commondata_table.index == range(1, spec.ndata + 1))

    preloaded_cuts = fit_cuts.load()
    with_preloaded_cuts = loaded_cd.with_cuts(fit_cuts)
    assert all(with_preloaded_cuts.commondata_table.index - 1 == preloaded_cuts)

    assert all(loaded_cd.with_cuts([1, 2, 3]).commondata_table.index - 1 == [1, 2, 3])

    # Cuts belonging to a different dataset must raise ValueError
    mismatched_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname="NMCPD")
    with pytest.raises(ValueError):
        loaded_cd.with_cuts(mismatched_cuts)
Uh oh!
There was an error while loading. Please reload this page.