Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
4fbe489
copying code from PR476 for data loading
May 5, 2020
d9b632b
altering load to use new function
May 5, 2020
6c0351c
Changing dataset -> name in config
May 5, 2020
a840f2c
Importing pandas
May 5, 2020
2833664
data container for commondata
May 13, 2020
08ba1d4
moving coredata.py to correct loc
May 13, 2020
e31a47d
adding commondata parser script
May 13, 2020
971c535
move load_data to commondataparser
May 13, 2020
e99bcb3
remove pandas import from core
May 13, 2020
3c702fb
adding CommonDataInfo class
May 13, 2020
3aa32e1
reverting back to old behaviour in core
May 14, 2020
dd90cae
populating CommonData object
May 15, 2020
01ff097
removing space in core
May 15, 2020
7b38cba
separating structure into parse_commondata and load_commondata
May 15, 2020
0756b3b
changing structure of CommonData object
May 15, 2020
2dac35c
Update core.py
RosalynLP May 15, 2020
64a9645
searching for setname in file name
May 15, 2020
a1d4ad5
Merge branch 'python-commondata-parser' of https://github.com/NNPDF/n…
May 15, 2020
e457a6c
adding test for commondata parser
May 15, 2020
fa32bd7
changing name to dataset in config
RosalynLP May 20, 2020
1ac7492
reverting to old config behaviour
RosalynLP May 20, 2020
26710ee
adding class for SystypeData
May 25, 2020
eda4a5d
adding systypeinfo object
May 25, 2020
b3ea44e
parse systype files as well
May 25, 2020
039b168
removing info objects
May 25, 2020
45749a0
adding error message
May 25, 2020
05b87a9
test for systype loading
May 25, 2020
d869c33
fixing bug in string
May 25, 2020
d6005fc
renaming namedtuple
May 25, 2020
04a75b7
choosing sys_index as the index for systype table
May 25, 2020
1c22b8f
updating test
May 25, 2020
a72813e
Format with black
voisey May 27, 2020
21b41ed
Remove unnecessary imports
voisey May 27, 2020
8bcd766
amending review comments
May 27, 2020
84088a5
merge
May 27, 2020
5652209
Merge branch 'master' into python-commondata-parser
voisey May 27, 2020
583e5d8
Make comments and docstrings consistent
voisey May 27, 2020
4e1b6d3
Removing explanation of raises in docstrings
May 27, 2020
7e7758b
Removing blank lines
May 27, 2020
0284526
Adding some more properties
May 27, 2020
c50394a
Fix name of argument given to CommonData
voisey May 27, 2020
c3a40f3
Using pathlib to handle setname parsing
May 27, 2020
d666c2b
getting process type from 1st entry
May 27, 2020
21a9dfe
merging CommonData and SystypeData objects
May 28, 2020
fa84a06
Correcting typos
May 28, 2020
f0e8a03
Fixing tests
May 28, 2020
c2312b3
Adding systype table to CommonData class
May 28, 2020
1195554
Adding __repr__ and __str__ methods for pretty printing the systemati…
May 28, 2020
d922ae0
Adding a __repr__ method
Jun 6, 2020
0e2bf01
changing comments
May 29, 2020
a92b92e
Removing blank lines
May 29, 2020
c11746e
loading empty systematics as empty dataframe
May 29, 2020
5495638
comment explaining try/except for sys load
May 29, 2020
a3bd404
remove dropna line
Jun 1, 2020
78e2010
test for ds with no systematics
Jun 1, 2020
84ed62e
separating parsing of systype files
Jun 1, 2020
c89c33e
checking table info against metadata from peek_commondata_metadata
Jun 1, 2020
c5a39ec
Adding a with_cuts method
Jun 1, 2020
e79bd2d
Adding tests for with_cuts method
Jun 5, 2020
ad1ab6b
Correcting consistency check
Jun 6, 2020
11c4b5d
Incrementing cuts by 1
Jun 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions validphys2/src/validphys/commondataparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
This module implements parsers for commondata and systype files into useful
datastructures, contained in the :py:mod:`validphys.coredata` module, which are
not backed by C++ managed memory, and so they can be easily pickled and
interfaced with common Python libraries. The integration of these objects into
the codebase is currently work in progress, and at the moment this module
serves as a proof of concept.
"""
from operator import attrgetter

import pandas as pd

from validphys.core import peek_commondata_metadata
from validphys.coredata import CommonData

def load_commondata(spec):
    """Load the data corresponding to a CommonDataSpec object.

    Parameters
    ----------
    spec : validphys.core.CommonDataSpec
        Specification object holding the paths to the commondata file
        (``spec.datafile``) and the systype file (``spec.sysfile``).

    Returns
    -------
    commondata : validphys.coredata.CommonData
        An object containing the parsed commondata and systype tables.
    """
    commondatafile = spec.datafile
    # Commondata files follow the naming convention DATA_<SETNAME>.dat,
    # so strip the 5-character "DATA_" prefix and the ".dat" suffix to
    # recover the set name.
    setname = commondatafile.name[5:-4]  # DATA prefix and .dat suffix
    systypefile = spec.sysfile

    commondata = parse_commondata(commondatafile, systypefile, setname)

    return commondata


def parse_commondata(commondatafile, systypefile, setname):
    """Parse a commondata file and a systype file into a CommonData.

    Parameters
    ----------
    commondatafile : file or path to file
        The commondata file to be parsed.
    systypefile : file or path to file
        The systype file to be parsed.
    setname : str
        Name of the dataset, checked for consistency against the metadata
        embedded in the commondata file.

    Returns
    -------
    commondata : CommonData
        An object containing the data and information from the commondata
        and systype files.
    """
    # First parse commondata file
    commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None)
    # Remove NaNs
    # TODO: replace commondata files with bad formatting
    # Build header
    commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
    # Every systematic contributes two columns (additive and multiplicative),
    # so the remaining columns after the fixed header give nsys.
    nsys = (commondatatable.shape[1] - len(commondataheader)) // 2
    for i in range(nsys):
        commondataheader += [f"sys.add.{i+1}", f"sys.mult.{i+1}"]
    commondatatable.columns = commondataheader
    commondatatable.set_index("entry", inplace=True)
    ndata = len(commondatatable)
    # Process type is taken from the first data point (commondata indexing
    # starts at 1).
    commondataproc = commondatatable["process"][1]
    # Check for consistency with commondata metadata
    cdmetadata = peek_commondata_metadata(commondatafile)
    if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata):
        raise ValueError(
            "Commondata table information does not match metadata: "
            f"got (name, nsys, ndata)=({setname}, {nsys}, {ndata}), "
            f"metadata says ({cdmetadata.name}, {cdmetadata.nsys}, {cdmetadata.ndata})"
        )

    # Now parse the systype file
    systypetable = parse_systypes(systypefile)

    # Populate CommonData object
    return CommonData(
        setname=setname,
        ndata=ndata,
        commondataproc=commondataproc,
        nkin=3,
        nsys=nsys,
        commondata_table=commondatatable,
        systype_table=systypetable,
    )

def parse_systypes(systypefile):
    """Parse a systype file into a pandas DataFrame.

    Parameters
    ----------
    systypefile : file or path to file
        The systype file to be parsed.

    Returns
    -------
    systypetable : pd.DataFrame
        Dataframe indexed by ``sys_index`` with one row per systematic and
        columns ``type`` and ``name``. For datasets with no systematics an
        empty dataframe with the same structure is returned.
    """
    systypeheader = ["sys_index", "type", "name"]
    try:
        systypetable = pd.read_csv(
            systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
        )
        systypetable.dropna(axis='columns', inplace=True)
    # Some datasets e.g. CMSWCHARMRAT have no systematics
    except pd.errors.EmptyDataError:
        systypetable = pd.DataFrame(columns=systypeheader)

    systypetable.set_index("sys_index", inplace=True)

    return systypetable
125 changes: 124 additions & 1 deletion validphys2/src/validphys/coredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class FKTableData:
xgrid : array, shape (nx)
The points in x at which the PDFs should be evaluated.

sigma : DataFrame
sigma : pd.DataFrame
For hadronic data, the columns are the indexes in the ``NfxNf`` list of
possible flavour combinations of two PDFs. The MultiIndex contains
three keys, the data index, an index into ``xgrid`` for the first PDF
Expand Down Expand Up @@ -117,3 +117,126 @@ class CFactorData:
description: str
central_value: np.array
uncertainty: np.array


@dataclasses.dataclass(eq=False)
class SystematicError:
    """A single systematic uncertainty on one data point, pairing the
    additive and multiplicative values with the systematic's type and name
    taken from the systype table."""
    add: float
    mult: float
    sys_type: str  # e.g. ADD
    name: str  # e.g. UNCORR

    def __repr__(self):
        # BUG FIX: the second string fragment was missing the `f` prefix, so
        # the literal text "{self.sys_type}, name={self.name}" was emitted
        # instead of the field values (and the comma had no trailing space).
        return (
            f"{self.__class__.__name__}(add={self.add}, mult={self.mult}, "
            f"sys_type={self.sys_type}, name={self.name})"
        )


@dataclasses.dataclass(eq=False)
class CommonData:
    """
    Data contained in Commondata files, relevant cuts applied.

    Parameters
    ----------

    setname : str
        Name of the dataset

    ndata : int
        Number of data points

    commondataproc : str
        Process type, one of 21 options

    nkin : int
        Number of kinematics specified

    nsys : int
        Number of systematics

    commondata_table : pd.DataFrame
        Pandas dataframe containing the commondata

    systype_table : pd.DataFrame
        Pandas dataframe containing the systype index
        for each systematic alongside the uncertainty
        type (ADD/MULT/RAND) and name
        (CORR/UNCORR/THEORYCORR/SKIP)
    """
    setname: str
    ndata: int
    commondataproc: str
    nkin: int
    nsys: int
    commondata_table: pd.DataFrame
    systype_table: pd.DataFrame

    def with_cuts(self, cuts):
        """Return a CommonData object where an integer mask has been
        applied, keeping only data points which pass cuts.

        Note if the first data point passes cuts, the first entry
        of ``cuts`` should be ``0``.

        Parameters
        ----------
        cuts : list or validphys.core.Cuts or None
            Zero-based indices of the data points to keep. ``None`` applies
            no cuts and returns ``self`` unchanged.

        Returns
        -------
        CommonData
            A copy of ``self`` with ``ndata`` and ``commondata_table``
            restricted to the data points passing the cuts.

        Raises
        ------
        ValueError
            If ``cuts`` carries a dataset name that does not match this
            commondata's ``setname``.
        """
        # Ensure that the cuts we're applying applies to this dataset
        # only check, however, if the cuts is of type :py:class:`validphys.core.Cuts`
        if hasattr(cuts, 'name') and self.setname != cuts.name:
            raise ValueError(f"The cuts provided are for {cuts.name} which does not apply "
                             f"to this commondata file: {self.setname}")

        if hasattr(cuts, 'load'):
            cuts = cuts.load()
        if cuts is None:
            return self

        # We must shift the cuts up by 1 since a cut of 0 implies the first data point
        # while commondata indexing starts at 1.
        cuts = [cut + 1 for cut in cuts]

        newndata = len(cuts)
        new_commondata_table = self.commondata_table.loc[cuts]
        return dataclasses.replace(
            self, ndata=newndata, commondata_table=new_commondata_table
        )

    @property
    def central_values(self):
        """Central data values, as the ``data`` column of the commondata table."""
        return self.commondata_table["data"]

    @property
    def stat_errors(self):
        """Statistical uncertainties, as the ``stat`` column of the commondata table."""
        return self.commondata_table["stat"]

    @property
    def sys_errors(self):
        """A dataframe of :py:class:`SystematicError` objects, one row per
        data point and one column per systematic, combining the additive and
        multiplicative columns with the type/name from the systype table."""
        sys_table = self.commondata_table.drop(
            columns=["process", "kin1", "kin2", "kin3", "data", "stat"]
        )
        table = [
            [
                SystematicError(
                    add=sys_table[f"sys.add.{j}"][i],
                    mult=sys_table[f"sys.mult.{j}"][i],
                    sys_type=self.systype_table["type"][j],
                    name=self.systype_table["name"][j],
                )
                for j in self.systype_table.index
            ]
            for i in self.commondata_table.index
        ]
        return pd.DataFrame(
            table,
            columns=[f"sys.{i}" for i in self.systype_table.index],
            index=self.commondata_table.index,
        )
59 changes: 59 additions & 0 deletions validphys2/src/validphys/tests/test_commondataparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest
import pandas as pd

from validphys.api import API
from validphys.commondataparser import load_commondata
from validphys.loader import FallbackLoader as Loader


def test_basic_commondata_loading():
    """Check commondata and systype parsing for a standard dataset and for
    a dataset with no systematics."""
    loader = Loader()
    cd = loader.check_commondata(setname="H1HERAF2B")
    res = load_commondata(cd)
    # Test commondata loading
    assert res.ndata == 12
    assert isinstance(res.commondata_table, pd.DataFrame)
    # Test systype loading
    assert res.nsys == 25
    assert isinstance(res.systype_table, pd.DataFrame)
    # Test a dataset with no systematics
    emptysyscd = loader.check_commondata(setname="CMSWCHARMRAT")
    emptysysres = load_commondata(emptysyscd)
    assert emptysysres.nsys == 0
    # `.empty` is already a bool; comparing with `is True` was redundant
    assert emptysysres.systype_table.empty


def test_commondata_with_cuts():
    """Check that ``CommonData.with_cuts`` behaves consistently for fit
    cuts, internal cuts, preloaded (raw list) cuts and no cuts, and that
    cuts belonging to a different dataset are rejected."""
    loader = Loader()
    setname = "NMC"

    cd = loader.check_commondata(setname=setname)
    loaded_cd = load_commondata(cd)

    fit_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname=setname)
    internal_cuts = loader.check_internal_cuts(
        cd, API.rules(theoryid=162, use_cuts="internal")
    )

    loaded_cd_fit_cuts = loaded_cd.with_cuts(fit_cuts)
    # We must do these - 1 subtractions due to the fact that cuts indexing
    # starts at 0 while commondata indexing starts at 1
    assert all(loaded_cd_fit_cuts.commondata_table.index - 1 == fit_cuts.load())
    assert all(loaded_cd_fit_cuts.sys_errors.index - 1 == fit_cuts.load())

    loaded_cd_internal_cuts = loaded_cd.with_cuts(internal_cuts)
    assert all(loaded_cd_internal_cuts.commondata_table.index - 1 == internal_cuts.load())

    loaded_cd_nocuts = loaded_cd.with_cuts(None)
    assert all(loaded_cd_nocuts.commondata_table.index == range(1, cd.ndata + 1))

    # BUG FIX: pass the preloaded (raw) cuts rather than the Cuts object,
    # otherwise the list-input code path of with_cuts is never exercised.
    preloaded_fit_cuts = fit_cuts.load()
    loaded_cd_preloaded_cuts = loaded_cd.with_cuts(preloaded_fit_cuts)
    assert all(loaded_cd_preloaded_cuts.commondata_table.index - 1 == preloaded_fit_cuts)

    assert all(loaded_cd.with_cuts([1, 2, 3]).commondata_table.index - 1 == [1, 2, 3])

    # Check that giving cuts for another dataset raises the correct ValueError exception
    bad_cuts = loader.check_fit_cuts(fit="191015-mw-001", setname="NMCPD")
    with pytest.raises(ValueError):
        loaded_cd.with_cuts(bad_cuts)