Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 93 additions & 13 deletions validphys2/src/validphys/commondataparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,47 @@
This module implements parsers for commondata and systype files into useful
datastructures, contained in the :py:mod:`validphys.coredata` module, which are
not backed by C++ managed memory, and so they can be easily pickled and
interfaces with common Python libraries. The integration of these objects into
the codebase is currently work in progress, and at the moment this module
serves as a proof of concept.
interfaced with common Python libraries.

The validphys commondata structure is an instance of :py:class:`validphys.coredata.CommonData`
"""
import dataclasses
from operator import attrgetter
import logging

import pandas as pd

from validphys.core import peek_commondata_metadata
from validphys.coredata import CommonData

log = logging.getLogger(__name__)

# LaTeX labels for the three kinematic variables (kin1, kin2, kin3) of each
# process type, keyed by the process-type prefix matched in get_kinlabel_key.
# NOTE: the DIJET entry was broken LaTeX (missing $ delimiters and a
# nonexistent \m command); fixed to follow the convention of the other entries.
KINLABEL_LATEX = {
    "DIJET": ("$\\eta$", "$m_{1,2} (GeV)$", "$\\sqrt{s} (GeV)$"),
    "DIS": ("$x$", "$Q^2 (GeV^2)$", "$y$"),
    "DYP": ("$y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_JPT": ("$p_T (GeV)$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_JRAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_MLL": ("$M_{ll} (GeV)$", "$M_{ll}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_PT": ("$p_T (GeV)$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_PTRAP": ("$\\eta/y$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_RAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_MLL": ("$M_{ll} (GeV)$", "$M_{ll}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_PT": ("$p_T$ (GeV)", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_PTRAP": ("$\\eta/y$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_RAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HIG_RAP": ("$y$", "$M_H^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_MQQ": ("$M^{QQ} (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_PTQ": ("$p_T^Q (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_PTQQ": ("$p_T^{QQ} (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_YQ": ("$y^Q$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_YQQ": ("$y^{QQ}$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "INC": ("$0$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "JET": ("$\\eta$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "PHT": ("$\\eta_\\gamma$", "$E_{T,\\gamma}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "SIA": ("$z$", "$Q^2 (GeV^2)$", "$y$"),
}


def load_commondata(spec):
"""
Load the data corresponding to a CommonDataSpec object.
Expand Down Expand Up @@ -42,21 +72,21 @@ def parse_commondata(commondatafile, systypefile, setname):
and systype files.
"""
# First parse commondata file
commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None)
commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None)
# Remove NaNs
# TODO: replace commondata files with bad formatting
# Build header
commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
nsys = (commondatatable.shape[1] - len(commondataheader)) // 2
commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"]
nsys = (commondatatable.shape[1] - len(commondataheader)) // 2

commondataheader += ["ADD", "MULT"] * nsys
commondatatable.columns = commondataheader
commondatatable.set_index("entry", inplace=True)
ndata = len(commondatatable)
commondataproc = commondatatable["process"][1]
# Check for consistency with commondata metadata
cdmetadata = peek_commondata_metadata(commondatafile)
if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata):
cdmetadata = peek_commondata_metadata(commondatafile)
if (setname, nsys, ndata) != attrgetter("name", "nsys", "ndata")(cdmetadata):
raise ValueError("Commondata table information does not match metadata")

# Now parse the systype file
Expand All @@ -70,22 +100,72 @@ def parse_commondata(commondatafile, systypefile, setname):
nkin=3,
nsys=nsys,
commondata_table=commondatatable,
systype_table=systypetable
systype_table=systypetable,
)


def parse_systypes(systypefile):
    """Parse a systype file and return a pandas dataframe.

    Parameters
    ----------
    systypefile : path or file-like
        Systype file. The first line (the number of systematics) is skipped;
        the remaining rows are whitespace-separated ``sys_index type name``.

    Returns
    -------
    pd.DataFrame
        Table of systematics indexed by ``sys_index`` with columns ``type``
        and ``name``. Empty (but with the same columns) when the file
        contains no systematics.
    """
    systypeheader = ["sys_index", "type", "name"]
    try:
        systypetable = pd.read_csv(
            systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
        )
        # Drop fully/partially empty columns produced by trailing whitespace
        systypetable.dropna(axis="columns", inplace=True)
    # Some datasets e.g. CMSWCHARMRAT have no systematics
    except pd.errors.EmptyDataError:
        systypetable = pd.DataFrame(columns=systypeheader)

    systypetable.set_index("sys_index", inplace=True)

    return systypetable


@dataclasses.dataclass(frozen=True)
class CommonDataMetadata:
    """Contains metadata information about the data being read"""

    # Dataset name, as read from the commondata file header
    name: str
    # Number of systematic uncertainties
    nsys: int
    # Number of data points
    ndata: int
    # Process-type key (a key of KINLABEL_LATEX, resolved via get_kinlabel_key)
    process_type: str


def peek_commondata_metadata(commondatafilename):
    """Read only the header of a commondata file and return the corresponding
    :py:class:`CommonDataMetadata`, without parsing the full table.
    """
    with open(commondatafilename) as f:
        try:
            # First header line: "<name> <nsys> <ndata>"
            name, nsys_str, ndata_str = f.readline().split()
            # First data line: the process type is the second field
            process_type_str = f.readline().split()[1]
        except Exception:
            log.error(f"Error processing {commondatafilename}")
            raise

    return CommonDataMetadata(
        name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str)
    )
Comment thread
scarlehoff marked this conversation as resolved.


def get_plot_kinlabels(commondata):
    """Look up the tuple of LaTeX kinematic labels associated with the
    process type of the given commondata object."""
    return KINLABEL_LATEX[commondata.process_type]


def get_kinlabel_key(process_label):
    """
    Since there is no 1:1 correspondence between latex keys and GetProc,
    we match the longest key such that the proc label starts with it.
    """
    # Longest keys first, so the most specific label wins.
    candidates = sorted(KINLABEL_LATEX, key=len, reverse=True)
    matches = (key for key in candidates if process_label.startswith(key))
    try:
        return next(matches)
    except StopIteration as err:
        raise ValueError(
            "Could not find a set of kinematic "
            "variables matching the process %s Check the "
            "labels defined in commondata.cc. " % (process_label)
        ) from err
56 changes: 9 additions & 47 deletions validphys2/src/validphys/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
"""
from __future__ import generator_stop

from collections import namedtuple
import re
import enum
import functools
Expand All @@ -24,7 +23,7 @@
from reportengine.compat import yaml

from NNPDF import (LHAPDFSet as libNNPDF_LHAPDFSet,
CommonData,
CommonData as LegacyCommonData,
FKTable,
FKSet,
DataSet,
Expand All @@ -41,6 +40,9 @@
from validphys.lhapdfset import LHAPDFSet
from validphys.fkparser import load_fktable
from validphys.pineparser import pineappl_reader
from validphys.commondataparser import (peek_commondata_metadata,
get_plot_kinlabels,
parse_commondata,)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -234,46 +236,6 @@ def get_members(self):
return len(self)


kinlabels_latex = CommonData.kinLabel_latex.asdict()
_kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True)


def get_plot_kinlabels(commondata):
"""Return the LaTex kinematic labels for a given Commondata"""
key = commondata.process_type

return kinlabels_latex[key]

def get_kinlabel_key(process_label):
#Since there is no 1:1 correspondence between latex keys and GetProc,
#we match the longest key such that the proc label starts with it.
l = process_label
try:
return next(k for k in _kinlabels_keys if l.startswith(k))
except StopIteration as e:
raise ValueError("Could not find a set of kinematic "
"variables matching the process %s Check the "
"labels defined in commondata.cc. " % (l)) from e

CommonDataMetadata = namedtuple('CommonDataMetadata', ('name', 'nsys', 'ndata', 'process_type'))

def peek_commondata_metadata(commondatafilename):
"""Check some basic properties commondata object without going though the
trouble of processing it on the C++ side"""
with open(commondatafilename) as f:
try:
l = f.readline()
name, nsys_str, ndata_str = l.split()
l = f.readline()
process_type_str = l.split()[1]
except Exception:
log.error(f"Error processing {commondatafilename}")
raise

return CommonDataMetadata(name, int(nsys_str), int(ndata_str),
get_kinlabel_key(process_type_str))


class CommonDataSpec(TupleComp):
def __init__(self, datafile, sysfile, plotfiles, name=None, metadata=None):
self.datafile = datafile
Expand Down Expand Up @@ -312,9 +274,8 @@ def __iter__(self):
return iter((self.datafile, self.sysfile, self.plotfiles))

@functools.lru_cache()
def load(self)->CommonData:
#TODO: Use better path handling in python 3.6
return CommonData.ReadFile(str(self.datafile), str(self.sysfile))
def load(self):
return parse_commondata(self.datafile, self.sysfile, self.name)

@property
def plot_kinlabels(self):
Expand Down Expand Up @@ -472,7 +433,8 @@ def __init__(self, *, name, commondata, fkspecs, thspec, cuts,

@functools.lru_cache()
def load(self):
cd = self.commondata.load()
"""Load the libNNPDF version of the dataset"""
cd = LegacyCommonData.ReadFile(str(self.commondata.datafile), str(self.commondata.sysfile))
Comment thread
scarlehoff marked this conversation as resolved.

fktables = []
for p in self.fkspecs:
Expand Down Expand Up @@ -508,7 +470,7 @@ def load_commondata(self):
loaded_cuts = self.cuts.load()
if not (hasattr(loaded_cuts, '_full') and loaded_cuts._full):
intmask = [int(ele) for ele in loaded_cuts]
cd = CommonData(cd, intmask)
cd = cd.with_cuts(intmask)
return cd

def to_unweighted(self):
Expand Down
43 changes: 36 additions & 7 deletions validphys2/src/validphys/coredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import numpy as np
import pandas as pd

KIN_NAMES = ["kin1", "kin2", "kin3"]


@dataclasses.dataclass(eq=False)
class FKTableData:
Expand Down Expand Up @@ -175,6 +177,7 @@ def get_np_fktable(self):

return fktable


@dataclasses.dataclass(eq=False)
class CFactorData:
"""
Expand Down Expand Up @@ -218,15 +221,9 @@ class CommonData:
nkin : int
Number of kinematics specified

kinematics : list of str with length nkin
Kinematic variables kin1, kin2, kin3 ...

nsys : int
Number of systematics

sysid : list of str with length nsys
ID for systematic

commondata_table : pd.DataFrame
Pandas dataframe containing the commondata

Expand All @@ -235,6 +232,9 @@ class CommonData:
for each systematic alongside the uncertainty
type (ADD/MULT/RAND) and name
(CORR/UNCORR/THEORYCORR/SKIP)

systematics_table: pd.DataFrame
Panda dataframe containing the table of systematics
"""

setname: str
Expand All @@ -248,7 +248,7 @@ class CommonData:

def __post_init__(self):
    """Cache the systematics sub-table: every column of the commondata
    table except the process, kinematics, data and stat columns."""
    self.systematics_table = self.commondata_table.drop(
        columns=["process", "data", "stat"] + KIN_NAMES
    )

def with_cuts(self, cuts):
Expand Down Expand Up @@ -284,10 +284,20 @@ def with_cuts(self, cuts):
new_commondata_table = self.commondata_table.loc[cuts]
return dataclasses.replace(self, ndata=newndata, commondata_table=new_commondata_table)

@property
def kinematics(self):
    """Kinematic variables (the kin1, kin2, kin3 columns) as a DataFrame,
    one row per data point."""
    return self.commondata_table[KIN_NAMES]

def get_kintable(self):
    """Return the kinematics table as a plain numpy array."""
    return self.kinematics.values
Comment thread
scarlehoff marked this conversation as resolved.

@property
def central_values(self):
    """Central data values (the ``data`` column) as a pandas Series."""
    return self.commondata_table["data"]

def get_cv(self):
    """Return the central values as a plain numpy array."""
    return self.central_values.values
Comment thread
scarlehoff marked this conversation as resolved.

@property
def stat_errors(self):
    """Statistical uncertainties (the ``stat`` column) as a pandas Series."""
    return self.commondata_table["stat"]
Expand Down Expand Up @@ -353,3 +363,22 @@ def systematic_errors(self, central_values=None):
central_values = self.central_values.to_numpy()
converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100
return pd.concat((self.additive_errors, converted_mult_errors), axis=1)

def export(self, path):
    """Export the data and error types.
    Use the same format as libNNPDF:

    - a ``DATA_<setname>.dat`` file with the dataframe of accepted points,
      prefixed by a ``<setname> <nsys> <ndata>`` header line
    - a ``systypes/SYSTYPE_<setname>_DEFAULT.dat`` file with the error
      types, prefixed by the number of systematics

    Parameters
    ----------
    path : pathlib.Path
        Destination folder; a ``systypes`` subfolder is created if missing.
    """
    dat_path = path / f"DATA_{self.setname}.dat"
    sys_path = path / "systypes" / f"SYSTYPE_{self.setname}_DEFAULT.dat"
    sys_path.parent.mkdir(exist_ok=True)

    dat_string_raw = self.commondata_table.to_string(
        index=False, header=False, float_format="{:.8e}".format
    )
    header = f"{self.setname} {self.nsys} {self.ndata}"
    # libNNPDF numbers the entries from 1, so re-add an index column by hand
    dat_string = "\n".join(
        f" {i+1} {row}" for i, row in enumerate(dat_string_raw.split("\n"))
    )
    dat_path.write_text(f"{header}\n{dat_string}\n")

    sys_raw = self.systype_table.to_string(index=True, header=False, index_names=False)
    sys_path.write_text(f"{self.nsys}\n{sys_raw}\n")
Comment thread
scarlehoff marked this conversation as resolved.
8 changes: 5 additions & 3 deletions validphys2/src/validphys/dataplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from validphys.plotoptions import get_info, kitable, transform_result
from validphys import plotutils
from validphys.utils import sane_groupby_iter, split_ranges, scale_from_grid
from validphys.coredata import KIN_NAMES

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -960,12 +961,13 @@ def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_k
ax.axhline(0, color='red')

posset = posdataset.load_commondata()
ndata = posset.GetNData()
ndata = posset.ndata
xvals = []

if pos_use_kin:
ax.set_xlabel('kin1')
xvals = [posset.GetKinematics(i, 0) for i in range(0, ndata)]
kin_name = KIN_NAMES[0]
ax.set_xlabel(kin_name)
xvals = posset.kinematics[kin_name].values
else:
ax.set_xlabel('idat')
xvals = np.arange(ndata)
Expand Down
Loading