Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 93 additions & 13 deletions validphys2/src/validphys/commondataparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,47 @@
This module implements parsers for commondata and systype files into useful
datastructures, contained in the :py:mod:`validphys.coredata` module, which are
not backed by C++ managed memory, and so they can be easily pickled and
interfaces with common Python libraries. The integration of these objects into
the codebase is currently work in progress, and at the moment this module
serves as a proof of concept.
interfaced with common Python libraries.

The validphys commondata structure is an instance of :py:class:`validphys.coredata.CommonData`
"""
import dataclasses
from operator import attrgetter
import logging

import pandas as pd

from validphys.core import peek_commondata_metadata
from validphys.coredata import CommonData

log = logging.getLogger(__name__)

# LaTeX labels for the three kinematic variables (kin1, kin2, kin3) of each
# process type, keyed by the process-type prefix matched in get_kinlabel_key.
# NOTE: the DIJET entry was broken LaTeX (missing $ delimiters and a
# nonexistent \m command); fixed to follow the convention of the other entries.
KINLABEL_LATEX = {
    "DIJET": ("$\\eta$", "$m_{1,2} (GeV)$", "$\\sqrt{s} (GeV)$"),
    "DIS": ("$x$", "$Q^2 (GeV^2)$", "$y$"),
    "DYP": ("$y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_JPT": ("$p_T (GeV)$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_JRAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_MLL": ("$M_{ll} (GeV)$", "$M_{ll}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_PT": ("$p_T (GeV)$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_PTRAP": ("$\\eta/y$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWJ_RAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_MLL": ("$M_{ll} (GeV)$", "$M_{ll}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_PT": ("$p_T$ (GeV)", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_PTRAP": ("$\\eta/y$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "EWK_RAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HIG_RAP": ("$y$", "$M_H^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_MQQ": ("$M^{QQ} (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_PTQ": ("$p_T^Q (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_PTQQ": ("$p_T^{QQ} (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_YQ": ("$y^Q$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "HQP_YQQ": ("$y^{QQ}$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "INC": ("$0$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "JET": ("$\\eta$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "PHT": ("$\\eta_\\gamma$", "$E_{T,\\gamma}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
    "SIA": ("$z$", "$Q^2 (GeV^2)$", "$y$"),
}


def load_commondata(spec):
"""
Load the data corresponding to a CommonDataSpec object.
Expand Down Expand Up @@ -42,21 +72,21 @@ def parse_commondata(commondatafile, systypefile, setname):
and systype files.
"""
# First parse commondata file
commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None)
commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None)
# Remove NaNs
# TODO: replace commondata files with bad formatting
# Build header
commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
nsys = (commondatatable.shape[1] - len(commondataheader)) // 2
commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"]
nsys = (commondatatable.shape[1] - len(commondataheader)) // 2

commondataheader += ["ADD", "MULT"] * nsys
commondatatable.columns = commondataheader
commondatatable.set_index("entry", inplace=True)
ndata = len(commondatatable)
commondataproc = commondatatable["process"][1]
# Check for consistency with commondata metadata
cdmetadata = peek_commondata_metadata(commondatafile)
if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata):
cdmetadata = peek_commondata_metadata(commondatafile)
if (setname, nsys, ndata) != attrgetter("name", "nsys", "ndata")(cdmetadata):
raise ValueError("Commondata table information does not match metadata")

# Now parse the systype file
Expand All @@ -70,22 +100,72 @@ def parse_commondata(commondatafile, systypefile, setname):
nkin=3,
nsys=nsys,
commondata_table=commondatatable,
systype_table=systypetable
systype_table=systypetable,
)


def parse_systypes(systypefile):
    """Parse a systype file and return a pandas dataframe.

    Parameters
    ----------
    systypefile : path or file-like
        Systype file. The first line (the number of systematics) is skipped;
        the remaining rows are whitespace-separated ``sys_index type name``.

    Returns
    -------
    pd.DataFrame
        Table of systematics indexed by ``sys_index`` with columns ``type``
        and ``name``. Empty (but with the same columns) when the file
        contains no systematics.
    """
    systypeheader = ["sys_index", "type", "name"]
    try:
        systypetable = pd.read_csv(
            systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
        )
        # Drop fully/partially empty columns produced by trailing whitespace
        systypetable.dropna(axis="columns", inplace=True)
    # Some datasets e.g. CMSWCHARMRAT have no systematics
    except pd.errors.EmptyDataError:
        systypetable = pd.DataFrame(columns=systypeheader)

    systypetable.set_index("sys_index", inplace=True)

    return systypetable


@dataclasses.dataclass(frozen=True)
class CommonDataMetadata:
    """Contains metadata information about the data being read"""

    # Dataset name, as read from the commondata file header
    name: str
    # Number of systematic uncertainties
    nsys: int
    # Number of data points
    ndata: int
    # Process-type key (a key of KINLABEL_LATEX, resolved via get_kinlabel_key)
    process_type: str


def peek_commondata_metadata(commondatafilename):
    """Read only the header of a commondata file and return the corresponding
    :py:class:`CommonDataMetadata`, without parsing the full table.
    """
    with open(commondatafilename) as f:
        try:
            # First header line: "<name> <nsys> <ndata>"
            name, nsys_str, ndata_str = f.readline().split()
            # First data line: the process type is the second field
            process_type_str = f.readline().split()[1]
        except Exception:
            log.error(f"Error processing {commondatafilename}")
            raise

    return CommonDataMetadata(
        name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str)
    )
Comment thread
scarlehoff marked this conversation as resolved.


def get_plot_kinlabels(commondata):
    """Look up the tuple of LaTeX kinematic labels associated with the
    process type of the given commondata object."""
    return KINLABEL_LATEX[commondata.process_type]


def get_kinlabel_key(process_label):
    """
    Since there is no 1:1 correspondence between latex keys and GetProc,
    we match the longest key such that the proc label starts with it.
    """
    # Longest keys first, so the most specific label wins.
    candidates = sorted(KINLABEL_LATEX, key=len, reverse=True)
    matches = (key for key in candidates if process_label.startswith(key))
    try:
        return next(matches)
    except StopIteration as err:
        raise ValueError(
            "Could not find a set of kinematic "
            "variables matching the process %s Check the "
            "labels defined in commondata.cc. " % (process_label)
        ) from err
56 changes: 9 additions & 47 deletions validphys2/src/validphys/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
"""
from __future__ import generator_stop

from collections import namedtuple
import re
import enum
import functools
Expand All @@ -24,7 +23,7 @@
from reportengine.compat import yaml

from NNPDF import (LHAPDFSet as libNNPDF_LHAPDFSet,
CommonData,
CommonData as LegacyCommonData,
FKTable,
FKSet,
DataSet,
Expand All @@ -41,6 +40,9 @@
from validphys.lhapdfset import LHAPDFSet
from validphys.fkparser import load_fktable
from validphys.pineparser import pineappl_reader
from validphys.commondataparser import (peek_commondata_metadata,
get_plot_kinlabels,
parse_commondata,)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -234,46 +236,6 @@ def get_members(self):
return len(self)


kinlabels_latex = CommonData.kinLabel_latex.asdict()
_kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True)


def get_plot_kinlabels(commondata):
"""Return the LaTex kinematic labels for a given Commondata"""
key = commondata.process_type

return kinlabels_latex[key]

def get_kinlabel_key(process_label):
#Since there is no 1:1 correspondence between latex keys and GetProc,
#we match the longest key such that the proc label starts with it.
l = process_label
try:
return next(k for k in _kinlabels_keys if l.startswith(k))
except StopIteration as e:
raise ValueError("Could not find a set of kinematic "
"variables matching the process %s Check the "
"labels defined in commondata.cc. " % (l)) from e

CommonDataMetadata = namedtuple('CommonDataMetadata', ('name', 'nsys', 'ndata', 'process_type'))

def peek_commondata_metadata(commondatafilename):
"""Check some basic properties commondata object without going though the
trouble of processing it on the C++ side"""
with open(commondatafilename) as f:
try:
l = f.readline()
name, nsys_str, ndata_str = l.split()
l = f.readline()
process_type_str = l.split()[1]
except Exception:
log.error(f"Error processing {commondatafilename}")
raise

return CommonDataMetadata(name, int(nsys_str), int(ndata_str),
get_kinlabel_key(process_type_str))


class CommonDataSpec(TupleComp):
def __init__(self, datafile, sysfile, plotfiles, name=None, metadata=None):
self.datafile = datafile
Expand Down Expand Up @@ -312,9 +274,8 @@ def __iter__(self):
return iter((self.datafile, self.sysfile, self.plotfiles))

@functools.lru_cache()
def load(self)->CommonData:
#TODO: Use better path handling in python 3.6
return CommonData.ReadFile(str(self.datafile), str(self.sysfile))
def load(self):
return parse_commondata(self.datafile, self.sysfile, self.name)

@property
def plot_kinlabels(self):
Expand Down Expand Up @@ -472,7 +433,8 @@ def __init__(self, *, name, commondata, fkspecs, thspec, cuts,

@functools.lru_cache()
def load(self):
cd = self.commondata.load()
"""Load the libNNPDF version of the dataset"""
cd = LegacyCommonData.ReadFile(str(self.commondata.datafile), str(self.commondata.sysfile))
Comment thread
scarlehoff marked this conversation as resolved.

fktables = []
for p in self.fkspecs:
Expand Down Expand Up @@ -508,7 +470,7 @@ def load_commondata(self):
loaded_cuts = self.cuts.load()
if not (hasattr(loaded_cuts, '_full') and loaded_cuts._full):
intmask = [int(ele) for ele in loaded_cuts]
cd = CommonData(cd, intmask)
cd = cd.with_cuts(intmask)
return cd

def to_unweighted(self):
Expand Down
43 changes: 36 additions & 7 deletions validphys2/src/validphys/coredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import numpy as np
import pandas as pd

KIN_NAMES = ["kin1", "kin2", "kin3"]


@dataclasses.dataclass(eq=False)
class FKTableData:
Expand Down Expand Up @@ -175,6 +177,7 @@ def get_np_fktable(self):

return fktable


@dataclasses.dataclass(eq=False)
class CFactorData:
"""
Expand Down Expand Up @@ -218,15 +221,9 @@ class CommonData:
nkin : int
Number of kinematics specified

kinematics : list of str with length nkin
Kinematic variables kin1, kin2, kin3 ...

nsys : int
Number of systematics

sysid : list of str with length nsys
ID for systematic

commondata_table : pd.DataFrame
Pandas dataframe containing the commondata

Expand All @@ -235,6 +232,9 @@ class CommonData:
for each systematic alongside the uncertainty
type (ADD/MULT/RAND) and name
(CORR/UNCORR/THEORYCORR/SKIP)

systematics_table: pd.DataFrame
Panda dataframe containing the table of systematics
"""

setname: str
Expand All @@ -248,7 +248,7 @@ class CommonData:

def __post_init__(self):
    """Cache the systematics sub-table: every column of the commondata
    table except the process, kinematics, data and stat columns."""
    self.systematics_table = self.commondata_table.drop(
        columns=["process", "data", "stat"] + KIN_NAMES
    )

def with_cuts(self, cuts):
Expand Down Expand Up @@ -284,10 +284,20 @@ def with_cuts(self, cuts):
new_commondata_table = self.commondata_table.loc[cuts]
return dataclasses.replace(self, ndata=newndata, commondata_table=new_commondata_table)

@property
def kinematics(self):
    """Kinematic variables (the kin1, kin2, kin3 columns) as a DataFrame,
    one row per data point."""
    return self.commondata_table[KIN_NAMES]

def get_kintable(self):
    """Return the kinematics table as a plain numpy array."""
    return self.kinematics.values
Comment thread
scarlehoff marked this conversation as resolved.

@property
def central_values(self):
    """Central data values (the ``data`` column) as a pandas Series."""
    return self.commondata_table["data"]

def get_cv(self):
    """Return the central values as a plain numpy array."""
    return self.central_values.values
Comment thread
scarlehoff marked this conversation as resolved.

@property
def stat_errors(self):
    """Statistical uncertainties (the ``stat`` column) as a pandas Series."""
    return self.commondata_table["stat"]
Expand Down Expand Up @@ -353,3 +363,22 @@ def systematic_errors(self, central_values=None):
central_values = self.central_values.to_numpy()
converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100
return pd.concat((self.additive_errors, converted_mult_errors), axis=1)

def export(self, path):
    """Export the data and error types.
    Use the same format as libNNPDF:

    - a ``DATA_<setname>.dat`` file with the dataframe of accepted points,
      prefixed by a ``<setname> <nsys> <ndata>`` header line
    - a ``systypes/SYSTYPE_<setname>_DEFAULT.dat`` file with the error
      types, prefixed by the number of systematics

    Parameters
    ----------
    path : pathlib.Path
        Destination folder; a ``systypes`` subfolder is created if missing.
    """
    dat_path = path / f"DATA_{self.setname}.dat"
    sys_path = path / "systypes" / f"SYSTYPE_{self.setname}_DEFAULT.dat"
    sys_path.parent.mkdir(exist_ok=True)

    dat_string_raw = self.commondata_table.to_string(
        index=False, header=False, float_format="{:.8e}".format
    )
    header = f"{self.setname} {self.nsys} {self.ndata}"
    # libNNPDF numbers the entries from 1, so re-add an index column by hand
    dat_string = "\n".join(
        f" {i+1} {row}" for i, row in enumerate(dat_string_raw.split("\n"))
    )
    dat_path.write_text(f"{header}\n{dat_string}\n")

    sys_raw = self.systype_table.to_string(index=True, header=False, index_names=False)
    sys_path.write_text(f"{self.nsys}\n{sys_raw}\n")
Comment thread
scarlehoff marked this conversation as resolved.
8 changes: 5 additions & 3 deletions validphys2/src/validphys/dataplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from validphys.plotoptions import get_info, kitable, transform_result
from validphys import plotutils
from validphys.utils import sane_groupby_iter, split_ranges, scale_from_grid
from validphys.coredata import KIN_NAMES

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -960,12 +961,13 @@ def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_k
ax.axhline(0, color='red')

posset = posdataset.load_commondata()
ndata = posset.GetNData()
ndata = posset.ndata
xvals = []

if pos_use_kin:
ax.set_xlabel('kin1')
xvals = [posset.GetKinematics(i, 0) for i in range(0, ndata)]
kin_name = KIN_NAMES[0]
ax.set_xlabel(kin_name)
xvals = posset.kinematics[kin_name].values
else:
ax.set_xlabel('idat')
xvals = np.arange(ndata)
Expand Down
Loading