From 6d1997cae898b41b8ad3d408046083b1489fd6ab Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@lairen.eu>
Date: Thu, 15 Dec 2022 15:46:18 +0100
Subject: [PATCH 1/7] use python commondata instead of libNNPDF

---
 validphys2/src/validphys/commondataparser.py  | 101 +++++++++++++++---
 validphys2/src/validphys/core.py              |  61 +++--------
 validphys2/src/validphys/coredata.py          |  43 ++++++--
 validphys2/src/validphys/filters.py           |  54 +++++++---
 validphys2/src/validphys/fitdata.py           |   4 +-
 validphys2/src/validphys/mc_gen.py            |   1 +
 validphys2/src/validphys/plotoptions/core.py  |   3 +-
 validphys2/src/validphys/tests/test_loader.py |   4 +-
 .../src/validphys/tests/test_weights.py       |  12 ++-
 9 files changed, 195 insertions(+), 88 deletions(-)

diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py
index ab2cbaf8c5..bf6966b13d 100644
--- a/validphys2/src/validphys/commondataparser.py
+++ b/validphys2/src/validphys/commondataparser.py
@@ -2,17 +2,50 @@
 This module implements parsers for commondata  and systype files into useful
 datastructures, contained in the :py:mod:`validphys.coredata` module, which are
 not backed by C++ managed memory, and so they can be easily pickled and
-interfaces with common Python libraries.  The integration of these objects into
-the codebase is currently work in progress, and at the moment this module
-serves as a proof of concept.
+interfaces with common Python libraries. 
+
+The validphys commondata structure is an instance of :py:class:`validphys.coredata.CommonData`
 """
+from collections import namedtuple
 from operator import attrgetter
+import logging
 
 import pandas as pd
 
-from validphys.core import peek_commondata_metadata
 from validphys.coredata import CommonData
 
+log = logging.getLogger(__name__)
+
+kinlabels_latex = {
+    "DIJET": ("\\eta", "$\\m_{1,2} (GeV)", "$\\sqrt{s} (GeV)"),
+    "DIS": ("$x$", "$Q^2 (GeV^2)$", "$y$"),
+    "DYP": ("$y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWJ_JPT": ("$p_T (GeV)$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWJ_JRAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWJ_MLL": ("$M_{ll} (GeV)$", "$M_{ll}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWJ_PT": ("$p_T (GeV)$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWJ_PTRAP": ("$\\eta/y$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWJ_RAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWK_MLL": ("$M_{ll} (GeV)$", "$M_{ll}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWK_PT": ("$p_T$ (GeV)", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWK_PTRAP": ("$\\eta/y$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "EWK_RAP": ("$\\eta/y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "HIG_RAP": ("$y$", "$M_H^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "HQP_MQQ": ("$M^{QQ} (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "HQP_PTQ": ("$p_T^Q (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "HQP_PTQQ": ("$p_T^{QQ} (GeV)$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "HQP_YQ": ("$y^Q$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "HQP_YQQ": ("$y^{QQ}$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "INC": ("$0$", "$\\mu^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "JET": ("$\\eta$", "$p_T^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "PHT": ("$\\eta_\\gamma$", "$E_{T,\\gamma}^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
+    "SIA": ("$z$", "$Q^2 (GeV^2)$", "$y$"),
+}
+
+
+_kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True)
+
+
 def load_commondata(spec):
     """
     Load the data corresponding to a CommonDataSpec object.
@@ -42,12 +75,12 @@ def parse_commondata(commondatafile, systypefile, setname):
         and systype files.
     """
     # First parse commondata file
-    commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None)
+    commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None)
     # Remove NaNs
     # TODO: replace commondata files with bad formatting
     # Build header
-    commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat']
-    nsys  = (commondatatable.shape[1] - len(commondataheader)) // 2
+    commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"]
+    nsys = (commondatatable.shape[1] - len(commondataheader)) // 2
 
     commondataheader += ["ADD", "MULT"] * nsys
     commondatatable.columns = commondataheader
@@ -55,8 +88,8 @@ def parse_commondata(commondatafile, systypefile, setname):
     ndata = len(commondatatable)
     commondataproc = commondatatable["process"][1]
     # Check for consistency with commondata metadata
-    cdmetadata =  peek_commondata_metadata(commondatafile)
-    if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata):
+    cdmetadata = peek_commondata_metadata(commondatafile)
+    if (setname, nsys, ndata) != attrgetter("name", "nsys", "ndata")(cdmetadata):
         raise ValueError("Commondata table information does not match metadata")
 
     # Now parse the systype file
@@ -70,18 +103,18 @@ def parse_commondata(commondatafile, systypefile, setname):
         nkin=3,
         nsys=nsys,
         commondata_table=commondatatable,
-        systype_table=systypetable
+        systype_table=systypetable,
     )
 
+
 def parse_systypes(systypefile):
-    """Parses a systype file and returns a pandas dataframe.
-    """
+    """Parses a systype file and returns a pandas dataframe."""
     systypeheader = ["sys_index", "type", "name"]
     try:
         systypetable = pd.read_csv(
             systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
         )
-        systypetable.dropna(axis='columns', inplace=True)
+        systypetable.dropna(axis="columns", inplace=True)
     # Some datasets e.g. CMSWCHARMRAT have no systematics
     except pd.errors.EmptyDataError:
         systypetable = pd.DataFrame(columns=systypeheader)
@@ -89,3 +122,45 @@ def parse_systypes(systypefile):
     systypetable.set_index("sys_index", inplace=True)
 
     return systypetable
+
+
+CommonDataMetadata = namedtuple("CommonDataMetadata", ("name", "nsys", "ndata", "process_type"))
+
+
+def peek_commondata_metadata(commondatafilename):
+    """Check some basic properties commondata object without going though the
+    trouble of processing it on the C++ side"""
+    with open(commondatafilename) as f:
+        try:
+            l = f.readline()
+            name, nsys_str, ndata_str = l.split()
+            l = f.readline()
+            process_type_str = l.split()[1]
+        except Exception:
+            log.error(f"Error processing {commondatafilename}")
+            raise
+
+    return CommonDataMetadata(
+        name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str)
+    )
+
+
+def get_plot_kinlabels(commondata):
+    """Return the LaTex kinematic labels for a given Commondata"""
+    key = commondata.process_type
+
+    return kinlabels_latex[key]
+
+
+def get_kinlabel_key(process_label):
+    # Since there is no 1:1 correspondence between latex keys and GetProc,
+    # we match the longest key such that the proc label starts with it.
+    l = process_label
+    try:
+        return next(k for k in _kinlabels_keys if l.startswith(k))
+    except StopIteration as e:
+        raise ValueError(
+            "Could not find a set of kinematic "
+            "variables matching  the process %s Check the "
+            "labels defined in commondata.cc. " % (l)
+        ) from e
diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py
index 694135e3ae..98d5fe8990 100644
--- a/validphys2/src/validphys/core.py
+++ b/validphys2/src/validphys/core.py
@@ -8,7 +8,6 @@
 """
 from __future__ import generator_stop
 
-from collections import namedtuple
 import re
 import enum
 import functools
@@ -24,7 +23,7 @@
 from reportengine.compat import yaml
 
 from NNPDF import (LHAPDFSet as libNNPDF_LHAPDFSet,
-    CommonData,
+    CommonData as LegacyCommonData,
     FKTable,
     FKSet,
     DataSet,
@@ -41,6 +40,9 @@
 from validphys.lhapdfset import LHAPDFSet
 from validphys.fkparser import load_fktable
 from validphys.pineparser import pineappl_reader
+from validphys.commondataparser import (peek_commondata_metadata,
+    get_plot_kinlabels,
+    parse_commondata,)
 
 log = logging.getLogger(__name__)
 
@@ -234,46 +236,6 @@ def get_members(self):
         return len(self)
 
 
-kinlabels_latex = CommonData.kinLabel_latex.asdict()
-_kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True)
-
-
-def get_plot_kinlabels(commondata):
-    """Return the LaTex kinematic labels for a given Commondata"""
-    key = commondata.process_type
-
-    return kinlabels_latex[key]
-
-def get_kinlabel_key(process_label):
-    #Since there is no 1:1 correspondence between latex keys and GetProc,
-    #we match the longest key such that the proc label starts with it.
-    l = process_label
-    try:
-        return next(k for k in _kinlabels_keys if l.startswith(k))
-    except StopIteration as e:
-        raise ValueError("Could not find a set of kinematic "
-                         "variables matching  the process %s Check the "
-                         "labels defined in commondata.cc. " % (l)) from e
-
-CommonDataMetadata = namedtuple('CommonDataMetadata', ('name', 'nsys', 'ndata', 'process_type'))
-
-def peek_commondata_metadata(commondatafilename):
-    """Check some basic properties commondata object without going though the
-    trouble of processing it on the C++ side"""
-    with open(commondatafilename) as f:
-        try:
-            l = f.readline()
-            name, nsys_str, ndata_str = l.split()
-            l = f.readline()
-            process_type_str = l.split()[1]
-        except Exception:
-            log.error(f"Error processing {commondatafilename}")
-            raise
-
-    return CommonDataMetadata(name, int(nsys_str), int(ndata_str),
-                              get_kinlabel_key(process_type_str))
-
-
 class CommonDataSpec(TupleComp):
     def __init__(self, datafile, sysfile, plotfiles, name=None, metadata=None):
         self.datafile = datafile
@@ -312,9 +274,11 @@ def __iter__(self):
         return iter((self.datafile, self.sysfile, self.plotfiles))
 
     @functools.lru_cache()
-    def load(self)->CommonData:
-        #TODO: Use better path handling in python 3.6
-        return CommonData.ReadFile(str(self.datafile), str(self.sysfile))
+    def load(self):
+        cd = parse_commondata(self.datafile, self.sysfile, self.name)
+#         cd_old = LegacyCommonData.ReadFile(str(self.datafile), str(self.sysfile))
+#         cd.old = cd_old # DEBUG
+        return cd
 
     @property
     def plot_kinlabels(self):
@@ -472,7 +436,8 @@ def __init__(self, *, name, commondata, fkspecs, thspec, cuts,
 
     @functools.lru_cache()
     def load(self):
-        cd = self.commondata.load()
+        """Load the libNNPDF version of the dataset"""
+        cd = LegacyCommonData.ReadFile(str(self.commondata.datafile), str(self.commondata.sysfile))
 
         fktables = []
         for p in self.fkspecs:
@@ -508,7 +473,9 @@ def load_commondata(self):
             loaded_cuts = self.cuts.load()
             if not (hasattr(loaded_cuts, '_full') and loaded_cuts._full):
                 intmask = [int(ele) for ele in loaded_cuts]
-                cd = CommonData(cd, intmask)
+#                 cd_old = LegacyCommonData(cd.old, intmask)
+                cd = cd.with_cuts(intmask)
+#                 cd.old = cd_old # DEBUG
         return cd
 
     def to_unweighted(self):
diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py
index 5e73e3777f..b27b7ce657 100644
--- a/validphys2/src/validphys/coredata.py
+++ b/validphys2/src/validphys/coredata.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pandas as pd
 
+KINNAMES = ["kin1", "kin2", "kin3"]
+
 
 @dataclasses.dataclass(eq=False)
 class FKTableData:
@@ -175,6 +177,7 @@ def get_np_fktable(self):
 
         return fktable
 
+
 @dataclasses.dataclass(eq=False)
 class CFactorData:
     """
@@ -218,15 +221,9 @@ class CommonData:
     nkin : int
         Number of kinematics specified
 
-    kinematics : list of str with length nkin
-        Kinematic variables kin1, kin2, kin3 ...
-
     nsys : int
         Number of systematics
 
-    sysid : list of str with length nsys
-        ID for systematic
-
     commondata_table : pd.DataFrame
         Pandas dataframe containing the commondata
 
@@ -235,6 +232,9 @@ class CommonData:
         for each systematic alongside the uncertainty
         type (ADD/MULT/RAND) and name
         (CORR/UNCORR/THEORYCORR/SKIP)
+
+    systematics_table: pd.DataFrame
+        Pandas dataframe containing the table of systematics
     """
 
     setname: str
@@ -248,7 +248,7 @@ class CommonData:
 
     def __post_init__(self):
         self.systematics_table = self.commondata_table.drop(
-            columns=["process", "kin1", "kin2", "kin3", "data", "stat"]
+            columns=["process", "data", "stat"] + KINNAMES
         )
 
     def with_cuts(self, cuts):
@@ -284,10 +284,20 @@ def with_cuts(self, cuts):
         new_commondata_table = self.commondata_table.loc[cuts]
         return dataclasses.replace(self, ndata=newndata, commondata_table=new_commondata_table)
 
+    @property
+    def kinematics(self):
+        return self.commondata_table[KINNAMES]
+
+    def get_kintable(self):
+        return self.kinematics.values
+
     @property
     def central_values(self):
         return self.commondata_table["data"]
 
+    def get_cv(self):
+        return self.central_values.values
+
     @property
     def stat_errors(self):
         return self.commondata_table["stat"]
@@ -353,3 +363,22 @@ def systematic_errors(self, central_values=None):
             central_values = self.central_values.to_numpy()
         converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100
         return pd.concat((self.additive_errors, converted_mult_errors), axis=1)
+
+    def export(self, path):
+        """Export the data, and error types
+        Use the same format as libNNPDF:
+
+        - A DATA_<dataset>.dat file with the dataframe of accepted points
+        - A systypes/SYSTYPE_<dataset>_DEFAULT.dat file with the error types
+        """
+        dat_path = path / f"DATA_{self.setname}.dat"
+        sys_path = path / "systypes" / f"SYSTYPE_{self.setname}_DEFAULT.dat"
+        sys_path.parent.mkdir(exist_ok=True)
+
+        dat_string_raw = self.commondata_table.to_string(index=False, header=False, float_format="{:.8e}".format)
+        header = f"{self.setname}    {self.nsys} {self.ndata}"
+        dat_string = "\n".join([f" {i+1}    {r}" for i, r in enumerate(dat_string_raw.split("\n"))])
+        dat_path.write_text(f"{header}\n{dat_string}\n")
+
+        sys_raw = self.systype_table.to_string(index=True, header=False, index_names=False)
+        sys_path.write_text(f"{self.nsys}\n{sys_raw}\n")
diff --git a/validphys2/src/validphys/filters.py b/validphys2/src/validphys/filters.py
index 913f883e8b..6fc5b76f4f 100644
--- a/validphys2/src/validphys/filters.py
+++ b/validphys2/src/validphys/filters.py
@@ -9,13 +9,39 @@
 
 import numpy as np
 
-from NNPDF import CommonData
 from reportengine.checks import make_argcheck, check, check_positive, make_check
 from reportengine.compat import yaml
 import validphys.cuts
 
 log = logging.getLogger(__name__)
 
+KIN_LABEL = {
+    "DIS": ("x", "Q2", "y"),
+    "DYP": ("y", "M2", "sqrts"),
+    "JET": ("eta", "p_T2", "sqrts"),
+    "DIJET": ("eta", "m_12", "sqrts"),
+    "PHT": ("eta_gamma", "E_{T,gamma)2", "sqrts"),
+    "INC": ("0", "mu2", "sqrts"),
+    "EWK_RAP": ("etay", "M2", "sqrts"),
+    "EWK_PT": ("p_T", "M2", "sqrts"),
+    "EWK_PTRAP": ("etay", "p_T2", "sqrts"),
+    "EWK_MLL": ("M_ll", "M_ll2", "sqrts"),
+    "EWJ_RAP": ("etay", "M2", "sqrts"),
+    "EWJ_PT": ("p_T", "M2", "sqrt(s)"),
+    "EWJ_PTRAP": ("etay", "p_T2", "sqrts"),
+    "EWJ_JRAP": ("etay", "M2", "sqrts"),
+    "EWJ_JPT": ("p_T", "M2", "sqrts"),
+    "EWJ_MLL": ("M_ll", "M_ll2", "sqrts"),
+    "HQP_YQQ": ("yQQ", "mu2", "sqrts"),
+    "HQP_MQQ": ("MQQ", "mu2", "sqrts"),
+    "HQP_PTQQ": ("p_TQQ", "mu2", "sqrts"),
+    "HQP_YQ": ("yQ", "mu2", "sqrts"),
+    "HQP_PTQ": ("p_TQ", "mu2", "sqrts"),
+    "HIG_RAP": ("y", "M_H2", "sqrts"),
+    "SIA": ("z", "Q2", "y"),
+}
+
+
 class RuleProcessingError(Exception):
     """Exception raised when we couldn't process a rule."""
 
@@ -164,7 +190,7 @@ def _filter_real_data(filter_path, data):
         nfull, ncut = _write_ds_cut_data(path, dataset)
         total_data_points += nfull
         total_cut_data_points += ncut
-        dataset.load_commondata().Export(str(path))
+        dataset.load_commondata().export(path)
     return total_data_points, total_cut_data_points
 
 
@@ -343,14 +369,14 @@ def __init__(
                     f"Could not find dataset {self.dataset}"
                 ) from e
             if cd.process_type[:3] == "DIS":
-                self.variables = CommonData.kinLabel["DIS"]
+                self.variables = KIN_LABEL["DIS"]
             else:
-                self.variables = CommonData.kinLabel[cd.process_type]
+                self.variables = KIN_LABEL[cd.process_type]
         else:
             if self.process_type[:3] == "DIS":
-                self.variables = CommonData.kinLabel["DIS"]
+                self.variables = KIN_LABEL["DIS"]
             else:
-                self.variables = CommonData.kinLabel[self.process_type]
+                self.variables = KIN_LABEL[self.process_type]
 
         if hasattr(self, "local_variables"):
             if not isinstance(self.local_variables, Mapping):
@@ -422,19 +448,21 @@ def __hash__(self):
         return hash(self._properties)
 
     def __call__(self, dataset, idat):
-        central_value = dataset.GetData(idat)
+        central_value = dataset.get_cv()[idat]
+        process_name = dataset.commondataproc
+
         # We return None if the rule doesn't apply. This
         # is different to the case where the rule does apply,
         # but the point was cut out by the rule.
         if (
-            dataset.GetSetName() != self.dataset
-            and dataset.GetProc(idat) != self.process_type
+            dataset.setname != self.dataset
+            and process_name != self.process_type
             and self.process_type != "DIS_ALL"
         ):
             return None
 
         # Handle the generalised DIS cut
-        if self.process_type == "DIS_ALL" and dataset.GetProc(idat)[:3] != "DIS":
+        if self.process_type == "DIS_ALL" and not process_name.startswith("DIS"):
             return None
 
         ns = self._make_point_namespace(dataset, idat)
@@ -468,7 +496,7 @@ def __repr__(self): # pragma: no cover
 
     def _make_kinematics_dict(self, dataset, idat) -> dict:
         """Fill in a dictionary with the kinematics for each point"""
-        kinematics = [dataset.GetKinematics(idat, j) for j in range(3)]
+        kinematics = dataset.kinematics.values[idat]
         return dict(zip(self.variables, kinematics))
 
     def _make_point_namespace(self, dataset, idat) -> dict:
@@ -488,7 +516,7 @@ def get_cuts_for_dataset(commondata, rules) -> list:
 
     Parameters
     ----------
-    commondata: NNPDF CommonData spec
+    commondata: :py:class:`validphys.coredata.CommonData`
     rules: List[Rule]
         A list of Rule objects specifying the filters.
 
@@ -515,7 +543,7 @@ def get_cuts_for_dataset(commondata, rules) -> list:
     dataset = commondata.load()
 
     mask = []
-    for idat in range(dataset.GetNData()):
+    for idat in range(dataset.ndata):
         broken = False
         for rule in rules:
             rule_result = rule(dataset, idat)
diff --git a/validphys2/src/validphys/fitdata.py b/validphys2/src/validphys/fitdata.py
index d518efcccd..2cf35d561d 100644
--- a/validphys2/src/validphys/fitdata.py
+++ b/validphys2/src/validphys/fitdata.py
@@ -440,10 +440,10 @@ def print_systype_overlap(groups_commondata, group_dataset_inputs_by_metadata):
     systype_groups = dict()
     for group_cd, group in zip(groups_commondata, group_dataset_inputs_by_metadata):
         systype_groups[group["group_name"]] = {
-            cd.load().GetSys(0, i).name
+            cd.load().systype_table.iloc[i]["name"]
             for cd in group_cd
             for i in range(cd.nsys)
-            if cd.load().GetSys(0, i).name not in allow_list
+            if cd.load().systype_table.iloc[i]["name"] not in allow_list
         }
 
     systype_overlap = set()
diff --git a/validphys2/src/validphys/mc_gen.py b/validphys2/src/validphys/mc_gen.py
index ff68d59c88..bf94b5b478 100644
--- a/validphys2/src/validphys/mc_gen.py
+++ b/validphys2/src/validphys/mc_gen.py
@@ -5,6 +5,7 @@
 Tools to check the pseudo-data MC generation.
 """
 # The functions in this module have been ported to not use libNNPDF
+# but <art_rep_generation> is still using it under the hood
 # it has been a direct port of the libnnpdf dependent structure 
 # so they should not be used as an example
 import logging
diff --git a/validphys2/src/validphys/plotoptions/core.py b/validphys2/src/validphys/plotoptions/core.py
index d96aef7290..cf0571b811 100644
--- a/validphys2/src/validphys/plotoptions/core.py
+++ b/validphys2/src/validphys/plotoptions/core.py
@@ -19,8 +19,9 @@
 from reportengine.compat import yaml
 from reportengine.utils import get_functions, ChainMap
 
-from NNPDF import CommonData, DataSet
+from NNPDF import DataSet
 from validphys.core import CommonDataSpec, DataSetSpec, Cuts, InternalCutsWrapper
+from validphys.coredata import CommonData
 from validphys.plotoptions.utils import apply_to_all_columns, get_subclasses
 from validphys.plotoptions import labelers, kintransforms, resulttransforms
 from validphys.utils import parse_yaml_inp
diff --git a/validphys2/src/validphys/tests/test_loader.py b/validphys2/src/validphys/tests/test_loader.py
index decf0a020c..4af8fcf1e8 100644
--- a/validphys2/src/validphys/tests/test_loader.py
+++ b/validphys2/src/validphys/tests/test_loader.py
@@ -42,8 +42,8 @@ def test_rebuild_commondata_without_cuts(tmp_path_factory, arg):
         cutpath = tmp / "cuts.txt"
         np.savetxt(cutpath, np.asarray(cuts, dtype=int), fmt="%u")
         cutspec = Cuts(cd, cutpath)
-        lcd = type(lcd)(lcd, cuts)
-    lcd.Export(str(tmp))
+        lcd = lcd.with_cuts(cuts)
+    lcd.export(tmp)
     # We have to reconstruct the name here...
     with_cuts = tmp / f"DATA_{cd.name}.dat"
     newpath = tmp / "commondata.dat"
diff --git a/validphys2/src/validphys/tests/test_weights.py b/validphys2/src/validphys/tests/test_weights.py
index 0c1043fa44..134cf47e2f 100644
--- a/validphys2/src/validphys/tests/test_weights.py
+++ b/validphys2/src/validphys/tests/test_weights.py
@@ -8,9 +8,15 @@
 def test_weights_have_same_commondata(weighted_data_witht0_config):
     data = API.data(**weighted_data_witht0_config)
     normal, weighted = data.datasets
-    normalds, weightedds = normal.load(), weighted.load()
-    assert normalds.GetSys(0, 0).mult == weightedds.GetSys(0, 0).mult
-    assert normalds.GetSys(0, 0).add == weightedds.GetSys(0, 0).add
+    normalds, weightedds = normal.load_commondata(), weighted.load_commondata()
+    assert (
+        normalds.systematics_table["MULT"].iloc[0][0]
+        == weightedds.systematics_table["MULT"].iloc[0][0]
+    )
+    assert (
+        normalds.systematics_table["ADD"].iloc[0][0]
+        == weightedds.systematics_table["ADD"].iloc[0][0]
+    )
 
 
 def test_chi2_arithmetic(weighted_data_witht0_internal_cuts_config):

From e8e9e4497e5deae2613878cecfe9370bf69d8133 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@lairen.eu>
Date: Fri, 16 Dec 2022 10:23:27 +0100
Subject: [PATCH 2/7] update missing libnnpdf methods

---
 validphys2/src/validphys/dataplots.py              | 7 ++++---
 validphys2/src/validphys/theorycovariance/tests.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py
index 80e2709a7e..31b7fb2062 100644
--- a/validphys2/src/validphys/dataplots.py
+++ b/validphys2/src/validphys/dataplots.py
@@ -960,12 +960,13 @@ def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_k
     ax.axhline(0, color='red')
 
     posset = posdataset.load_commondata()
-    ndata  = posset.GetNData()
+    ndata  = posset.ndata
     xvals = []
 
     if pos_use_kin:
-        ax.set_xlabel('kin1')
-        xvals = [posset.GetKinematics(i, 0) for i in range(0, ndata)]
+        kin_name = "kin1"
+        ax.set_xlabel(kin_name)
+        xvals = posset.kinematics[kin_name].values
     else:
         ax.set_xlabel('idat')
         xvals = np.arange(ndata)
diff --git a/validphys2/src/validphys/theorycovariance/tests.py b/validphys2/src/validphys/theorycovariance/tests.py
index 1a90302cac..6dd14d85c2 100644
--- a/validphys2/src/validphys/theorycovariance/tests.py
+++ b/validphys2/src/validphys/theorycovariance/tests.py
@@ -219,7 +219,7 @@ def all_matched_data_lengths(all_matched_datasets):
     """Returns a list of the data sets lengths."""
     lens = []
     for rlist in all_matched_datasets:
-        lens.append(rlist[0].load_commondata().GetNData())
+        lens.append(rlist[0].load_commondata().ndata)
     return lens
 
 

From 609d6c862702ea19d09228f7ff038cbd3cb373c2 Mon Sep 17 00:00:00 2001
From: "Juan M. Cruz-Martinez" <juacrumar@lairen.eu>
Date: Tue, 17 Jan 2023 12:48:49 +0400
Subject: [PATCH 3/7] Apply suggestions from code review

Co-authored-by: Alessandro Candido <candido.ale@gmail.com>
---
 validphys2/src/validphys/core.py    | 7 +------
 validphys2/src/validphys/filters.py | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py
index 98d5fe8990..59be563b0f 100644
--- a/validphys2/src/validphys/core.py
+++ b/validphys2/src/validphys/core.py
@@ -275,10 +275,7 @@ def __iter__(self):
 
     @functools.lru_cache()
     def load(self):
-        cd = parse_commondata(self.datafile, self.sysfile, self.name)
-#         cd_old = LegacyCommonData.ReadFile(str(self.datafile), str(self.sysfile))
-#         cd.old = cd_old # DEBUG
-        return cd
+        return parse_commondata(self.datafile, self.sysfile, self.name)
 
     @property
     def plot_kinlabels(self):
@@ -473,9 +470,7 @@ def load_commondata(self):
             loaded_cuts = self.cuts.load()
             if not (hasattr(loaded_cuts, '_full') and loaded_cuts._full):
                 intmask = [int(ele) for ele in loaded_cuts]
-#                 cd_old = LegacyCommonData(cd.old, intmask)
                 cd = cd.with_cuts(intmask)
-#                 cd.old = cd_old # DEBUG
         return cd
 
     def to_unweighted(self):
diff --git a/validphys2/src/validphys/filters.py b/validphys2/src/validphys/filters.py
index 6fc5b76f4f..1669772a39 100644
--- a/validphys2/src/validphys/filters.py
+++ b/validphys2/src/validphys/filters.py
@@ -516,7 +516,7 @@ def get_cuts_for_dataset(commondata, rules) -> list:
 
     Parameters
     ----------
-    commondata: :py:class:`validphys.coredata.CommonData`
+    commondata: validphys.coredata.CommonData
     rules: List[Rule]
         A list of Rule objects specifying the filters.
 

From c8e9cf61fd240f97399349f9a856ae90f82e6ee1 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@lairen.eu>
Date: Tue, 17 Jan 2023 15:11:41 +0400
Subject: [PATCH 4/7] apply review comments

---
 validphys2/src/validphys/commondataparser.py | 29 ++++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py
index bf6966b13d..f4459dc0b1 100644
--- a/validphys2/src/validphys/commondataparser.py
+++ b/validphys2/src/validphys/commondataparser.py
@@ -6,7 +6,7 @@
 
 The validphys commondata structure is an instance of :py:class:`validphys.coredata.CommonData`
 """
-from collections import namedtuple
+import dataclasses
 from operator import attrgetter
 import logging
 
@@ -16,7 +16,7 @@
 
 log = logging.getLogger(__name__)
 
-kinlabels_latex = {
+KINLABEL_LATEX = {
     "DIJET": ("\\eta", "$\\m_{1,2} (GeV)", "$\\sqrt{s} (GeV)"),
     "DIS": ("$x$", "$Q^2 (GeV^2)$", "$y$"),
     "DYP": ("$y$", "$M^2 (GeV^2)$", "$\\sqrt{s} (GeV)$"),
@@ -43,9 +43,6 @@
 }
 
 
-_kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True)
-
-
 def load_commondata(spec):
     """
     Load the data corresponding to a CommonDataSpec object.
@@ -124,12 +121,18 @@ def parse_systypes(systypefile):
     return systypetable
 
 
-CommonDataMetadata = namedtuple("CommonDataMetadata", ("name", "nsys", "ndata", "process_type"))
+@dataclasses.dataclass
+class CommonDataMetadata:
+    """Contains metadata information about the data being read"""
+    name: str
+    nsys: int
+    ndata: int
+    process_type: str
 
 
 def peek_commondata_metadata(commondatafilename):
-    """Check some basic properties commondata object without going though the
-    trouble of processing it on the C++ side"""
+    """Read some of the properties of the commondata object as a CommonDataMetadata
+    """
     with open(commondatafilename) as f:
         try:
             l = f.readline()
@@ -149,15 +152,17 @@ def get_plot_kinlabels(commondata):
     """Return the LaTex kinematic labels for a given Commondata"""
     key = commondata.process_type
 
-    return kinlabels_latex[key]
+    return KINLABEL_LATEX[key]
 
 
 def get_kinlabel_key(process_label):
-    # Since there is no 1:1 correspondence between latex keys and GetProc,
-    # we match the longest key such that the proc label starts with it.
+    """
+    Since there is no 1:1 correspondence between latex keys and GetProc,
+    we match the longest key such that the proc label starts with it.
+    """
     l = process_label
     try:
-        return next(k for k in _kinlabels_keys if l.startswith(k))
+        return next(k for k in sorted(KINLABEL_LATEX, key=len, reverse=True) if l.startswith(k))
     except StopIteration as e:
         raise ValueError(
             "Could not find a set of kinematic "

From 5ae445cb428969990cef2a0f04a49a7fa7f7bb4f Mon Sep 17 00:00:00 2001
From: "Juan M. Cruz-Martinez" <juacrumar@lairen.eu>
Date: Tue, 17 Jan 2023 15:46:06 +0400
Subject: [PATCH 5/7] Update validphys2/src/validphys/commondataparser.py

---
 validphys2/src/validphys/commondataparser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py
index f4459dc0b1..7da1f038da 100644
--- a/validphys2/src/validphys/commondataparser.py
+++ b/validphys2/src/validphys/commondataparser.py
@@ -121,7 +121,7 @@ def parse_systypes(systypefile):
     return systypetable
 
 
-@dataclasses.dataclass
+@dataclasses.dataclass(frozen=True)
 class CommonDataMetadata:
     """Contains metadata information about the data being read"""
     name: str

From a159219eb724bafdc252b7829beb34deb69225c7 Mon Sep 17 00:00:00 2001
From: "Juan M. Cruz-Martinez" <juacrumar@lairen.eu>
Date: Tue, 17 Jan 2023 23:21:26 +0400
Subject: [PATCH 6/7] Apply suggestions from code review

Co-authored-by: Roy Stegeman <roystegeman@live.nl>
---
 validphys2/src/validphys/commondataparser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py
index 7da1f038da..1611a1db65 100644
--- a/validphys2/src/validphys/commondataparser.py
+++ b/validphys2/src/validphys/commondataparser.py
@@ -2,7 +2,7 @@
 This module implements parsers for commondata  and systype files into useful
 datastructures, contained in the :py:mod:`validphys.coredata` module, which are
 not backed by C++ managed memory, and so they can be easily pickled and
-interfaces with common Python libraries. 
+interfaced with common Python libraries. 
 
 The validphys commondata structure is an instance of :py:class:`validphys.coredata.CommonData`
 """

From df13d189c9fbab373494fe332841011eda0c4527 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@lairen.eu>
Date: Tue, 17 Jan 2023 23:29:58 +0400
Subject: [PATCH 7/7] kinnames -> kin_names

---
 validphys2/src/validphys/coredata.py  | 6 +++---
 validphys2/src/validphys/dataplots.py | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py
index b27b7ce657..2735606197 100644
--- a/validphys2/src/validphys/coredata.py
+++ b/validphys2/src/validphys/coredata.py
@@ -8,7 +8,7 @@
 import numpy as np
 import pandas as pd
 
-KINNAMES = ["kin1", "kin2", "kin3"]
+KIN_NAMES = ["kin1", "kin2", "kin3"]
 
 
 @dataclasses.dataclass(eq=False)
@@ -248,7 +248,7 @@ class CommonData:
 
     def __post_init__(self):
         self.systematics_table = self.commondata_table.drop(
-            columns=["process", "data", "stat"] + KINNAMES
+            columns=["process", "data", "stat"] + KIN_NAMES
         )
 
     def with_cuts(self, cuts):
@@ -286,7 +286,7 @@ def with_cuts(self, cuts):
 
     @property
     def kinematics(self):
-        return self.commondata_table[KINNAMES]
+        return self.commondata_table[KIN_NAMES]
 
     def get_kintable(self):
         return self.kinematics.values
diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py
index 31b7fb2062..b1f54a0344 100644
--- a/validphys2/src/validphys/dataplots.py
+++ b/validphys2/src/validphys/dataplots.py
@@ -26,6 +26,7 @@
 from validphys.plotoptions import get_info, kitable, transform_result
 from validphys import plotutils
 from validphys.utils import sane_groupby_iter, split_ranges, scale_from_grid
+from validphys.coredata import KIN_NAMES
 
 log = logging.getLogger(__name__)
 
@@ -964,7 +965,7 @@ def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_k
     xvals = []
 
     if pos_use_kin:
-        kin_name = "kin1"
+        kin_name = KIN_NAMES[0]
         ax.set_xlabel(kin_name)
         xvals = posset.kinematics[kin_name].values
     else: