NNPDF · scarlehoff · Feb 22, 2023 · Dec 13, 2022 · Dec 13, 2022 · Dec 13, 2022
diff --git a/n3fit/runcards/examples/developing.yml b/n3fit/runcards/examples/developing.yml
@@ -92,6 +92,7 @@ parameters: # This defines the parameter dictionary that is passed to the Model
 
 fitting:
   fitbasis: EVOL # EVOL (7), EVOLQED (8), etc.
+  savepseudodata: False
   basis:
   - {fl: sng, trainable: false, smallx: [1.093, 1.121], largex: [1.486, 3.287]}
   - {fl: g, trainable: false, smallx: [0.8329, 1.071], largex: [3.084, 6.767]}

diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py
@@ -124,6 +124,7 @@ def parse_systypes(systypefile):
 @dataclasses.dataclass(frozen=True)
 class CommonDataMetadata:
     """Contains metadata information about the data being read"""
+
     name: str
     nsys: int
     ndata: int

diff --git a/validphys2/src/validphys/commondatawriter.py b/validphys2/src/validphys/commondatawriter.py
@@ -0,0 +1,83 @@
+"""
+This module contains functions to write commondata and systypes
+tables to files
+"""
+
+def write_commondata_data(commondata, buffer):
+    """
+    Write the commondata table to ``buffer`` as tab-separated text.
+
+    A header line ``<setname> <nsys> <ndata>`` is written first, followed
+    by ``commondata.commondata_table`` (index included, no column names).
+
+    Parameters
+    ----------
+    commondata : validphys.coredata.CommonData
+        object providing ``setname``, ``nsys``, ``ndata`` and
+        ``commondata_table``
+    buffer : text stream
+        any object with a ``write`` method accepting ``str`` that
+        ``pandas.DataFrame.to_csv`` can write to, e.g. ``io.StringIO``
+
+    Example
+    -------
+    >>> from validphys.loader import Loader
+    >>> from io import StringIO
+    >>> l = Loader()
+    >>> cd = l.check_commondata("NMC").load_commondata_instance()
+    >>> sio = StringIO()
+    >>> write_commondata_data(cd, sio)
+    >>> print(sio.getvalue())
+    """
+    buffer.write(f"{commondata.setname} {commondata.nsys} {commondata.ndata}\n")
+    # header=False (rather than None) is the documented way to tell
+    # ``to_csv`` to omit the column names; ``index`` keeps its default.
+    commondata.commondata_table.to_csv(buffer, sep="\t", header=False)
+
+
+def write_commondata_to_file(commondata, path):
+    """Write the commondata table of ``commondata`` to ``path``, truncating it."""
+    # pandas asks for newline="" on text handles passed to ``to_csv`` so that
+    # the line terminators it emits are not translated a second time.
+    with open(path, "w", newline="") as stream:
+        write_commondata_data(commondata, stream)
+
+
+def write_systype_data(commondata, buffer):
+    """
+    Write the systype table to ``buffer`` as tab-separated text.
+
+    A header line holding ``commondata.nsys`` is written first, followed
+    by ``commondata.systype_table`` (index included, no column names).
+
+    Parameters
+    ----------
+    commondata : validphys.coredata.CommonData
+        object providing ``nsys`` and ``systype_table``
+    buffer : text stream
+        any object with a ``write`` method accepting ``str`` that
+        ``pandas.DataFrame.to_csv`` can write to, e.g. ``io.StringIO``
+
+    Example
+    -------
+    >>> from validphys.loader import Loader
+    >>> from io import StringIO
+    >>> l = Loader()
+    >>> cd = l.check_commondata("NMC").load_commondata_instance()
+    >>> sio = StringIO()
+    >>> write_systype_data(cd, sio)
+    >>> print(sio.getvalue())
+    """
+    buffer.write(f"{commondata.nsys}\n")
+    # header=False (rather than None) is the documented way to tell
+    # ``to_csv`` to omit the column names; the index is still written
+    # because ``index`` is left at its default.
+    commondata.systype_table.to_csv(buffer, sep="\t", header=False)
+
+
+def write_systype_to_file(commondata, path):
+    """Write the systype table of ``commondata`` to ``path``, truncating it."""
+    # pandas asks for newline="" on text handles passed to ``to_csv`` so that
+    # the line terminators it emits are not translated a second time.
+    with open(path, "w", newline="") as stream:
+        write_systype_data(commondata, stream)
diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py
@@ -277,6 +277,14 @@ def __iter__(self):
     def load(self):
         return parse_commondata(self.datafile, self.sysfile, self.name)
 
+    def load_commondata_instance(self):
+        """Load this spec through ``commondataparser.load_commondata``
+        (presumably yielding a ``validphys.coredata.CommonData`` - confirm);
+        the import is deferred to call time instead of module level."""
+        from validphys.commondataparser import load_commondata
+
+        return load_commondata(self)
+
     @property
     def plot_kinlabels(self):
         return get_plot_kinlabels(self)
@@ -618,6 +626,21 @@ def load(self):
     def load_commondata(self):
         return [d.load_commondata() for d in self.datasets]
 
+
+    def load_commondata_instance(self):
+        """
+        Load the commondata of every dataset in ``self.datasets``.
+
+        Returns a list, in dataset order, holding for each dataset the object
+        loaded from its ``commondata`` spec, passed through ``with_cuts`` when
+        the dataset has cuts (``dataset.cuts`` is not ``None``).
+        """
+        def _load(ds):
+            loaded = ds.commondata.load_commondata_instance()
+            return loaded if ds.cuts is None else loaded.with_cuts(ds.cuts.load())
+
+        return [_load(ds) for ds in self.datasets]
+
     @property
     def thspec(self):
         #TODO: Is this good enough? Should we explicitly pass the theory

diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py
@@ -7,7 +7,7 @@
 import dataclasses
 import numpy as np
 import pandas as pd
-
+from validphys.commondatawriter import write_systype_to_file, write_commondata_to_file
 KIN_NAMES = ["kin1", "kin2", "kin3"]
 
 
@@ -295,6 +295,11 @@ def get_kintable(self):
     def central_values(self):
         return self.commondata_table["data"]
 
+    def with_central_value(self, cv):
+        """Return a new instance whose table has its ``"data"`` column set to ``cv``."""
+        replaced = self.commondata_table.assign(data=cv)
+        return dataclasses.replace(self, commondata_table=replaced)
+
     def get_cv(self):
         return self.central_values.values
 
@@ -364,21 +369,18 @@ def systematic_errors(self, central_values=None):
         converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100
         return pd.concat((self.additive_errors, converted_mult_errors), axis=1)
 
+
     def export(self, path):
         """Export the data, and error types
         Use the same format as libNNPDF:
 
         - A DATA_<dataset>.dat file with the dataframe of accepted points
-        - A systypes/STYPES_<dataset>.dat file with the error types
+        - A systypes/SYSTYPE_<dataset>_DEFAULT.dat file with the error types
         """
+        # ``path`` must support ``/``; only its "systypes" child is created here.
         dat_path = path / f"DATA_{self.setname}.dat"
         sys_path = path / "systypes" / f"SYSTYPE_{self.setname}_DEFAULT.dat"
         sys_path.parent.mkdir(exist_ok=True)
 
-        dat_string_raw = self.commondata_table.to_string(index=False, header=False, float_format="{:.8e}".format)
-        header = f"{self.setname}    {self.nsys} {self.ndata}"
-        dat_string = "\n".join([f" {i+1}    {r}" for i, r in enumerate(dat_string_raw.split("\n"))])
-        dat_path.write_text(f"{header}\n{dat_string}\n")
-
-        sys_raw = self.systype_table.to_string(index=True, header=False, index_names=False)
-        sys_path.write_text(f"{self.nsys}\n{sys_raw}\n")
+        write_systype_to_file(self, sys_path)
+        write_commondata_to_file(self, dat_path)
diff --git a/validphys2/src/validphys/filters.py b/validphys2/src/validphys/filters.py
@@ -9,10 +9,13 @@
 
 import numpy as np
 
-from reportengine.checks import make_argcheck, check, check_positive, make_check
+from reportengine.checks import check, make_check
 from reportengine.compat import yaml
 import validphys.cuts
-
+from validphys.commondatawriter import (
+        write_commondata_to_file,
+        write_systype_to_file,
+    )
 log = logging.getLogger(__name__)
 
 KIN_LABEL = {
@@ -72,12 +75,6 @@ def default_filter_rules_input():
     return yaml.safe_load(read_text(validphys.cuts, "filters.yaml"))
 
 
-@make_argcheck
-def check_rngalgo(rngalgo: int):
-    """Check rngalgo content"""
-    check(0 <= rngalgo < 17,
-          "Invalid rngalgo. Must be int between [0, 16].")
-
 
 def check_nonnegative(var: str):
     """Ensure that `var` is positive"""
@@ -100,41 +97,20 @@ def export_mask(path, mask):
     """Dump mask to file"""
     np.savetxt(path, mask, fmt='%d')
 
-@check_rngalgo
-@check_nonnegative('filterseed')
-@check_nonnegative('seed')
-def prepare_nnpdf_rng(filterseed:int, rngalgo:int, seed:int):
-    """Initialise the internal NNPDF RNG, specified by ``rngalgo`` which must
-    be an integer between 0 and 16, seeded with ``filterseed``.
-    The RNG can then be subsequently used to i.e generate pseudodata.
-    """
-    try:
-        from NNPDF import RandomGenerator
-    except ImportError as e:
-        logging.error("Generating closure data needs a valid installation of libNNPDF")
-        raise e
-
-    log.warning("Importing libNNPDF")
-    log.info("Initialising RNG")
-    RandomGenerator.InitRNG(rngalgo, seed)
-    RandomGenerator.GetRNG().SetSeed(filterseed)
-
-@check_positive('errorsize')
-def filter_closure_data(filter_path, data, fakepdf, fakenoise, errorsize, prepare_nnpdf_rng):
+
+def filter_closure_data(filter_path, data, fakepdf, fakenoise, filterseed, experiments_index=None):
     """Filter closure data. In addition to cutting data points, the data is
     generated from an underlying ``fakepdf``, applying a shift to the data
     if ``fakenoise`` is ``True``, which emulates the experimental central values
     being shifted away from the underlying law.
-
+    ``experiments_index`` is forwarded as-is; it is only read for that shift.
     """
     log.info('Filtering closure-test data.')
-    return _filter_closure_data(
-        filter_path, data, fakepdf, fakenoise, errorsize)
+    return _filter_closure_data(filter_path, data, fakepdf, fakenoise, filterseed, experiments_index)
 
 
-@check_positive("errorsize")
 def filter_closure_data_by_experiment(
-    filter_path, experiments_data, fakepdf, fakenoise, errorsize, prepare_nnpdf_rng,
+    filter_path, experiments_data, fakepdf, fakenoise, filterseed, experiments_index
 ):
     """
     Like :py:func:`filter_closure_data` except filters data by experiment.
@@ -145,10 +121,19 @@ def filter_closure_data_by_experiment(
     not reproducible.
 
     """
-    return [
-        _filter_closure_data(filter_path, exp, fakepdf, fakenoise, errorsize)
-        for exp in experiments_data
-    ]
+
+    res = []
+    for exp in experiments_data:
+        experiment_index = experiments_index[
+            experiments_index.isin([exp.name], level=0)
+        ]
+        res.append(
+            _filter_closure_data(
+                filter_path, exp, fakepdf, fakenoise, filterseed, experiment_index
+            )
+        )
+
+    return res
 
 
 def filter_real_data(filter_path, data):
@@ -183,6 +168,7 @@ def _write_ds_cut_data(path, dataset):
 
 def _filter_real_data(filter_path, data):
     """Filter real experimental data."""
+
     total_data_points = 0
     total_cut_data_points = 0
     for dataset in data.datasets:
@@ -194,24 +180,93 @@ def _filter_real_data(filter_path, data):
     return total_data_points, total_cut_data_points
 
 
-def _filter_closure_data(filter_path, data, fakepdf, fakenoise, errorsize):
-    """Filter closure test data."""
+def _filter_closure_data(
+    filter_path, data, fakepdf, fakenoise, filterseed, experiments_index
+):
+    """
+    Write the closure-test commondata and systype files under ``filter_path``.
+
+    This function is accessed within a closure test only, that is, the fakedata
+    namespace has to be True (If fakedata = False, the _filter_real_data function
+    will be used to write the commondata files).
+
+    Level 0 data is obtained from ``level0_commondata_wc(data, fakepdf)``. If
+    fakenoise is True it is replaced by the output of ``make_level1_data``
+    (Level 1 data, i.e. Level 0 data plus noise) before being written.
+
+    For every dataset ``_write_ds_cut_data`` is run and ``make_dataset_dir`` is
+    called on ``<filter_path>/<dataset>/systypes``; afterwards the files
+    ``DATA_<set>.dat`` and ``systypes/SYSTYPE_<set>_DEFAULT.dat`` are written.
+
+    Parameters
+    ----------
+
+    filter_path : pathlib.Path
+        path to filter folder (it is used with the ``/`` operator)
+
+    data : validphys.core.DataGroupSpec
+        group whose ``datasets`` are filtered
+
+    fakepdf : validphys.core.PDF
+        PDF handed to ``level0_commondata_wc``
+
+    fakenoise : bool
+        if fakenoise perform level1 shift of central data values
+
+    filterseed : int
+        random seed used to generate the noise added to Level 0 data
+
+    experiments_index : pandas.MultiIndex
+        only used when fakenoise is True, forwarded to ``make_level1_data``
+
+    Returns
+    -------
+    tuple
+        ``(total_data_points, total_cut_data_points)``: the sums over all
+        datasets of the two numbers returned by ``_write_ds_cut_data``
+    """
     total_data_points = 0
     total_cut_data_points = 0
-    fakeset = fakepdf.legacy_load()
-    # Load data, don't cache result
-    loaded_data = data.load.__wrapped__(data)
-    # generate level 1 shift if fakenoise
-    loaded_data.MakeClosure(fakeset, fakenoise)
-    for j, dataset in enumerate(data.datasets):
+
+    # imported here rather than at module level: circular import generated @ core.py
+    from validphys.pseudodata import level0_commondata_wc, make_level1_data
+
+    closure_data = level0_commondata_wc(data, fakepdf)
+
+    for dataset in data.datasets:
+        #== print number of points passing cuts, make dataset directory and write FKMASK  ==#
         path = filter_path / dataset.name
         nfull, ncut = _write_ds_cut_data(path, dataset)
+        make_dataset_dir(path / "systypes")
         total_data_points += nfull
         total_cut_data_points += ncut
-        loaded_ds = loaded_data.GetSet(j)
-        if errorsize != 1.0:
-            loaded_ds.RescaleErrors(errorsize)
-        loaded_ds.Export(str(path))
+
+    if fakenoise:
+        #======= Level 1 closure test =======#
+
+        closure_data = make_level1_data(
+                data,
+                closure_data,
+                filterseed,
+                experiments_index,
+            )
+
+    #====== write commondata and systype files ======#
+    if fakenoise:
+        log.info("Writing Level1 data")
+    else:
+        log.info("Writing Level0 data")
+
+    for cd in closure_data:
+        path_cd = filter_path / cd.setname / f"DATA_{cd.setname}.dat"
+        path_sys = (
+            filter_path
+            / cd.setname
+            / "systypes"
+            / f"SYSTYPE_{cd.setname}_DEFAULT.dat"
+        )
+        write_commondata_to_file(commondata=cd, path=path_cd)
+        write_systype_to_file(commondata=cd, path=path_sys)
+
     return total_data_points, total_cut_data_points