Skip to content
Merged
6 changes: 3 additions & 3 deletions validphys2/examples/mc_gen_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ meta:
keywords: [pseudodata, test, artificial replica]

fit: NNPDF40_nlo_as_01180

theoryid: 208
mcseed: 4
genrep: True

template: mc_gen_report.md

actions_:
- report(main=true)

dataset_inputs:
- { dataset: CHORUSNBPb_dw_ite, frac: 0.5}

- { dataset: CHORUSNBPb_dw_ite, frac: 0.5}
130 changes: 46 additions & 84 deletions validphys2/src/validphys/chi2grids.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,109 +4,71 @@
Compute and store χ² data from replicas, possibly keeping the correlations
between pseudorreplica fluctuations between different fits. This is applied
here to parameter determinations such as those of αs.

This module is severely thwarted by the poor adequacy of libnnpdf for this use
case. Several pieces of functionality need to be implemented there.
"""
import logging
from collections import namedtuple

import numpy as np
import pandas as pd

from reportengine import collect
from reportengine.table import table
from NNPDF import pseudodata, single_replica, RandomGenerator

from validphys.core import PDF
from validphys.results import ThPredictionsResult, DataResult, chi2_breakdown_by_dataset
from validphys.calcutils import calc_chi2

# Row record for the per-replica chi² results: one entry per (group, replica).
PseudoReplicaExpChi2Data = namedtuple(
    "PseudoReplicaChi2Data", ["group", "ndata", "chi2", "nnfit_index"]
)


log = logging.getLogger(__name__)


def computed_pseudoreplicas_chi2(
    fitted_make_replicas,
    group_result_table_no_table,  # to get the results already in the form of a dataframe
    groups_sqrtcovmat,
):
    """Return a dataframe with the chi² of each replica with its corresponding
    pseudodata (i.e. the one it was fitted with). The chi² is computed by group.

    Parameters
    ----------
    fitted_make_replicas
        Sequence of per-replica pseudodata arrays; stacked along axis 1 so that
        columns match the replica-prediction columns.
    group_result_table_no_table
        Dataframe of per-replica predictions indexed (at least) by a ``group``
        level, with ``data_central`` and ``theory_central`` columns that are
        dropped here.
    groups_sqrtcovmat
        Dataframe holding, per group, the (sqrt) covariance matrix used by
        ``calc_chi2``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``['group', 'ndata', 'nnfit_index']`` with the chi² values,
        where ``nnfit_index`` is the index of the corresponding replica.
    """
    # Stack the replica pseudodata to have the prediction shape
    r_data = np.stack(fitted_make_replicas, axis=1)

    # Drop data central and theory central, which are not useful here
    r_prediction = group_result_table_no_table.drop(columns=["data_central", "theory_central"])

    # Now compute the chi2 on a per-group basis
    diff = r_prediction - r_data
    group_level = r_prediction.index.get_level_values("group")

    # Save the results in a dataframe similar (but not equal) to the old
    # per-experiment/per-dataset one
    df_output = []
    for group in group_level.unique():
        group_diff = diff.loc[group_level == group]
        # Select the rows of this group, then the column block for this group
        its_covmat = groups_sqrtcovmat[group_level == group][group]
        chi2_per_replica = calc_chi2(its_covmat, group_diff)
        ndata = len(group_diff)
        for i, chi2 in enumerate(chi2_per_replica):
            df_output.append(PseudoReplicaExpChi2Data(group, ndata, chi2, i))

    df = pd.DataFrame(df_output, columns=PseudoReplicaExpChi2Data._fields)
    df.set_index(["group", "ndata", "nnfit_index"], inplace=True)
    df.sort_index(inplace=True)
    return df

# TODO: Probably fitcontext should set all of the variables required to compute
# this. But better setting them explicitly than setting some, so we require the
# user to do that.
fits_computed_pseudoreplicas_chi2 = collect(computed_pseudoreplicas_chi2, ("fits",))

dataspecs_computed_pseudorreplicas_chi2 = collect(computed_pseudoreplicas_chi2, ("dataspecs",))


@table
def export_fits_computed_pseudoreplicas_chi2(fits_computed_pseudoreplicas_chi2):
    """Hack to force writing the CSV output: returns the collected per-fit
    chi² dataframes unchanged so the ``@table`` decorator persists them."""
    return fits_computed_pseudoreplicas_chi2
4 changes: 4 additions & 0 deletions validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,10 @@ def parse_use_cuts(self, use_cuts: (bool, str)):

return res

def produce_replicas(self, nreplica: int):
    """Produce a replicas array: an ``NSList`` of the replica indices
    ``1..nreplica`` (inclusive) under the ``replica`` namespace key."""
    return NSList(range(1, nreplica + 1), nskey="replica")

def produce_inclusive_use_scalevar_uncertainties(self, use_scalevar_uncertainties: bool = False,
point_prescription: (str, None) = None):
"""Whether to use a scale variation uncertainty theory covmat.
Expand Down
9 changes: 8 additions & 1 deletion validphys2/src/validphys/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import numpy as np

from NNPDF import CommonData, RandomGenerator
from NNPDF import CommonData
from reportengine.checks import make_argcheck, check, check_positive, make_check
from reportengine.compat import yaml
import validphys.cuts
Expand Down Expand Up @@ -83,6 +83,13 @@ def prepare_nnpdf_rng(filterseed:int, rngalgo:int, seed:int):
be an integer between 0 and 16, seeded with ``filterseed``.
The RNG can then be subsequently used to i.e generate pseudodata.
"""
try:
Comment thread
siranipour marked this conversation as resolved.
from NNPDF import RandomGenerator
except ImportError as e:
logging.error("Generating closure data needs a valid installation of libNNPDF")
raise e

log.warning("Importing libNNPDF")
log.info("Initialising RNG")
RandomGenerator.InitRNG(rngalgo, seed)
RandomGenerator.GetRNG().SetSeed(filterseed)
Expand Down
Loading