NNPDF · siranipour · Sep 1, 2021 · Jul 19, 2021 · Jul 28, 2021 · Aug 31, 2021
diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -338,6 +338,25 @@ and load it in a different runcard and continue the training from there.
 While the load file is read as an absolute path, the file to save to will be found
 inside the replica folder.
 
+Saving and loading fit pseudodata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If the user wishes to save the Monte Carlo pseudodata used for each replica within a fit,
+they can do so using the ``savepseudodata`` flag under the ``fitting`` top-level namespace:
+
+.. code-block:: yaml
+
+   fitting:
+      savepseudodata: true
+
+This will cause two ``csv`` files to be saved for each replica, at
+``<fit_directory>/replica_<number>/datacuts_theory_fitting_training_pseudodata.csv`` and
+``<fit_directory>/replica_<number>/datacuts_theory_fitting_validation_pseudodata.csv``
+for the training and validation splits respectively. The data points are indexed
+according to their experiment. Additionally, the union of these two is saved in
+``<fit_directory>/replica_<number>/datacuts_theory_fitting_pseudodata_table.csv``
+if one is not interested in the exact nature of the splitting.
+
 
 Imposing sum rules
 ^^^^^^^^^^^^^^^^^^

diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py
@@ -26,6 +26,9 @@
     actions_ = []
 )
 
+FIT_NAMESPACE = "datacuts::theory::fitting "
+CLOSURE_NAMESPACE = "datacuts::theory::closuretest::fitting "
+
 N3FIT_PROVIDERS = [
     "n3fit.performfit",
     "n3fit.n3fit_checks_provider",
@@ -124,10 +127,11 @@ def from_yaml(cls, o, *args, **kwargs):
             raise ConfigError(f"Expecting input runcard to be a mapping, " f"not '{type(file_content)}'.")
 
         if file_content.get('closuretest') is not None:
-            fit_action = 'datacuts::theory::closuretest::fitting performfit'
+            namespace = CLOSURE_NAMESPACE
         else:
-            fit_action = 'datacuts::theory::fitting performfit'
-        N3FIT_FIXED_CONFIG['actions_'].append(fit_action)
+            namespace = FIT_NAMESPACE
+
+        N3FIT_FIXED_CONFIG['actions_'].append(namespace + "performfit")
 
         if file_content["fitting"].get("savepseudodata"):
             if len(kwargs["environment"].replicas) != 1:
@@ -137,8 +141,10 @@ def from_yaml(cls, o, *args, **kwargs):
                     "to `false` or fit replicas one at a time."
                 )
             # take same namespace configuration on the pseudodata_table action.
-            table_action = fit_action.replace('performfit', 'pseudodata_table')
-            N3FIT_FIXED_CONFIG['actions_'].append(table_action)
+            training_action = namespace + "training_pseudodata"
+            validation_action = namespace + "validation_pseudodata"
+
+            N3FIT_FIXED_CONFIG['actions_'].extend((training_action, validation_action))
 
         file_content.update(N3FIT_FIXED_CONFIG)
         return cls(file_content, *args, **kwargs)

diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py
@@ -354,8 +354,11 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn
 exps_pseudodata = collect("generate_data_replica", ("group_dataset_inputs_by_experiment",))
 replicas_exps_pseudodata = collect("exps_pseudodata", ("replicas",))
 
+replicas_indexed_make_replica = collect('indexed_make_replica', ('replicas',))
+
+
 @table
-def pseudodata_table(replicas_exps_pseudodata, replicas, experiments_index):
+def pseudodata_table(replicas_indexed_make_replica, replicas):
     """Creates a pandas DataFrame containing the generated pseudodata. The
     index is :py:func:`validphys.results.experiments_index` and the columns
     are the replica numbers.
@@ -367,23 +370,42 @@ def pseudodata_table(replicas_exps_pseudodata, replicas, experiments_index):
     The table can be found in the replica folder i.e. <fit dir>/nnfit/replica_*/
 
     """
-    rep_dfs = []
-    for rep_exps_pseudodata, rep in zip(replicas_exps_pseudodata, replicas):
-        all_pseudodata = np.concatenate(rep_exps_pseudodata)
-        rep_dfs.append(pd.DataFrame(
-            all_pseudodata,
-            columns=[f"replica {rep}"],
-            index=experiments_index
-        ))
-    return pd.concat(rep_dfs, axis=1)
+    df = pd.concat(replicas_indexed_make_replica)
+    df.columns = [f"replica {rep}" for rep in replicas]
+    return df
+
+
+@table
+def training_pseudodata(pseudodata_table, training_mask):
+    """Save the training data for the given replica.
+    Activate by setting ``fitting::savepseudodata: True``
+    from within the fit runcard.
+
+    See Also
+    --------
+    :py:func:`validphys.n3fit_data.validation_pseudodata`
+    """
+    return pseudodata_table.loc[training_mask.values]
+
+
+@table
+def validation_pseudodata(pseudodata_table, training_mask):
+    """Save the validation data for the given replica.
+    Activate by setting ``fitting::savepseudodata: True``
+    from within the fit runcard.
+
+    See Also
+    --------
+    :py:func:`validphys.n3fit_data.training_pseudodata`
+    """
+    return pseudodata_table.loc[~training_mask.values]
 
 
 exps_tr_masks = collect("tr_masks", ("group_dataset_inputs_by_experiment",))
 replicas_exps_tr_masks = collect("exps_tr_masks", ("replicas",))
 
 
-@table
-def training_mask_table(replicas_exps_tr_masks, replicas, experiments_index):
+def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
     """Save the boolean mask used to split data into training and validation
     for each replica as a pandas DataFrame, indexed by
     :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
@@ -451,6 +473,11 @@ def training_mask_table(replicas_exps_tr_masks, replicas, experiments_index):
         ))
     return pd.concat(rep_dfs, axis=1)
 
+
+@table
+def training_mask_table(training_mask):
+    return training_mask
+
 def fitting_pos_dict(posdataset):
     """Loads a positivity dataset. For more information see
     :py:func:`validphys.n3fit_data_utils.positivity_reader`.