From acfda7ff0ae6a5b32667cbdbfcf2b1865492ab75 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 15 Dec 2020 16:27:15 +0100
Subject: [PATCH 01/27] fit many models at once

---
 n3fit/src/n3fit/ModelTrainer.py               |  45 +++---
 .../n3fit/backends/keras_backend/losses.py    |   4 +-
 .../backends/keras_backend/operations.py      |  14 +-
 n3fit/src/n3fit/layers/DY.py                  |   2 +-
 n3fit/src/n3fit/layers/Rotations.py           |   4 +-
 n3fit/src/n3fit/model_gen.py                  | 136 +++++++++---------
 n3fit/src/n3fit/msr.py                        |   5 +-
 n3fit/src/n3fit/performfit.py                 |  58 +++++---
 8 files changed, 152 insertions(+), 116 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index 1fa73109b8..c7f4279850 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -177,8 +177,9 @@ def __init__(
         save_weights_each=False,
         kfold_parameters=None,
         max_cores=None,
-        model_file=None, 
+        model_file=None,
         sum_rules=True,
+        parallel_models=1
     ):
         """
         Parameters
@@ -213,6 +214,7 @@ def __init__(
         self.debug = debug
         self.save_weights_each = save_weights_each
         self.all_datasets = []
+        self.parallel_models = parallel_models
 
         # Initialise internal variables which define behaviour
         if debug:
@@ -359,7 +361,7 @@ def _fill_the_dictionaries(self):
                 self.training["expdata"].append(integ_dict["expdata"])
                 self.training["integdatasets"].append(integ_dict["name"])
 
-    def _model_generation(self, pdf_model, partition):
+    def _model_generation(self, pdf_models, partition):
         """
         Fills the three dictionaries (``training``, ``validation``, ``experimental``)
         with the ``model`` entry
@@ -398,13 +400,23 @@ def _model_generation(self, pdf_model, partition):
         input_arr = np.concatenate(self.input_list, axis=1)
         input_layer = operations.numpy_to_input(input_arr.T)
 
-        # The input to the full model is expected to be the input to the PDF
-        # by reutilizing `pdf_model.parse_input` we ensure any auxiliary input is also accunted fro
-        full_model_input_dict = pdf_model._parse_input([input_layer], pass_content=False)
+        # The trainable part of the model is a concatenation of all PDF models
+        # where each model corresponds to a different replica
+        all_replicas_pdf = []
+
+        for pdf_model in pdf_models:
+            # The input to the full model is expected to be the input to the PDF
+            # by reutilizing `pdf_model.parse_input` we ensure any auxiliary input is also accunted fro
+            full_model_input_dict = pdf_model._parse_input([input_layer], pass_content=False)
+
+            # The output of the pdf on input_layer will be thus a concatenation
+            # of the PDF values for all experiments
+            full_pdf = pdf_model.apply_as_layer([input_layer])
+
+            all_replicas_pdf.append(full_pdf)
+
+        full_pdf_per_replica = operations.stack(all_replicas_pdf, axis=-1)
 
-        # The output of the pdf on input_layer will be thus a concatenation
-        # of the PDF values for all experiments
-        full_pdf = pdf_model.apply_as_layer([input_layer])
         # The input layer is a concatenation of all experiments
         # we need now to split the output on a different array per experiment
         sp_ar = [self.input_sizes]
@@ -412,7 +424,7 @@ def _model_generation(self, pdf_model, partition):
         splitting_layer = operations.as_layer(
             operations.split, op_args=sp_ar, op_kwargs=sp_kw, name="pdf_split"
         )
-        splitted_pdf = splitting_layer(full_pdf)
+        splitted_pdf = splitting_layer(full_pdf_per_replica)
 
         # If we are in a kfolding partition, select which datasets are out
         if partition:
@@ -610,11 +622,11 @@ def _generate_pdf(
             pdf_model: MetaModel
                 pdf model
         """
-        log.info("Generating PDF model")
+        log.info("Generating PDF models")
 
         # Set the parameters of the NN
         # Generate the NN layers
-        pdf_model = model_gen.pdfNN_layer_generator(
+        pdf_models = model_gen.pdfNN_layer_generator(
             nodes=nodes_per_layer,
             activations=activation_per_layer,
             layer_type=layer_type,
@@ -626,8 +638,9 @@ def _generate_pdf(
             regularizer=regularizer,
             regularizer_args=regularizer_args,
             impose_sumrule=self.impose_sumrule,
+            parallel_models=self.parallel_models
         )
-        return pdf_model
+        return pdf_models
 
     def _assign_data(self, models, fold_k=0):
         """Assign to each model the data to compare with as well as the
@@ -814,7 +827,7 @@ def hyperparametrizable(self, params):
                 seed = np.random.randint(0, pow(2, 31))
 
             # Generate the pdf model
-            pdf_model = self._generate_pdf(
+            pdf_models = self._generate_pdf(
                 params["nodes_per_layer"],
                 params["activation_per_layer"],
                 params["initializer"],
@@ -827,7 +840,7 @@ def hyperparametrizable(self, params):
 
             # Model generation joins all the different observable layers
             # together with pdf model generated above
-            models = self._model_generation(pdf_model, partition)
+            models = self._model_generation(pdf_models, partition)
 
             # Only after model generation, apply possible weight file
             if self.model_file:
@@ -860,7 +873,7 @@ def hyperparametrizable(self, params):
             stopping_object = Stopping(
                 validation_model,
                 reporting,
-                pdf_model,
+                pdf_models[0], # TODO
                 total_epochs=epochs,
                 stopping_patience=stopping_epochs,
                 save_weights_each=self.save_weights_each,
@@ -934,7 +947,7 @@ def hyperparametrizable(self, params):
         dict_out["stopping_object"] = stopping_object
         dict_out["experimental"] = self.experimental
         dict_out["training"] = self.training
-        dict_out["pdf_model"] = pdf_model
+        dict_out["pdf_models"] = pdf_models
 
         # Only after the training has finished, we save all models for future reporting
         self.model_dicts = model_dicts
diff --git a/n3fit/src/n3fit/backends/keras_backend/losses.py b/n3fit/src/n3fit/backends/keras_backend/losses.py
index c4b427dd68..3bb23ce329 100644
--- a/n3fit/src/n3fit/backends/keras_backend/losses.py
+++ b/n3fit/src/n3fit/backends/keras_backend/losses.py
@@ -16,9 +16,7 @@ def l_invcovmat(invcovmat_np):
     def true_loss(y_true, y_pred):
         # (yt - yp) * covmat * (yt - yp)
         tmp = y_true - y_pred
-        right_dot = tf.tensordot(invcovmat, K.transpose(tmp), axes=1)
-        res = tf.tensordot(tmp, right_dot, axes=1)
-        return tf.reshape(res, (-1,))
+        return tf.einsum('bri,ij,brj->b', tmp, invcovmat, tmp)
 
     return true_loss
 
diff --git a/n3fit/src/n3fit/backends/keras_backend/operations.py b/n3fit/src/n3fit/backends/keras_backend/operations.py
index 5bad7ef570..847168ae38 100644
--- a/n3fit/src/n3fit/backends/keras_backend/operations.py
+++ b/n3fit/src/n3fit/backends/keras_backend/operations.py
@@ -234,6 +234,13 @@ def concatenate(tensor_list, axis=-1, target_shape=None, name=None):
         return concatenated_tensor
 
 
+def stack(tensor_list, axis=0, **kwargs):
+    """ Stack a list of tensors
+    see full `docs <https://www.tensorflow.org/api_docs/python/tf/stack>`_
+    """
+    return tf.stack(tensor_list, axis=axis, **kwargs)
+
+
 # Mathematical operations
 def pdf_masked_convolution(raw_pdf, basis_mask):
     """ Computes a masked convolution of two equal pdfs
@@ -254,11 +261,8 @@ def pdf_masked_convolution(raw_pdf, basis_mask):
             rank3 (len(mask_true), xgrid, xgrid)
     """
     pdf = tf.squeeze(raw_pdf, axis=0)  # remove the batchsize
-    luminosity = tensor_product(pdf, pdf, axes=0)
-    # (xgrid, flavour, xgrid, flavour)
-    # reshape to put the flavour indices at the beginning to apply mask
-    lumi_tmp = K.permute_dimensions(luminosity, (3, 1, 2, 0))
-    pdf_x_pdf = boolean_mask(lumi_tmp, basis_mask)
+    luminosity = tf.einsum('air,bjr->jibar', pdf, pdf)
+    pdf_x_pdf = boolean_mask(luminosity, basis_mask)
     return pdf_x_pdf
 
 
diff --git a/n3fit/src/n3fit/layers/DY.py b/n3fit/src/n3fit/layers/DY.py
index 0cacff7897..3c70d052f3 100644
--- a/n3fit/src/n3fit/layers/DY.py
+++ b/n3fit/src/n3fit/layers/DY.py
@@ -60,5 +60,5 @@ def call(self, pdf_raw):
                 results.append(res)
 
         # the masked convolution removes the batch dimension
-        ret = self.operation(results)
+        ret = op.transpose(self.operation(results))
         return op.batchit(ret)
diff --git a/n3fit/src/n3fit/layers/Rotations.py b/n3fit/src/n3fit/layers/Rotations.py
index 0933d0d0d0..0ccca32c0b 100644
--- a/n3fit/src/n3fit/layers/Rotations.py
+++ b/n3fit/src/n3fit/layers/Rotations.py
@@ -64,9 +64,9 @@ class FkRotation(MetaLayer):
     # TODO: Generate a rotation matrix in the input and just do tf.tensordot in call
     # the matrix should be: (8, 14) so that we can just do tf.tensordot(pdf, rotmat, axes=1)
     # i.e., create the matrix and inherit from the Rotation layer above
-    def __init__(self, output_dim=14, **kwargs):
+    def __init__(self, output_dim=14, name="evolution", **kwargs):
         self.output_dim = output_dim
-        super().__init__(**kwargs, name="evolution")
+        super().__init__(name, **kwargs)
 
     def call(self, pdf_raw):
         # Transpose the PDF so that the flavour index is the first one
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 03857e4fbd..55af577085 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -131,7 +131,7 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
 
     # Prepare a concatenation as experiments are one single entity formed by many datasets
     def gen_concat(name):
-        return operations.as_layer(operations.concatenate, op_kwargs={"axis": 1}, name=name)
+        return operations.as_layer(operations.concatenate, op_kwargs={"axis": 2}, name=name)
 
     # Tensorflow operations have ugly name,
     # we want the final observables to be named just {spec_name} (with'val/exp' if needed)
@@ -360,6 +360,7 @@ def pdfNN_layer_generator(
     regularizer=None,
     regularizer_args=None,
     impose_sumrule=False,
+    parallel_models=1,
 ):  # pylint: disable=too-many-locals
     """
     Generates the PDF model which takes as input a point in x (from 0 to 1)
@@ -442,6 +443,7 @@ def pdfNN_layer_generator(
         model_pdf: n3fit.backends.MetaModel
             a model f(x) = y where x is a tensor (1, xgrid, 1) and y a tensor (1, xgrid, out)
     """
+    # Parse the input configuration
     if nodes is None:
         nodes = [15, 8]
     ln = len(nodes)
@@ -462,84 +464,90 @@ def pdfNN_layer_generator(
         raise ValueError(
             "Number of activation functions does not match number of layers @ model_gen.py"
         )
-    # The number of nodes in the last layer is equal to the number of fitted flavours (== len(flav_info))
     last_layer_nodes = nodes[-1]
 
-    if layer_type == "dense":
-        reg = regularizer_selector(regularizer, **regularizer_args)
-        list_of_pdf_layers = generate_dense_network(
-            inp,
-            nodes,
-            activations,
-            initializer_name,
-            seed=seed,
-            dropout_rate=dropout,
-            regularizer=reg,
-        )
-    elif layer_type == "dense_per_flavour":
-        # Define the basis size attending to the last layer in the network
-        # TODO: this information should come from the basis information
-        #       once the basis information is passed to this class
-        list_of_pdf_layers = generate_dense_per_flavour_network(
-            inp, nodes, activations, initializer_name, seed=seed, basis_size=last_layer_nodes,
-        )
+    # Generate the generic layers
+
+    # Prepare the input for the PDF model
+    placeholder_input = Input(shape=(None, 1), batch_size=1)
 
     # If the input is of type (x, logx)
     # create a x --> (x, logx) layer to preppend to everything
     if inp == 2:
         add_log = Lambda(lambda x: operations.concatenate([x, operations.op_log(x)], axis=-1))
 
-    def dense_me(x):
-        """Takes an input tensor `x` and applies all layers
-        from the `list_of_pdf_layers` in order"""
-        if inp == 1:
-            curr_fun = list_of_pdf_layers[0](x)
-        else:
-            curr_fun = list_of_pdf_layers[0](add_log(x))
-
-        for dense_layer in list_of_pdf_layers[1:]:
-            curr_fun = dense_layer(curr_fun)
-        return curr_fun
-
-    # Preprocessing layer (will be multiplied to the last of the denses)
-    preproseed = seed + number_of_layers
-    layer_preproc = Preprocessing(
-        input_shape=(1,),
-        name="pdf_prepro",
-        flav_info=flav_info,
-        seed=preproseed,
-        output_dim=last_layer_nodes
-    )
+    # Evolution layer
+    layer_evln = FkRotation(input_shape=(last_layer_nodes,), output_dim=out)
+
     # Basis rotation
     basis_rotation = FlavourToEvolution(flav_info=flav_info, fitbasis=fitbasis)
 
-    # Evolution layer
-    layer_evln = FkRotation(input_shape=(last_layer_nodes,), output_dim=out)
+    integrator_input = None # TODO
+    pdf_models = []
+
+    # Now we need a trainable network per model to be trained in parallel
+    for i in range(parallel_models):
+        layer_seed = seed + i*number_of_layers
+        if layer_type == "dense":
+            reg = regularizer_selector(regularizer, **regularizer_args)
+            list_of_pdf_layers = generate_dense_network(
+                inp,
+                nodes,
+                activations,
+                initializer_name,
+                seed=seed,
+                dropout_rate=dropout,
+                regularizer=reg,
+            )
+        elif layer_type == "dense_per_flavour":
+            # Define the basis size attending to the last layer in the network
+            # TODO: this information should come from the basis information
+            #       once the basis information is passed to this class
+            list_of_pdf_layers = generate_dense_per_flavour_network(
+                inp, nodes, activations, initializer_name, seed=layer_seed, basis_size=last_layer_nodes,
+            )
 
 
-    # Apply preprocessing and basis
-    def layer_fitbasis(x):
-        ret = operations.op_multiply([dense_me(x), layer_preproc(x)])
-        if basis_rotation.is_identity():
-            # if we don't need to rotate basis we don't want spurious layers
-            return ret
-        return basis_rotation(ret)
+        def dense_me(x):
+            """Takes an input tensor `x` and applies all layers
+            from the `list_of_pdf_layers` in order"""
+            if inp == 1:
+                curr_fun = list_of_pdf_layers[0](x)
+            else:
+                curr_fun = list_of_pdf_layers[0](add_log(x))
 
-    # Rotation layer, changes from the 8-basis to the 14-basis
-    def layer_pdf(x):
-        return layer_evln(layer_fitbasis(x))
+            for dense_layer in list_of_pdf_layers[1:]:
+                curr_fun = dense_layer(curr_fun)
+            return curr_fun
 
-    # Prepare the input for the PDF model
-    placeholder_input = Input(shape=(None, 1), batch_size=1)
+        # Preprocessing layer (will be multiplied to the last of the denses)
+        preproseed = seed + number_of_layers*(i+1)
+        layer_preproc = Preprocessing(
+            input_shape=(1,), name=f"pdf_prepro_{i}", flav_info=flav_info, seed=preproseed
+        )
 
-    # Impose sumrule if necessary
-    if impose_sumrule:
-        layer_pdf, integrator_input = msr_constraints.msr_impose(layer_fitbasis, layer_pdf, mode=impose_sumrule)
-        model_input = [integrator_input, placeholder_input]
-    else:
-        integrator_input = None
-        model_input = [placeholder_input]
+        # Apply preprocessing and basis
+        def layer_fitbasis(x):
+            ret = operations.op_multiply([dense_me(x), layer_preproc(x)])
+            if basis_rotation.is_identity():
+                # if we don't need to rotate basis we don't want spurious layers
+                return ret
+            return basis_rotation(ret)
+
+        # Rotation layer, changes from the 8-basis to the 14-basis
+        def layer_pdf(x):
+            return layer_evln(layer_fitbasis(x))
+
+        # Impose sumrule if necessary #TODO still a lot of repetition going on inside the MSR, but not important -for now-
+        if impose_sumrule:
+            layer_pdf, integrator_input = msr_constraints.msr_impose(layer_fitbasis, layer_pdf, mode=impose_sumrule)
+            model_input = [integrator_input, placeholder_input]
+        else:
+            integrator_input = None
+            model_input = [placeholder_input]
+
+        pdf_model = MetaModel(model_input, layer_pdf(placeholder_input), name=f"PDF_{i}")
 
-    pdf_model = MetaModel(model_input, layer_pdf(placeholder_input), name="PDF")
+        pdf_models.append(pdf_model)
 
-    return pdf_model
+    return pdf_models
diff --git a/n3fit/src/n3fit/msr.py b/n3fit/src/n3fit/msr.py
index 1f552888f1..e7d706131d 100644
--- a/n3fit/src/n3fit/msr.py
+++ b/n3fit/src/n3fit/msr.py
@@ -37,7 +37,7 @@ def gen_integration_input(nx):
     return xgrid, weights_array
 
 
-def msr_impose(fit_layer, final_pdf_layer, mode='All', verbose=False):
+def msr_impose(fit_layer, final_pdf_layer, mode='All', xgrid_input=None, verbose=False):
     """
         This function receives:
             - fit_layer: the 8-basis layer of PDF which we fit
@@ -64,7 +64,8 @@ def pdf_integrand(x):
     normalizer = MSR_Normalization(input_shape=(8,), mode=mode)
 
     # 5. Make the xgrid numpy array into a backend input layer so it can be given
-    xgrid_input = operations.numpy_to_input(xgrid)
+    if xgrid_input is None:
+        xgrid_input = operations.numpy_to_input(xgrid)
     normalization = normalizer(integrator(pdf_integrand(xgrid_input)))
 
     def ultimate_pdf(x):
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index b12e6359ea..607d23f163 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -92,6 +92,7 @@ def performfit(
     hyperopt=None,
     debug=False,
     maxcores=None,
+    parallel_models=1,
 ):
     """
         This action will (upon having read a validcard) process a full PDF fit for a given replica.
@@ -243,7 +244,11 @@ def performfit(
             kfold_parameters=kfold_parameters,
             max_cores=maxcores,
             model_file=fitting.get("load"),
+<<<<<<< HEAD
             sum_rules=fitting.get("sum_rules", True)
+=======
+            parallel_models=parallel_models
+>>>>>>> 6e0fc5f2c (fit many models at once)
         )
 
         # This is just to give a descriptive name to the fit function
@@ -297,7 +302,7 @@ def performfit(
 
         # After the fit is run we get a 'result' dictionary with the following items:
         stopping_object = result["stopping_object"]
-        pdf_model = result["pdf_model"]
+        pdf_models = result["pdf_models"]
         true_chi2 = result["loss"]
         training = result["training"]
         log.info("Total exp chi2: %s", true_chi2)
@@ -317,31 +322,38 @@ def performfit(
             )
         )
 
-        # Create a pdf instance
-        pdf_instance = N3PDF(pdf_model, fit_basis=fitting.get("basis"))
+        final_time = stopwatch.stop()
 
-        # Generate the writer wrapper
-        writer_wrapper = WriterWrapper(
-            replica_number,
-            pdf_instance,
-            stopping_object,
-            theoryid.get_description().get("Q0") ** 2,
-            stopwatch.stop(),
-        )
+        for i, pdf_model in enumerate(pdf_models):
+            # Each model goes into its own replica folder
+            replica_path_set = replica_path / f"replica_{replica_number + i}"
 
-        # Now write the data down
-        training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
-        writer_wrapper.write_data(
-            replica_path_set, output_path.name, training_chi2, val_chi2, true_chi2
-        )
+            # Create a pdf instance
+            pdf_instance = N3PDF(pdf_model, fit_basis=fitting.get("basis"))
+
+            # Generate the writer wrapper
+            writer_wrapper = WriterWrapper(
+                replica_number,
+                pdf_instance,
+                stopping_object, # TODO
+                theoryid.get_description().get("Q0") ** 2,
+                final_time,
+            )
+
+            # Now write the data down
+            # TODO: recompute training, valudation and experimental _per_ pdfmodel
+            training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
+            writer_wrapper.write_data(
+                replica_path_set, output_path.name, training_chi2, val_chi2, true_chi2
+            )
 
-        # Save the weights to some file for the given replica
-        model_file = fitting.get("save")
-        if model_file:
-            model_file_path = replica_path_set / model_file
-            log.info(" > Saving the weights for future in %s", model_file_path)
-            # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3 though)
-            pdf_model.save_weights(str(model_file_path), save_format="h5")
+            # Save the weights to some file for the given replica
+            model_file = fitting.get("save")
+            if model_file:
+                model_file_path = replica_path_set / model_file
+                log.info(" > Saving the weights for future in %s", model_file_path)
+                # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3 though)
+                pdf_model.save_weights(str(model_file_path), save_format="h5")
 
         # If the history of weights is active then loop over it
         # rewind the state back to every step and write down the results

From 15e50eec83c8711b542284235d43dafe19072c24 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Wed, 16 Dec 2020 11:48:47 +0100
Subject: [PATCH 02/27] avoid repetitions in the sum rules

---
 n3fit/src/n3fit/model_gen.py | 21 +++++++++--------
 n3fit/src/n3fit/msr.py       | 45 ++++++++++++++++++------------------
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 55af577085..9ab97a41db 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -482,7 +482,15 @@ def pdfNN_layer_generator(
     # Basis rotation
     basis_rotation = FlavourToEvolution(flav_info=flav_info, fitbasis=fitbasis)
 
-    integrator_input = None # TODO
+    # Normalization and sum rules
+    if impose_sumrule:
+        sumrule_imposition, integrator_input = msr_constraints.msr_impose(mode=impose_sumrule)
+        model_input = [integrator_input, placeholder_input]
+    else:
+        sumrule_imposition = lambda x: x
+        integrator_input = None
+        model_input = [placeholder_input]
+
     pdf_models = []
 
     # Now we need a trainable network per model to be trained in parallel
@@ -538,15 +546,10 @@ def layer_fitbasis(x):
         def layer_pdf(x):
             return layer_evln(layer_fitbasis(x))
 
-        # Impose sumrule if necessary #TODO still a lot of repetition going on inside the MSR, but not important -for now-
-        if impose_sumrule:
-            layer_pdf, integrator_input = msr_constraints.msr_impose(layer_fitbasis, layer_pdf, mode=impose_sumrule)
-            model_input = [integrator_input, placeholder_input]
-        else:
-            integrator_input = None
-            model_input = [placeholder_input]
+        # Final PDF
+        final_pdf = sumrule_imposition(layer_fitbasis, layer_pdf)
 
-        pdf_model = MetaModel(model_input, layer_pdf(placeholder_input), name=f"PDF_{i}")
+        pdf_model = MetaModel(model_input, final_pdf(placeholder_input), name=f"PDF_{i}")
 
         pdf_models.append(pdf_model)
 
diff --git a/n3fit/src/n3fit/msr.py b/n3fit/src/n3fit/msr.py
index e7d706131d..d009e26054 100644
--- a/n3fit/src/n3fit/msr.py
+++ b/n3fit/src/n3fit/msr.py
@@ -37,7 +37,7 @@ def gen_integration_input(nx):
     return xgrid, weights_array
 
 
-def msr_impose(fit_layer, final_pdf_layer, mode='All', xgrid_input=None, verbose=False):
+def msr_impose(nx=int(2e3), basis_size=8, mode='All'):
     """
         This function receives:
             - fit_layer: the 8-basis layer of PDF which we fit
@@ -46,43 +46,44 @@ def msr_impose(fit_layer, final_pdf_layer, mode='All', xgrid_input=None, verbose
         the final_pdf layer with a normalisation by which the sum rule is imposed
     """
     # 1. Generate the fake input which will be used to integrate
-    nx = int(2e3)
     xgrid, weights_array = gen_integration_input(nx)
 
     # 2. Prepare the pdf for integration
     #    for that we need to multiply several flavours with 1/x
     division_by_x = xDivide()
 
-    def pdf_integrand(x):
-        res = operations.op_multiply([division_by_x(x), fit_layer(x)])
-        return res
-
     # 3. Now create the integration layer (the layer that will simply integrate, given some weight
     integrator = xIntegrator(weights_array, input_shape=(nx,))
 
     # 4. Now create the normalization by selecting the right integrations
-    normalizer = MSR_Normalization(input_shape=(8,), mode=mode)
+    normalizer = MSR_Normalization(input_shape=(basis_size,), mode=mode)
 
-    # 5. Make the xgrid numpy array into a backend input layer so it can be given
-    if xgrid_input is None:
-        xgrid_input = operations.numpy_to_input(xgrid)
-    normalization = normalizer(integrator(pdf_integrand(xgrid_input)))
+    # 5. Make the xgrid array into a backend input layer so it can be given to the normalization
+    xgrid_input = operations.numpy_to_input(xgrid)
 
-    def ultimate_pdf(x):
-        return operations.op_multiply_dim([final_pdf_layer(x), normalization])
+    # Now parepare a function that takes as input the 8-flavours output of the NN
+    # and the 14-flavours after the fk rotation and returns a 14-flavours normalized output
+    # note + TODO:
+    # the idea was that the normalization should always be applied at the fktable 14-flavours
+    # and always computed at the output of the NN (in case one would like to compute it differently)
+    # don't think it is a good idea anymore and should be changed to act only on the output to the fktable
+    # but will be dealt with in the future.
+                            # fitlayer        #final_pdf
+    def apply_normalization(layer_fitbasis, layer_pdf):
+        """
+            layer_fitbasis: output of the NN
+            layer_pdf: output for the fktable
+        """
 
-    if verbose:
-        #         only_int = integrator(pdf_integrand(xgrid_input))
-        #         modelito = MetaModel(xgrid_input, only_int)
-        #         result = modelito.predict(x = None, steps = 1)
+        pdf_integrand = operations.op_multiply([division_by_x(xgrid_input), layer_fitbasis(xgrid_input)])
+        normalization = normalizer(integrator(pdf_integrand))
 
-        print(" > > Generating model for the inyection layer which imposes MSR")
-        check_integration(ultimate_pdf, xgrid_input)
+        def ultimate_pdf(x):
+            return operations.op_multiply([layer_pdf(x), normalization])
 
-    # Save a reference to xgrid in ultimate_pdf, very useful for debugging
-    ultimate_pdf.ref_xgrid = xgrid_input
+        return ultimate_pdf
 
-    return ultimate_pdf, xgrid_input
+    return apply_normalization, xgrid_input
 
 
 def check_integration(ultimate_pdf, integration_input):

From b629f0a2593cb7fd10dc7f33336a02d2e75082d0 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Wed, 16 Dec 2020 12:04:03 +0100
Subject: [PATCH 03/27] fix some outstanding issues

---
 n3fit/src/n3fit/ModelTrainer.py                      | 6 ++++--
 n3fit/src/n3fit/backends/keras_backend/losses.py     | 4 ++++
 n3fit/src/n3fit/backends/keras_backend/operations.py | 2 +-
 n3fit/src/n3fit/layers/DY.py                         | 4 ++--
 n3fit/src/n3fit/tests/test_backend.py                | 3 ++-
 n3fit/src/n3fit/tests/test_fit.py                    | 1 +
 n3fit/src/n3fit/tests/test_layers.py                 | 3 ++-
 7 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index c7f4279850..c578cef048 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -843,9 +843,11 @@ def hyperparametrizable(self, params):
             models = self._model_generation(pdf_models, partition)
 
             # Only after model generation, apply possible weight file
+            # TODO: not sure whether it is a good idea that all of them start at the same point
             if self.model_file:
                 log.info("Applying model file %s", self.model_file)
-                pdf_model.load_weights(self.model_file)
+                for pdf_model in pdf_models:
+                    pdf_model.load_weights(self.model_file)
 
             if k > 0:
                 # Reset the positivity and integrability multipliers
@@ -873,7 +875,7 @@ def hyperparametrizable(self, params):
             stopping_object = Stopping(
                 validation_model,
                 reporting,
-                pdf_models[0], # TODO
+                pdf_models[0], # TODO, not stopping for now
                 total_epochs=epochs,
                 stopping_patience=stopping_epochs,
                 save_weights_each=self.save_weights_each,
diff --git a/n3fit/src/n3fit/backends/keras_backend/losses.py b/n3fit/src/n3fit/backends/keras_backend/losses.py
index 3bb23ce329..d3c8d0600d 100644
--- a/n3fit/src/n3fit/backends/keras_backend/losses.py
+++ b/n3fit/src/n3fit/backends/keras_backend/losses.py
@@ -14,6 +14,10 @@ def l_invcovmat(invcovmat_np):
     invcovmat = K.constant(invcovmat_np)
 
     def true_loss(y_true, y_pred):
+        """
+            y_true: (1, N)
+            y_pred: (1, replicas, N)
+        """
         # (yt - yp) * covmat * (yt - yp)
         tmp = y_true - y_pred
         return tf.einsum('bri,ij,brj->b', tmp, invcovmat, tmp)
diff --git a/n3fit/src/n3fit/backends/keras_backend/operations.py b/n3fit/src/n3fit/backends/keras_backend/operations.py
index 847168ae38..3e7a85ed44 100644
--- a/n3fit/src/n3fit/backends/keras_backend/operations.py
+++ b/n3fit/src/n3fit/backends/keras_backend/operations.py
@@ -250,7 +250,7 @@ def pdf_masked_convolution(raw_pdf, basis_mask):
     Parameters
     ----------
         pdf: tf.tensor
-            rank 3 (batchsize, xgrid, flavours)
+            rank 4 (batchsize, xgrid, flavours, replicas)
         basis_mask: tf.tensor
             rank  2 tensor (flavours, flavours)
             mask to apply to the pdf convolution
diff --git a/n3fit/src/n3fit/layers/DY.py b/n3fit/src/n3fit/layers/DY.py
index 3c70d052f3..5d714731ea 100644
--- a/n3fit/src/n3fit/layers/DY.py
+++ b/n3fit/src/n3fit/layers/DY.py
@@ -30,12 +30,12 @@ def call(self, pdf_raw):
         Parameters
         ----------
             pdf_in: tensor
-                rank 3 tensor (batchsize, xgrid, flavours)
+                rank 4 tensor (batchsize, xgrid, flavours, replicas)
 
         Returns
         -------
             results: tensor
-                rank 2 tensor (batchsize, ndata)
+                rank 3 tensor (batchsize, replicas, ndata)
         """
         # Hadronic observables might need splitting of the input pdf in the x dimension
         # so we have 3 different paths for this layer
diff --git a/n3fit/src/n3fit/tests/test_backend.py b/n3fit/src/n3fit/tests/test_backend.py
index 2693fd435a..1487e08271 100644
--- a/n3fit/src/n3fit/tests/test_backend.py
+++ b/n3fit/src/n3fit/tests/test_backend.py
@@ -122,7 +122,8 @@ def test_sum():
 # Tests loss functions
 def test_l_invcovmat():
     loss_f = losses.l_invcovmat(INVCOVMAT)
-    result = loss_f(T1, T2)
+    # Add a replica and batch dimension to T2
+    result = loss_f(T1, np.expand_dims(T2, [0,1]))
     y = ARR1 - ARR2
     tmp = np.dot(INVCOVMAT, y)
     reference = np.dot(y, tmp)
diff --git a/n3fit/src/n3fit/tests/test_fit.py b/n3fit/src/n3fit/tests/test_fit.py
index adfb4809a8..2c3f509405 100644
--- a/n3fit/src/n3fit/tests/test_fit.py
+++ b/n3fit/src/n3fit/tests/test_fit.py
@@ -138,6 +138,7 @@ def test_performfit_and_timing(tmp_path):
     auxiliary_performfit(tmp_path, replica=2, timing=True)
 
 
+@pytest.mark.skip(reason="Still not implemented in parallel mode")
 def test_hyperopt(tmp_path):
     # Prepare the run
     quickcard = f"hyper-{QUICKNAME}.yml"
diff --git a/n3fit/src/n3fit/tests/test_layers.py b/n3fit/src/n3fit/tests/test_layers.py
index 8ea84457da..d27616b830 100644
--- a/n3fit/src/n3fit/tests/test_layers.py
+++ b/n3fit/src/n3fit/tests/test_layers.py
@@ -163,7 +163,8 @@ def test_DY():
         fks = [i['fktable'] for i in fkdicts]
         obs_layer = layers.DY(fkdicts, fks, ope, nfl=FLAVS)
         pdf = np.random.rand(XSIZE, FLAVS)
-        kp = op.numpy_to_tensor(np.expand_dims(pdf, 0))
+        # Add batch dimension (0) and replica dimension (-1)
+        kp = op.numpy_to_tensor(np.expand_dims(pdf, [0,-1]))
         # generate the n3fit results
         result_tensor = obs_layer(kp)
         result = op.evaluate(result_tensor)

From edf697ace47c2fbf0f12b131cebe34ad1c7a4d10 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Thu, 17 Dec 2020 18:58:49 +0100
Subject: [PATCH 04/27] promote the losses to their own layers as outputs of
 predict

---
 .../n3fit/backends/keras_backend/MetaModel.py |  5 +-
 .../n3fit/backends/keras_backend/losses.py    | 76 -------------------
 .../backends/keras_backend/operations.py      | 20 +++++
 n3fit/src/n3fit/layers/losses.py              | 52 +++++++++++++
 n3fit/src/n3fit/model_gen.py                  | 49 ++++++------
 5 files changed, 97 insertions(+), 105 deletions(-)
 delete mode 100644 n3fit/src/n3fit/backends/keras_backend/losses.py
 create mode 100644 n3fit/src/n3fit/layers/losses.py

diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index fe17eaec99..d6733de8d0 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -305,10 +305,7 @@ def make_test_function(self):
         @tf.function
         def eval_fun(*args):
             predictions = self(self._parse_input(None))
-            # Concatenate the output to split them again as a list
-            ypred = tf.concat(predictions, axis=-1)
-            predspl = tf.split(ypred, lens, axis=-1)
-            loss_list = [lfun(target, pred) for target, pred, lfun in zip(tt, predspl, self.loss)]
+            loss_list = [lfun(target, pred) for target, pred, lfun in zip(tt, predictions, self.loss)]
             ret = [tf.reduce_sum(loss_list)] + loss_list
             return dict(zip(out_names, ret))
 
diff --git a/n3fit/src/n3fit/backends/keras_backend/losses.py b/n3fit/src/n3fit/backends/keras_backend/losses.py
deleted file mode 100644
index d3c8d0600d..0000000000
--- a/n3fit/src/n3fit/backends/keras_backend/losses.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""
-    Module containing a list of loss functions availables to the fitting code
-"""
-
-import tensorflow as tf
-from tensorflow.keras import backend as K
-
-
-def l_invcovmat(invcovmat_np):
-    """
-    Returns a loss function such that:
-    L = \sum_{ij} (yt - yp)_{i} invcovmat_{ij} (yt - yp)_{j}
-    """
-    invcovmat = K.constant(invcovmat_np)
-
-    def true_loss(y_true, y_pred):
-        """
-            y_true: (1, N)
-            y_pred: (1, replicas, N)
-        """
-        # (yt - yp) * covmat * (yt - yp)
-        tmp = y_true - y_pred
-        return tf.einsum('bri,ij,brj->b', tmp, invcovmat, tmp)
-
-    return true_loss
-
-
-def l_positivity(alpha=1e-7):
-    """
-    Returns L = elu(y_pred) (considers y_true as 0)
-
-    The positivity loss is computed by inverting the sign of the
-    datapoints and then applying the elu function, this function is
-        f(x) = x if x > 0
-        f(x) = alpha * (e^{x} - 1) if x < 0
-    This is done to avoid a big discontinuity in the derivative at 0 when
-    the lagrange multiplier is very big.
-    In practice this function can produce results in the range (-alpha, inf)
-    """
-
-    def true_loss(y_true, y_pred):
-        y = -y_pred
-        loss = K.elu(y, alpha=alpha)
-        res = K.sum(loss)
-        return tf.reshape(res, (-1,))
-
-    return true_loss
-
-
-def l_integrability():
-    """
-    Returns (y_pred)*(y_pred)
-    """
-
-    def true_loss(y_true, y_pred):
-        loss = K.square(y_pred)
-        res = K.sum(loss, keepdims=True)
-        return tf.reshape(res, (-1,))
-
-    return true_loss
-
-
-def l_diaginvcovmat(diaginvcovmat_np):
-    """
-    Returns a loss function such that:
-    L = sum_{i} (yt - yp)_{i} invcovmat_{ii} (yt - yp)_{i}
-    diaginvcovmat_np should be 1d
-    """
-    invcovmat = K.constant(diaginvcovmat_np)
-
-    def true_loss(y_true, y_pred):
-        tmp = y_true - y_pred
-        res = tf.tensordot(invcovmat, K.transpose(tmp * tmp), axes=1)
-        return tf.reshape(res, (-1,))
-
-    return true_loss
diff --git a/n3fit/src/n3fit/backends/keras_backend/operations.py b/n3fit/src/n3fit/backends/keras_backend/operations.py
index 3e7a85ed44..35f6a632fb 100644
--- a/n3fit/src/n3fit/backends/keras_backend/operations.py
+++ b/n3fit/src/n3fit/backends/keras_backend/operations.py
@@ -266,6 +266,7 @@ def pdf_masked_convolution(raw_pdf, basis_mask):
     return pdf_x_pdf
 
 
+
 def tensor_product(*args, **kwargs):
     """
     Computes the tensordot product between tensor_x and tensor_y
@@ -274,6 +275,14 @@ def tensor_product(*args, **kwargs):
     return tf.tensordot(*args, **kwargs)
 
 
+def einsum(equation, *args, **kwargs):
+    """
+    Computes the tensor product using einsum
+    See full `docs <https://www.tensorflow.org/api_docs/python/tf/einsum>`_
+    """
+    return tf.einsum(equation, *args, **kwargs)
+
+
 
 @tf.function(experimental_relax_shapes=True)
 def op_log(o_tensor, **kwargs):
@@ -299,6 +308,7 @@ def split(*args, **kwargs):
     """
     return tf.split(*args, **kwargs)
 
+
 def scatter_to_one(values, indices=[[1]], output_dim=14):
     """
     Like scatter_nd initialized to one instead of zero
@@ -306,3 +316,13 @@ def scatter_to_one(values, indices=[[1]], output_dim=14):
     """
     ones = np.ones(output_dim, dtype=np.float32)
     return tf.tensor_scatter_nd_update(ones, indices, values)
+
+
+@tf.function
+def backend_function(fun_name, *args, **kwargs):
+    """
+    Calls the (``fun_name``) backend function
+    see full `docs <https://keras.io/api/utils/backend_utils/>`_ for some possibilities
+    """
+    fun = getattr(K, fun_name)
+    return fun(*args, **kwargs)
diff --git a/n3fit/src/n3fit/layers/losses.py b/n3fit/src/n3fit/layers/losses.py
new file mode 100644
index 0000000000..f56dc0f461
--- /dev/null
+++ b/n3fit/src/n3fit/layers/losses.py
@@ -0,0 +1,52 @@
+"""
+    Module containg the losses to be apply to the models as layers
+"""
+
+from n3fit.backends import MetaLayer
+from n3fit.backends import operations as op
+
+class L_invcovmat(MetaLayer):
+    """
+    Loss function such that:
+    L = \sum_{ij} (yt - yp)_{i} invcovmat_{ij} (yt - yp)_{j}
+    """
+    def __init__(self, invcovmat, y_true, **kwargs):
+        self.invcovmat = op.numpy_to_tensor(invcovmat)
+        self.y_true = op.numpy_to_tensor(y_true)
+        super().__init__(**kwargs)
+
+    def call(self, y_pred, **kwargs):
+        tmp = self.y_true - y_pred
+        res = op.einsum('bri, ij, brj -> r', tmp, self.invcovmat, tmp)
+        return res
+
+class L_positivity(MetaLayer):
+    """
+    Returns L = elu(y_pred) (considers y_true as 0)
+
+    The positivity loss is computed by inverting the sign of the
+    datapoints and then applying the elu function, this function is
+        f(x) = x if x > 0
+        f(x) = alpha * (e^{x} - 1) if x < 0
+    This is done to avoid a big discontinuity in the derivative at 0 when
+    the lagrange multiplier is very big.
+    In practice this function can produce results in the range (-alpha, inf)
+    """
+    def __init__(self, alpha=1e-7, **kwargs):
+        self.alpha = alpha
+        super().__init__(**kwargs)
+
+    def call(self, y_pred, **kwargs):
+        y = -y_pred
+        loss = op.backend_function("elu", y, alpha=self.alpha)
+        # Sum over the batch and the datapoints
+        return op.sum(loss, axis=[0,-1])
+
+class L_integrability(MetaLayer):
+    """
+    Returns L = (y_pred)*(y_pred)
+    """
+    def call(self, y_pred, **kwargs):
+        y = op.backend_function("square", y_pred)
+        # Sum over the batch and the datapoints
+        return op.sum(y, axis=[0,-1])
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 9ab97a41db..9a075d548c 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -8,12 +8,11 @@
             Generates the PDF NN layer to be fitted
 """
 import n3fit.msr as msr_constraints
-from n3fit.layers import DIS, DY, Mask, ObsRotation
+from n3fit.layers import DIS, DY, Mask, ObsRotation, losses
 from n3fit.layers import Preprocessing, FkRotation, FlavourToEvolution
 
 from n3fit.backends import MetaModel, Input
 from n3fit.backends import operations
-from n3fit.backends import losses
 from n3fit.backends import MetaLayer, Lambda
 from n3fit.backends import base_layer_selector, regularizer_selector
 
@@ -130,19 +129,20 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         model_obs_ex.append(obs_layer_ex)
 
     # Prepare a concatenation as experiments are one single entity formed by many datasets
-    def gen_concat(name):
-        return operations.as_layer(operations.concatenate, op_kwargs={"axis": 2}, name=name)
+    def gen_concat():
+        return operations.as_layer(operations.concatenate, op_kwargs={"axis": 2})
 
     # Tensorflow operations have ugly name,
     # we want the final observables to be named just {spec_name} (with'val/exp' if needed)
     tr_name = spec_name
     vl_name = f"{spec_name}_val"
     ex_name = f"{spec_name}_exp"
-    concat_ex = gen_concat(ex_name)
+
+    concat_ex = gen_concat()
     # For data transformation all concatenations are the same
     if spec_dict.get("data_transformation") is None:
-        concat_tr = gen_concat(tr_name)
-        concat_vl = gen_concat(vl_name)
+        concat_tr = gen_concat()
+        concat_vl = gen_concat()
     else:
         concat_tr = concat_ex
         concat_vl = concat_ex
@@ -177,22 +177,22 @@ def experiment_layer(pdf, model_obs=model_obs_ex, concat=concat_ex, rotation=Non
 
     # Now create the model for this experiment
     full_nx = sum(dataset_xsizes)
+    loss = lambda x,y: operations.sum(y)
 
     if spec_dict["positivity"]:
         out_mask = Mask(
             c=positivity_initial,
             axis=1,
-            name=spec_name,
         )
 
-        def out_positivity(pdf_layer, datasets_out=None):
-            exp_result = experiment_layer(pdf_layer)
-            return out_mask(exp_result)
-
         if integrability:
-            loss = losses.l_integrability()
+            loss_pos = losses.L_integrability(name=spec_name)
         else:
-            loss = losses.l_positivity()
+            loss_pos = losses.L_positivity(name=spec_name)
+
+        def out_positivity(pdf_layer, datasets_out=None):
+            exp_result = experiment_layer(pdf_layer)
+            return loss_pos(out_mask(exp_result))
 
         layer_info = {
             "inputs": model_inputs,
@@ -206,42 +206,41 @@ def out_positivity(pdf_layer, datasets_out=None):
     invcovmat_vl = spec_dict["invcovmat_vl"]
     invcovmat = spec_dict["invcovmat_true"]
 
+    # Prepare the loss function
+    loss_tr = losses.L_invcovmat(invcovmat_tr, spec_dict["expdata"], name=tr_name)
+    loss_vl = losses.L_invcovmat(invcovmat_vl, spec_dict["expdata_vl"], name=vl_name)
+    loss_ex = losses.L_invcovmat(invcovmat, spec_dict["expdata_true"], name=ex_name)
+
     # Generate the loss function and rotations of the final data (if any)
     if spec_dict.get("data_transformation") is not None:
+        # TODO: I'm asuming that the diagonal covmat will work ootb, check
         # The rotation is the last layer so it should carry The Name
         obsrot_tr = ObsRotation(spec_dict.get("data_transformation"), name=tr_name)
         obsrot_vl = ObsRotation(spec_dict.get("data_transformation_vl"), name=vl_name)
-        loss_tr = losses.l_diaginvcovmat(invcovmat_tr)
-        loss_vl = losses.l_diaginvcovmat(invcovmat_vl)
     else:
         obsrot_tr = None
         obsrot_vl = None
-        loss_tr = losses.l_invcovmat(invcovmat_tr)
-        # TODO At this point we need to intercept the data and compile the loss with it
-        # then the validation must have a list of None as an output
-        loss_vl = losses.l_invcovmat(invcovmat_vl)
-    loss = losses.l_invcovmat(invcovmat)
 
     def out_tr(pdf_layer, datasets_out=None):
         exp_result = experiment_layer(
             pdf_layer, model_obs=model_obs_tr, concat=concat_tr, datasets_out=datasets_out, rotation=obsrot_tr
         )
-        return exp_result
+        return loss_tr(exp_result)
 
     def out_vl(pdf_layer, datasets_out=None):
         exp_result = experiment_layer(
             pdf_layer, model_obs=model_obs_vl, concat=concat_vl, datasets_out=datasets_out, rotation=obsrot_vl
         )
-        return exp_result
+        return loss_vl(exp_result)
 
     layer_info = {
         "inputs": model_inputs,
         "output": experiment_layer,
         "loss": loss,
         "output_tr": out_tr,
-        "loss_tr": loss_tr,
+        "loss_tr": loss,
         "output_vl": out_vl,
-        "loss_vl": loss_vl,
+        "loss_vl": loss,
         "experiment_xsize": full_nx,
     }
 

From a40f87257935250edc781b8fc082d5275ac55338 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Thu, 17 Dec 2020 19:51:18 +0100
Subject: [PATCH 05/27] added a test for the new losses

---
 n3fit/src/n3fit/tests/test_backend.py | 33 --------------------
 n3fit/src/n3fit/tests/test_losses.py  | 43 +++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 33 deletions(-)
 create mode 100644 n3fit/src/n3fit/tests/test_losses.py

diff --git a/n3fit/src/n3fit/tests/test_backend.py b/n3fit/src/n3fit/tests/test_backend.py
index 1487e08271..64355a06f1 100644
--- a/n3fit/src/n3fit/tests/test_backend.py
+++ b/n3fit/src/n3fit/tests/test_backend.py
@@ -6,7 +6,6 @@
 import functools
 import numpy as np
 from n3fit.backends import operations as op
-from n3fit.backends import losses
 
 # General parameters
 DIM = 7
@@ -117,35 +116,3 @@ def test_tensor_product():
 
 def test_sum():
     numpy_check(op.sum, np.sum, mode='single')
-
-
-# Tests loss functions
-def test_l_invcovmat():
-    loss_f = losses.l_invcovmat(INVCOVMAT)
-    # Add a replica and batch dimension to T2
-    result = loss_f(T1, np.expand_dims(T2, [0,1]))
-    y = ARR1 - ARR2
-    tmp = np.dot(INVCOVMAT, y)
-    reference = np.dot(y, tmp)
-    are_equal(result, reference)
-
-
-def test_l_positivity():
-    alpha = 1e-7
-    loss_f = losses.l_positivity(alpha=alpha)
-    result = loss_f(0.0, T1)
-
-    def elu_sum(yarr_in):
-        """ Applies Exponential Linear Unit
-        to an array and sums it up """
-        yarr = -yarr_in
-        res = 0.0
-        for y in yarr:
-            if y > 0:
-                res += y
-            else:
-                res += alpha * (np.exp(y) - 1)
-        return res
-
-    reference = elu_sum(ARR1)
-    are_equal(result, reference)
diff --git a/n3fit/src/n3fit/tests/test_losses.py b/n3fit/src/n3fit/tests/test_losses.py
new file mode 100644
index 0000000000..44a51e9fc3
--- /dev/null
+++ b/n3fit/src/n3fit/tests/test_losses.py
@@ -0,0 +1,43 @@
+"""
+    Test the losses layers
+"""
+import numpy as np
+from n3fit.layers import losses
+from n3fit.backends import operations as op
+from .test_backend import are_equal, DIM
+
+ARR1 = np.random.rand(DIM)
+ARR2 = np.random.rand(DIM)
+C = np.random.rand(DIM, DIM)
+INVCOVMAT = np.linalg.inv(C @ C.T)
+
+# Tests loss functions
+def test_l_invcovmat():
+    loss_f = losses.L_invcovmat(INVCOVMAT, ARR1)
+    # Add a replica and batch dimension to T2
+    result = loss_f(np.expand_dims(ARR2, [0, 1]))
+    y = ARR1 - ARR2
+    tmp = np.dot(INVCOVMAT, y)
+    reference = np.dot(y, tmp)
+    are_equal(result, reference)
+
+
+def test_l_positivity():
+    alpha = 1e-7
+    loss_f = losses.L_positivity(alpha=alpha)
+    result = loss_f(np.expand_dims(ARR2, [0, 1]))
+
+    def elu_sum(yarr_in):
+        """Applies Exponential Linear Unit
+        to an array and sums it up"""
+        yarr = -yarr_in
+        res = 0.0
+        for y in yarr:
+            if y > 0:
+                res += y
+            else:
+                res += alpha * (np.exp(y) - 1)
+        return res
+
+    reference = elu_sum(ARR1)
+    are_equal(result, reference)

From d20a9f3602405bcbf357c61e287f8d5abad352d9 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Thu, 17 Dec 2020 19:55:01 +0100
Subject: [PATCH 06/27] minor changes so the code continues to work

---
 n3fit/src/n3fit/ModelTrainer.py               |  2 +-
 n3fit/src/n3fit/backends/__init__.py          |  1 -
 .../n3fit/backends/keras_backend/MetaModel.py | 45 ++++++++++---------
 n3fit/src/n3fit/model_gen.py                  |  5 ++-
 4 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index c578cef048..a38c324211 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -897,7 +897,7 @@ def hyperparametrizable(self, params):
             validation_loss = stopping_object.vl_chi2
 
             # Compute experimental loss
-            exp_loss_raw = models["experimental"].compute_losses()["loss"]
+            exp_loss_raw = np.take(models["experimental"].compute_losses()["loss"], -1)
             experimental_loss = exp_loss_raw / model_dicts["experimental"]["ndata"]
 
             if self.mode_hyperopt:
diff --git a/n3fit/src/n3fit/backends/__init__.py b/n3fit/src/n3fit/backends/__init__.py
index 3370ae75a8..726a752eda 100644
--- a/n3fit/src/n3fit/backends/__init__.py
+++ b/n3fit/src/n3fit/backends/__init__.py
@@ -12,7 +12,6 @@
     regularizer_selector,
     Concatenate,
 )
-from n3fit.backends.keras_backend import losses
 from n3fit.backends.keras_backend import operations
 from n3fit.backends.keras_backend import constraints
 from n3fit.backends.keras_backend import callbacks
diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index d6733de8d0..7d36fa6386 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -123,6 +123,7 @@ def __init__(self, input_tensors, output_tensors, **kwargs):
         self.all_outputs = output_list
         self.target_tensors = None
         self.eval_fun = None
+        self.compute_losses_function = None
 
     def _parse_input(self, extra_input=None, pass_content=True):
         """ Returns the input tensors the model was compiled with.
@@ -169,34 +170,36 @@ def predict(self, x=None, **kwargs):
 
     def compute_losses(self):
         """
-        This function is the fast-equivalent to the model ``evaluate(x,y)`` method.
+        This function is equivalent to the model ``evaluate(x,y)`` method of most TensorFlow models
+        which return a dictionary of losses per output layer.
+        The losses reported in the ``evaluate`` method for n3fit are, however, summed over replicas.
+        Instead the loss we are interested in is usually the output of the model (i.e., predict)
 
-        On first call it calls ``.evaluate(return_dict=True, verbose=0)`` to force
-        the initialization of the test function.
-        Subsequent calls of this method will (when applicable)
-        directly call the internal evaluation function ``eval_fun``.
-        This bypasses the pre- and post- evaluation steps, resulting in a ~10% speed up
-        with respect to ``.evaluate(...)``
+        This function then generates a dictionary of partial losses of the model separated per replica.
+        i.e., the output for experiment {'LHC_exp'} will be an array of Nrep elements.
 
         Returns
         -------
             dict
                 a dictionary with all partial losses of the model
         """
-        if self.eval_fun is None:
-            # We still need to perform some initialization
-            if LEGACY:
-                # For TF < 2.2 we need to generate the test_function ourselves
-                self.make_test_function()
-            else:
-                return self.evaluate(return_dict=True, verbose=False)
-        if LEGACY:
-            # For tF < 2.2 we need to force the output to be a float
-            ret = self.eval_fun()
-            ret['loss'] = ret['loss'].numpy()
-            return ret
-        else:
-            return self.eval_fun()
+        # TODO might not work for TF < 2.2, we might not care either
+        if self.compute_losses_function is None:
+            out_names = [f"{i}_loss" for i in self.output_names]
+            out_names.insert(0, "loss")
+
+            # Compile a evaluation function
+            @tf.function
+            def losses_fun():
+                predictions = self(self._parse_input(None))
+                total_loss = tf.reduce_sum(predictions, axis=0)
+                ret = [total_loss] + predictions
+                return dict(zip(out_names, ret))
+
+            self.compute_losses_function = losses_fun
+
+        return self.compute_losses_function()
+
 
     def evaluate(self, x=None, y=None, **kwargs):
         """
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 9a075d548c..113c148b65 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -233,9 +233,12 @@ def out_vl(pdf_layer, datasets_out=None):
         )
         return loss_vl(exp_result)
 
+    def out_exp(pdf_layer, datasets_out=None):
+        return loss_ex(experiment_layer(pdf_layer))
+
     layer_info = {
         "inputs": model_inputs,
-        "output": experiment_layer,
+        "output": out_exp,
         "loss": loss,
         "output_tr": out_tr,
         "loss_tr": loss,

From 353bd2a29514a787858e9579dd8dcc031a61d712 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Fri, 18 Dec 2020 13:04:10 +0100
Subject: [PATCH 07/27] keep track of the validation and positivity separately

---
 n3fit/src/n3fit/ModelTrainer.py |   2 +-
 n3fit/src/n3fit/io/writer.py    |  24 +++--
 n3fit/src/n3fit/stopping.py     | 186 ++++++++++++++++++++++++--------
 3 files changed, 157 insertions(+), 55 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index a38c324211..60f8f52aac 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -875,7 +875,7 @@ def hyperparametrizable(self, params):
             stopping_object = Stopping(
                 validation_model,
                 reporting,
-                pdf_models[0], # TODO, not stopping for now
+                pdf_models,
                 total_epochs=epochs,
                 stopping_patience=stopping_epochs,
                 save_weights_each=self.save_weights_each,
diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index 7ae1fb24a0..43552c0caf 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -69,6 +69,14 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         # Check the directory exist, if it doesn't, generate it
         os.makedirs(replica_path_set, exist_ok=True)
 
+        # Get the replica status for this object
+        replica_status = self.stopping_object.get_next_replica()
+        stop_epoch = self.stopping_object.epoch_of_the_stop
+        # TODO, this is wrong, will be dealt with later
+        tr_chi2 = tr_chi2.tolist()[0]
+        true_chi2 = true_chi2.tolist()
+        vl_chi2 = vl_chi2.tolist()[0]
+
         # export PDF grid to file
         storefit(
             self.pdf_object,
@@ -84,7 +92,7 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
             integrability_numbers,
             allchi2_lines,
             preproc_lines,
-            self.stopping_object.positivity_status(),
+            replica_status.positivity_status,
             self.timings,
         )
 
@@ -92,18 +100,18 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         # export all metadata from the fit to a single yaml file
         output_file = f"{replica_path_set}/{fitname}.json"
         json_dict = jsonfit(
-            self.stopping_object, self.pdf_object, tr_chi2, vl_chi2, true_chi2, self.timings
+            replica_status, self.pdf_object, tr_chi2, true_chi2, stop_epoch, self.timings
         )
         with open(output_file, "w") as fs:
             json.dump(json_dict, fs, indent=2)
 
 
-def jsonfit(stopping_object, pdf_object, tr_chi2, vl_chi2, true_chi2, timing):
+def jsonfit(replica_status, pdf_object, tr_chi2, true_chi2, epoch_stop, timing):
     """Generates a dictionary containing all relevant metadata for the fit
 
     Parameters
     ----------
-        stopping_object: n3fit.stopping.Stopping
+        replica_status: n3fit.stopping.ReplicaBest
             a stopping.Validation object
         pdf_object: n3fit.vpinterface.N3PDF
             N3PDF object constructed from the pdf_model
@@ -121,12 +129,12 @@ def jsonfit(stopping_object, pdf_object, tr_chi2, vl_chi2, true_chi2, timing):
     # Generate preprocessing information
     all_info["preprocessing"] = pdf_object.get_preprocessing_factors()
     # .fitinfo-like info
-    all_info["stop_epoch"] = stopping_object.stop_epoch
-    all_info["best_epoch"] = stopping_object.e_best_chi2
+    all_info["stop_epoch"] = epoch_stop
+    all_info["best_epoch"] = replica_status.best_epoch
     all_info["erf_tr"] = tr_chi2
-    all_info["erf_vl"] = vl_chi2
+    all_info["erf_vl"] = replica_status.best_vl
     all_info["chi2"] = true_chi2
-    all_info["pos_state"] = stopping_object.positivity_status()
+    all_info["pos_state"] = replica_status.positivity_status
     all_info["arc_lengths"] = pdf_object.compute_arclength().tolist()
     all_info["integrability"] = pdf_object.integrability_numbers().tolist()
     all_info["timing"] = timing
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index 6773cd6c1c..c881a13658 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -117,7 +117,7 @@ def parse_losses(history_object, data, suffix="loss"):
     total_points = 0
     total_loss = 0
     for exp_name, npoints in data.items():
-        loss = np.take(hobj[exp_name + f"_{suffix}"], -1)
+        loss = np.array(hobj[exp_name + f"_{suffix}"])
         dict_chi2[exp_name] = loss / npoints
         total_points += npoints
         total_loss += loss
@@ -133,7 +133,7 @@ def parse_losses(history_object, data, suffix="loss"):
 
 class FitState:
     """
-        Holds the state of the chi2 during the fit.
+        Holds the state of the chi2 during the fit for all replicas
 
         It holds the necessary information to reload the fit
         to a specific point in time if we are interested on reloading
@@ -163,11 +163,6 @@ def __init__(self, all_tr_chi2, all_vl_chi2, info):
         self.weights = None
         self.best_epoch = 0
 
-    def register_weigths(self, weights, best_epoch):
-        """ Save the current best weights and best_epoch of the fit """
-        self.weights = weights
-        self.best_epoch = best_epoch
-
     @property
     def vl_chi2(self):
         """ Returns the total validation chi2 """
@@ -178,10 +173,56 @@ def tr_chi2(self):
         """ Returns the total training chi2 """
         return self.all_tr_chi2["total"]
 
+    def vl_chi2_for_replica(self, i):
+        """ Returns the validation_chi2 for a given replica """
+        return self.all_vl_chi2["total"][i]
+
     def __str__(self):
         return f"chi2: tr={self.tr_chi2} vl={self.vl_chi2}"
 
 
+class ReplicaBest:
+    """ Extra complication which eventually will be merged with someone else
+    but it is here only for development."""
+
+    def __init__(self, pdf_model):
+        self._pdf_model = pdf_model
+        self._weights = None
+        self._best_epoch = None
+        self._best_vl_chi2 = INITIAL_CHI2
+
+    def positivity_pass(self):
+        """ By definition, if we have a ``best_epoch`` then positivity passed """
+        if self._best_epoch is None:
+            return False
+        else:
+            return True
+
+    @property
+    def best_epoch(self):
+        return self._best_epoch
+
+    @property
+    def best_vl(self):
+        return float(self._best_vl_chi2)
+
+    @property
+    def positivity_status(self):
+        if self.positivity_pass():
+            return POS_OK
+        else:
+            return POS_BAD
+
+    def register_best(self, chi2, epoch):
+        self._weights = self._pdf_model.get_weights()
+        self._best_epoch = epoch
+        self._best_vl_chi2 = chi2
+
+    def reload(self):
+        if self._weights:
+            self._pdf_model.set_weights(self._weights)
+
+
 class FitHistory:
     """
         Keeps a list of FitState items holding the full history of the fit.
@@ -194,20 +235,24 @@ class FitHistory:
 
         Parameters
         ----------
-            pdf_model: n3fit.backends.MetaModel
-                PDF model being trained, used to saved the weights and compute
-                more
+            pdf_models: n3fit.backends.MetaModel
+                list of PDF models being trained, used to saved the weights
+
             save_weights_each: int
                 if given, it will save a snapshot of the fit every  `save_weights_each` epochs
     """
+    def __init__(self, pdf_models, save_weights_each=None):
+        # Save a list of status per replica
+        self._replicas = []
+        for pdf_model in pdf_models:
+            self._replicas.append(ReplicaBest(pdf_model))
+        # Save a list of status for the entire fit
+        self._history = []
 
-    def __init__(self, pdf_model, save_weights_each=None):
-        self._pdf_model = pdf_model
         self._save_weights_each = save_weights_each
         # Initialize variables for the history
         self._weights = None
         self._best_epoch = None
-        self._history = []
         self.final_epoch = None
         # Initialize variables for the snapshots
         self.reloadable_history = []
@@ -257,6 +302,20 @@ def best_tr(self):
         else:
             return self._history[self.final_epoch].tr_chi2
 
+    def save_best_replica(self, i, epoch = None):
+        """ Save the state of replica ``i`` as a best fit so far.
+        If an epoch is given, save the best as the given epoch, otherwise
+        use the last one
+        """
+        if epoch is None:
+            epoch = self.final_epoch
+        chi2 = self._history[epoch].vl_chi2[i]
+        self._replicas[i].register_best(chi2, epoch)
+
+    def all_best_vl(self):
+        """ Returns the best validation chi2 for each replica """
+        return [i.best_vl for i in self._replicas]
+
     def register(self, fitstate, epoch):
         """ Save a new fitstate and updates the current final epoch
         Every `save_weights_each` (if set) saves a snapshot of the current best fit into
@@ -264,8 +323,9 @@ def register(self, fitstate, epoch):
 
         Parameters
         ----------
-            `fitstate`
-                a fitstate object to save
+            fitstate: FitState
+                FitState object 
+                the fitstate of the object to save
             `epoch`
                 the current epoch of the fit
         """
@@ -273,19 +333,22 @@ def register(self, fitstate, epoch):
         self._history.append(fitstate)
         if self._save_weights_each:
             save_here = (epoch + 1) % self._save_weights_each
-            if save_here == 0:
-                fitstate.register_weigths(self._weights, self.best_epoch)
-                self.reloadable_history.append(fitstate)
+            # TODO this must be done differently now
+#             if save_here == 0:
+#                 fitstate.register_weigths(self._weights, self.best_epoch)
+#                 self.reloadable_history.append(fitstate)
 
     def reload(self, weights=None):
         """ Reloads the best fit weights into the model
         if there are models to be reloaded
         A set of weights can be enforced as an optional argument
         """
+        # TODO the weights part is tricky
         if weights is None:
-            weights = self._weights
-        if weights:
-            self._pdf_model.set_weights(weights)
+            for replica in self._replicas:
+                replica.reload()
+#         if weights:
+#             self._pdf_model.set_weights(weights)
 
     def rewind(self, step):
         """ Rewind the FitHistory object to the step `step` in the fit
@@ -308,13 +371,13 @@ class Stopping:
         Parameters
         ----------
             validation_model: n3fit.backends.MetaModel
-                the model with the validation mask applied
-                (and compiled with the validation data and covmat)
+               the model with the validation mask applied
+               (and compiled with the validation data and covmat)
             all_data_dict: dict
-                list containg all dictionaries containing all information about
-                the experiments/validation/regularizers/etc to be parsed by Stopping
-            pdf_model: n3fit.backends.MetaModel
-                pdf_model being trained
+               list containg all dictionaries containing all information about
+               the experiments/validation/regularizers/etc to be parsed by Stopping
+            pdf_models: list(n3fit.backends.MetaModel)
+               list of pdf_models being trained
             threshold_positivity: float
                maximum value allowed for the sum of all positivity losses
             total_epochs: int
@@ -331,7 +394,7 @@ def __init__(
         self,
         validation_model,
         all_data_dicts,
-        pdf_model,
+        pdf_models,
         threshold_positivity=1e-6,
         total_epochs=0,
         stopping_patience=7000,
@@ -348,9 +411,11 @@ def __init__(
         else:
             self.validation = Validation(validation_model, vl_ndata)
         self.positivity = Positivity(threshold_positivity, pos_sets)
-        self.history = FitHistory(pdf_model, save_weights_each=save_weights_each)
+
+        self.history = FitHistory(pdf_models)
 
         # Initialize internal variables for the stopping
+        self.n_replicas = len(pdf_models)
         self.threshold_chi2 = threshold_chi2
         self.dont_stop = dont_stop
         self.stop_now = False
@@ -358,6 +423,7 @@ def __init__(
         self.stopping_degree = 0
         self.count = 0
         self.total_epochs = total_epochs
+        self.replica_iterator = None
 
     @property
     def vl_chi2(self):
@@ -416,45 +482,67 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
 
         Parameters
         ----------
-            `training_info`
-                the output of a .fit() run
-            `epoch`
-                the index of the epoch
+            training_info: dict
+                output of a .fit() call, dictionary of the total loss (summed over replicas) for 
+                each experiment
+            epoch: int
+                index of the epoch
 
         Returns
         -------
-            `pass_ok`
+            pass_ok: bool
                 true/false according to the status of the run
         """
         # Step 1. Preprocess the event, count it towards the stopping degree
         #         parse the training information and check whether it is a good point
         tr_chi2, all_tr = parse_losses(training_info, self._tr_ndata)
-
+        
         if np.isnan(tr_chi2):
             log.warning(" > NaN found, stopping activated")
             self.stop_now = True
             # If we had a good model at any point, reload
-            self.history.reload()
+            self.history.reload() # TODO
             return False
 
         self.stopping_degree += self.count
 
         # Step 2. Check the validation loss at this point
+        # each loss is an array of the loss per replica
         vl_chi2, all_vl = self.validation.loss()
 
         # Step 3. Store information about the run and print stats if asked
         fitstate = FitState(all_tr, all_vl, self.validation.state)
         self.history.register(fitstate, epoch)
+
         if print_stats:
             self.print_current_stats(epoch, fitstate)
 
         # Step 4. Check whether this is a better fit
         #         this means improving vl_chi2 and passing positivity
+
+        # Get the values that pass validation
+        passes_val = vl_chi2 < self.threshold_chi2
+        passes_val &= vl_chi2 < self.history.all_best_vl()
+        # And the ones that pass positivity
+        passes_pos = self.positivity(fitstate)
+
+        # Now loop over the valid indices to check whether the vl improved
+        # TODO: check whether this loop is hurting at all performance (shouldnt???)
+        for i in np.where(passes_val & passes_pos)[0]:
+            self.history.save_best_replica(i)
+
+            # There is no stopping for now
+        return True
+        
+        # TODO for now we force all fits to get to the end
+
+
+        # For each of them, check whether the validation chi2 is better or not
         if self.positivity(fitstate) and vl_chi2 < self.threshold_chi2:
             if vl_chi2 < self.history.best_vl():
                 # Set the new best
                 self.history.best_epoch = epoch
-                # Save stopping info
+                # Reset stopping info
                 self.stopping_degree = 0
                 # Initialize the counter
                 self.count = 1
@@ -517,6 +605,12 @@ def positivity_status(self):
         else:
             return POS_BAD
 
+    def get_next_replica(self):
+        """ Return the next ReplicaBest object"""
+        if self.replica_iterator is None:
+            self.replica_iterator = iter(self.history._replicas)
+        return next(self.replica_iterator)
+
     def chi2exps_str(self, log_each=100):
         """
         Returns a list of log-string with the status of the fit
@@ -593,10 +687,10 @@ def _compute_validation_loss(self):
 
         Returns
         -------
-            `total_loss`
+            total_loss: float
                 total vale for the validation loss
-            `vl_dict`
-                dictionary containing a map of experiment names and loss
+            vl_dict: dict
+                dictionary containing a map of experiment names and their loss per replica
         """
         loss_dict = self.model.compute_losses()
         self.state = loss_dict
@@ -638,20 +732,20 @@ def check_positivity(self, history_object):
 
         If the positivity loss is above the threshold, the positivity fails
         otherwise, it passes.
+        It returns an array booleans which are True if positivity passed
 
         Parameters
         ----------
             history_object: dict
                 dictionary of entries in the form  {'name': loss}, output of a MetaModel .fit()
         """
+        positivity_pass = True
         for key in self.positivity_sets:
             key_loss = f"{key}_loss"
-            # If we are taking the avg when checking the output, we should do so here as well?
-            positivity_loss = np.take(history_object[key_loss], -1)
-            if positivity_loss > self.threshold:
-                return False
-        # If none of the positivities failed, it passes
-        return True
+            positivity_pass &= history_object[key_loss] < self.threshold
+            if not all(positivity_pass):
+                return np.array(positivity_pass)
+        return np.array(positivity_pass)
 
     def __call__(self, fitstate):
         """

From c4c6d26a43df9be79c2a532bff0babd68c16809a Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Fri, 18 Dec 2020 14:22:09 +0100
Subject: [PATCH 08/27] minimal working example

---
 n3fit/src/n3fit/ModelTrainer.py | 2 +-
 n3fit/src/n3fit/io/writer.py    | 6 +++---
 n3fit/src/n3fit/performfit.py   | 2 +-
 n3fit/src/n3fit/stopping.py     | 4 +++-
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index 60f8f52aac..73329ae617 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -762,7 +762,7 @@ def evaluate(self, stopping_object):
         experimental = self.model_dicts["experimental"]
         train_chi2 = stopping_object.evaluate_training(training["model"])
         val_chi2, _ = stopping_object.validation.loss()
-        exp_chi2 = experimental["model"].compute_losses()["loss"] / experimental["ndata"]
+        exp_chi2 = np.array(experimental["model"].compute_losses()["loss"]) / experimental["ndata"]
         return train_chi2, val_chi2, exp_chi2
 
     def hyperparametrizable(self, params):
diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index 43552c0caf..1f69f04878 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -70,11 +70,11 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         os.makedirs(replica_path_set, exist_ok=True)
 
         # Get the replica status for this object
-        replica_status = self.stopping_object.get_next_replica()
+        ii, replica_status = self.stopping_object.get_next_replica()
         stop_epoch = self.stopping_object.epoch_of_the_stop
         # TODO, this is wrong, will be dealt with later
-        tr_chi2 = tr_chi2.tolist()[0]
-        true_chi2 = true_chi2.tolist()
+        tr_chi2 = tr_chi2.tolist()[ii]
+        true_chi2 = true_chi2.tolist()[ii]
         vl_chi2 = vl_chi2.tolist()[0]
 
         # export PDF grid to file
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 607d23f163..3081b76b82 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -344,7 +344,7 @@ def performfit(
             # TODO: recompute training, valudation and experimental _per_ pdfmodel
             training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
             writer_wrapper.write_data(
-                replica_path_set, output_path.name, training_chi2, val_chi2, true_chi2
+                replica_path_set, output_path.name, training_chi2, val_chi2, exp_chi2
             )
 
             # Save the weights to some file for the given replica
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index c881a13658..35706f6c70 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -609,7 +609,9 @@ def get_next_replica(self):
         """ Return the next ReplicaBest object"""
         if self.replica_iterator is None:
             self.replica_iterator = iter(self.history._replicas)
-        return next(self.replica_iterator)
+            self.ii = -1
+        self.ii += 1
+        return self.ii, next(self.replica_iterator)
 
     def chi2exps_str(self, log_each=100):
         """

From 823915d36b9cd58b4362c4ee83f87ffae7902234 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Fri, 18 Dec 2020 15:05:43 +0100
Subject: [PATCH 09/27] allow only running in configurations that might work

---
 n3fit/src/n3fit/checks.py     | 12 ++++++++++++
 n3fit/src/n3fit/performfit.py |  1 +
 2 files changed, 13 insertions(+)

diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index b96c2d0117..217f367d8a 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -352,3 +352,15 @@ def check_consistent_basis(fitting, theoryid):
         raise CheckError(f"{theoryid} (intrinsic charm) is incompatible with basis {fitbasis}")
     if not theoryid.get_description()["IC"] and has_c:
         raise CheckError(f"{theoryid} (perturbative charm) is incompatible with basis {fitbasis}")
+
+
+@make_argcheck
+def can_run_in_parallel(fitting, parallel_models=1):
+    """ Checks whether a runcard which is trying to run several replicas at once (parallel_models =/= 1) is valid
+    """
+    if parallel_models == 1:
+        return
+    if fitting.get("genrep"):
+        raise CheckError("Replica generation is not supported yet for parallel models")
+    if fitting["parameters"].get("layer_type") != "dense":
+        raise CheckError("Parallelization has only been tested with layer_type=='dense'")
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 3081b76b82..69d9b1f819 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -75,6 +75,7 @@ def initialize_seeds(replica: list, trvlseed: int, nnseed: int, mcseed: int, gen
 
 # Action to be called by valid phys
 # All information defining the NN should come here in the "parameters" dict
+@n3fit.checks.can_run_in_parallel
 @n3fit.checks.check_consistent_basis
 @n3fit.checks.wrapper_check_NN
 @n3fit.checks.wrapper_hyperopt

From 765b735363872d8bd26e3a1db473301a19e71cc8 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Mon, 21 Dec 2020 17:09:31 +0100
Subject: [PATCH 10/27] dont let tensors get out of the model

---
 n3fit/src/n3fit/backends/keras_backend/MetaModel.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index 7d36fa6386..d30a3c2aef 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -9,6 +9,7 @@
 import tensorflow as tf
 from tensorflow.keras.models import Model
 from tensorflow.keras import optimizers as Kopt
+from tensorflow.python.keras.utils import tf_utils
 from n3fit.backends.keras_backend.operations import numpy_to_tensor
 
 # Check the TF version to check if legacy-mode is needed (TF < 2.2)
@@ -198,7 +199,10 @@ def losses_fun():
 
             self.compute_losses_function = losses_fun
 
-        return self.compute_losses_function()
+        ret = self.compute_losses_function()
+        # undocumented TF function that converts all the tensors from the ret dictionary to numpy arrays
+        # if it dissapears, equivalent to {k: i.numpy() for k, i in ret.items()}
+        return tf_utils.to_numpy_or_python_type(ret)
 
 
     def evaluate(self, x=None, y=None, **kwargs):

From 9fd2bbb8db7ec1d70de0dfef250f27239df95d76 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 22 Dec 2020 10:03:04 +0100
Subject: [PATCH 11/27] change stopping to accept the replica dimension

---
 n3fit/src/n3fit/ModelTrainer.py |  12 +-
 n3fit/src/n3fit/io/writer.py    |  20 +-
 n3fit/src/n3fit/performfit.py   |  22 +-
 n3fit/src/n3fit/stopping.py     | 384 ++++++++++++++------------------
 4 files changed, 196 insertions(+), 242 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index 73329ae617..fb2f75f63d 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -759,10 +759,10 @@ def evaluate(self, stopping_object):
         # Needs to receive a `stopping_object` in order to select the part of the
         # training and the validation which are actually `chi2` and not part of the penalty
         training = self.model_dicts["training"]
-        experimental = self.model_dicts["experimental"]
         train_chi2 = stopping_object.evaluate_training(training["model"])
-        val_chi2, _ = stopping_object.validation.loss()
-        exp_chi2 = np.array(experimental["model"].compute_losses()["loss"]) / experimental["ndata"]
+        val_chi2 = stopping_object.vl_chi2
+        experimental = self.model_dicts["experimental"]
+        exp_chi2 = experimental["model"].compute_losses()["loss"] / experimental["ndata"]
         return train_chi2, val_chi2, exp_chi2
 
     def hyperparametrizable(self, params):
@@ -892,8 +892,7 @@ def hyperparametrizable(self, params):
                 epochs=epochs,
             )
 
-            # Save validation and training chi2
-            training_loss = stopping_object.tr_chi2
+            # Save validation chi2
             validation_loss = stopping_object.vl_chi2
 
             # Compute experimental loss
@@ -918,13 +917,11 @@ def hyperparametrizable(self, params):
                     break
 
             # Save all losses
-            l_train.append(training_loss)
             l_valid.append(validation_loss)
             l_exper.append(experimental_loss)
 
         dict_out = {
             "status": passed,
-            "training_loss": np.average(l_train),
             "validation_loss": np.average(l_valid),
             "experimental_loss": np.average(l_exper),
         }
@@ -932,7 +929,6 @@ def hyperparametrizable(self, params):
         if self.mode_hyperopt:
             dict_out["loss"] = self.hyper_loss(l_hyper)
             dict_out["kfold_meta"] = {
-                "training_losses": l_train,
                 "validation_losses": l_valid,
                 "experimental_losses": l_exper,
                 "hyper_losses": l_hyper,
diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index 1f69f04878..e852404dde 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -69,13 +69,10 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         # Check the directory exist, if it doesn't, generate it
         os.makedirs(replica_path_set, exist_ok=True)
 
-        # Get the replica status for this object
-        ii, replica_status = self.stopping_object.get_next_replica()
         stop_epoch = self.stopping_object.epoch_of_the_stop
-        # TODO, this is wrong, will be dealt with later
-        tr_chi2 = tr_chi2.tolist()[ii]
-        true_chi2 = true_chi2.tolist()[ii]
-        vl_chi2 = vl_chi2.tolist()[0]
+
+        # Get the replica status for this object
+        _, replica_status = self.stopping_object.get_next_replica()
 
         # export PDF grid to file
         storefit(
@@ -96,14 +93,21 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
             self.timings,
         )
 
-        # TODO: compute the chi2s directly from the stopping object
         # export all metadata from the fit to a single yaml file
         output_file = f"{replica_path_set}/{fitname}.json"
         json_dict = jsonfit(
             replica_status, self.pdf_object, tr_chi2, true_chi2, stop_epoch, self.timings
         )
         with open(output_file, "w") as fs:
-            json.dump(json_dict, fs, indent=2)
+            json.dump(json_dict, fs, indent=2, cls = SuperEncoder)
+
+
+class SuperEncoder(json.JSONEncoder):
+    """ Custom json encoder to get around the fact that np.float32 =/= float """
+    def default(self, o):
+        if isinstance(o, np.float32):
+            return float(o)
+        return super().default(o)
 
 
 def jsonfit(replica_status, pdf_object, tr_chi2, true_chi2, epoch_stop, timing):
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 69d9b1f819..efa53f542d 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -324,6 +324,7 @@ def performfit(
         )
 
         final_time = stopwatch.stop()
+        all_training_chi2, all_val_chi2, all_exp_chi2 = the_model_trainer.evaluate(stopping_object)
 
         for i, pdf_model in enumerate(pdf_models):
             # Each model goes into its own replica folder
@@ -341,9 +342,12 @@ def performfit(
                 final_time,
             )
 
-            # Now write the data down
-            # TODO: recompute training, valudation and experimental _per_ pdfmodel
-            training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
+            # Get the right chi2s
+            training_chi2 = np.take(all_training_chi2, i)
+            val_chi2 = np.take(all_val_chi2, i)
+            exp_chi2 = np.take(all_exp_chi2, i)
+
+            # And write the data down
             writer_wrapper.write_data(
                 replica_path_set, output_path.name, training_chi2, val_chi2, exp_chi2
             )
@@ -358,12 +362,12 @@ def performfit(
 
         # If the history of weights is active then loop over it
         # rewind the state back to every step and write down the results
-        for step in range(len(stopping_object.history.reloadable_history)):
-            stopping_object.history.rewind(step)
-            new_path = output_path / f"history_step_{step}/replica_{replica_number}"
-            # We need to recompute the experimental chi2 for this point
-            training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
-            writer_wrapper.write_data(new_path, output_path.name, training_chi2, val_chi2, exp_chi2)
+#         for step in range(len(stopping_object.history.reloadable_history)):
+#             stopping_object.history.rewind(step)
+#             new_path = output_path / f"history_step_{step}/replica_{replica_number}"
+#             # We need to recompute the experimental chi2 for this point
+#             training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
+#             writer_wrapper.write_data(new_path, output_path.name, training_chi2, val_chi2, exp_chi2)
 
         # So every time we want to capture output_path.name and addd a history_step_X
         # parallel to the nnfit folder
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index 35706f6c70..4da39007ad 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -41,6 +41,7 @@
 # Pass/veto keys
 POS_OK = "POS_PASS"
 POS_BAD = "POS_VETO"
+THRESHOLD_POS = 1e-6
 
 
 def parse_ndata(all_data):
@@ -145,43 +146,97 @@ class FitState:
 
         Parameters
         ----------
-            all_tr_chi2: dict
-                Chi2 for all training datasets computed before the update of the weights
-            all_vl_chi2: dict
-                Chi2 for all validation datasets computed after the update of the weights
-            info: dict
-                Full return state of the Validation model
+            training_info: dict
+                all losses for the training model
+            validation_info: dict
+                all losses for the validation model
     """
+    vl_ndata = None
+    tr_ndata = None
+    vl_suffix = None
+
+    def __init__(self, training_info, validation_info):
+        if self.vl_ndata is None or self.tr_ndata is None or self.vl_suffix is None:
+            raise ValueError("FitState cannot be instantiated until vl_ndata, tr_ndata and vl_suffix are filled")
+        self.training = training_info
+        self.validation = validation_info
+        self._parsed = False
+        self._vl_chi2 = None
+        self._tr_chi2 = None
+        self._vl_dict = None
+        self._tr_dict = None
 
-    def __init__(self, all_tr_chi2, all_vl_chi2, info):
-        self.all_tr_chi2 = all_tr_chi2
-        self.all_vl_chi2 = all_vl_chi2
-        self.info = info
-        # These two variables are only filled for specific points
-        # in order to save precious memory, and only when we are
-        # saving the fit history each X number of epoch
-        self.weights = None
-        self.best_epoch = 0
+    @property
+    def vl_loss(self):
+        """Return the total validation loss as it comes from the info dictionaries"""
+        return self.validation.get("loss")
 
     @property
-    def vl_chi2(self):
-        """ Returns the total validation chi2 """
-        return self.all_vl_chi2["total"]
+    def tr_loss(self):
+        """Return the total validation loss as it comes from the info dictionaries"""
+        return self.training.get("loss")
+
+    def _parse_chi2(self):
+        """
+        Parses the chi2 from the losses according to the `tr_ndata` and
+        `vl_ndata` dictionaries of {dataset: n_points}
+        """
+        if self._parsed:
+            return
+        if self.training is not None:
+            self._tr_chi2, self._tr_dict = parse_losses(self.training, self.tr_ndata)
+        if self.validation is not None:
+            self._vl_chi2, self._vl_dict = parse_losses(self.validation, self.vl_ndata, suffix=self.vl_suffix)
 
     @property
     def tr_chi2(self):
-        """ Returns the total training chi2 """
-        return self.all_tr_chi2["total"]
+        self._parse_chi2()
+        return self._tr_chi2
 
-    def vl_chi2_for_replica(self, i):
-        """ Returns the validation_chi2 for a given replica """
-        return self.all_vl_chi2["total"][i]
+    @property
+    def vl_chi2(self):
+        self._parse_chi2()
+        return self._vl_chi2
+
+    @property
+    def all_tr_chi2(self):
+        self._parse_chi2()
+        return self._tr_dict
+
+    @property
+    def all_vl_chi2(self):
+        self._parse_chi2()
+        return self._vl_dict
+
+    def all_tr_chi2_for_replica(self, r):
+        """" Return the tr chi2 per dataset for a given replica """
+        return {k:np.take(i, r) for k,i in self.all_tr_chi2.items()}
+
+    def all_vl_chi2_for_replica(self, r):
+        """" Return the vl chi2 per dataset for a given replica """
+        return {k:np.take(i, r) for k,i in self.all_vl_chi2.items()}
+
+    def total_partial_tr_chi2(self):
+        """ Return the tr chi2 summed over replicas per experiment"""
+        return {k:np.sum(i) for k,i in self.all_tr_chi2.items()}
+
+    def total_partial_vl_chi2(self):
+        """ Return the vl chi2 summed over replicas per experiment"""
+        return {k:np.sum(i) for k,i in self.all_tr_chi2.items()}
+
+    def total_tr_chi2(self):
+        """ Return the total tr chi2 summed over replicas """
+        return np.sum(self.tr_chi2)
+
+    def total_vl_chi2(self):
+        """ Return the total vl chi2 summed over replicas """
+        return np.sum(self.vl_chi2)
 
     def __str__(self):
         return f"chi2: tr={self.tr_chi2} vl={self.vl_chi2}"
 
 
-class ReplicaBest:
+class ReplicaState:
     """ Extra complication which eventually will be merged with someone else
     but it is here only for development."""
 
@@ -241,32 +296,28 @@ class FitHistory:
             save_weights_each: int
                 if given, it will save a snapshot of the fit every  `save_weights_each` epochs
     """
-    def __init__(self, pdf_models, save_weights_each=None):
-        # Save a list of status per replica
+    def __init__(self, pdf_models, tr_ndata, vl_ndata, save_weights_each=None):
+        # Create a ReplicaState object for all models
+        # which will hold the best chi2 and weights per replica
         self._replicas = []
         for pdf_model in pdf_models:
-            self._replicas.append(ReplicaBest(pdf_model))
+            self._replicas.append(ReplicaState(pdf_model))
+
+        if vl_ndata is None:
+            vl_ndata = tr_ndata
+            vl_suffix = "loss"
+        else:
+            vl_ndata = vl_ndata
+            vl_suffix = "val_loss"
+        # All instances of FitState should use these
+        FitState.tr_ndata = tr_ndata
+        FitState.vl_ndata = vl_ndata
+        FitState.vl_suffix = vl_suffix
+
         # Save a list of status for the entire fit
         self._history = []
-
-        self._save_weights_each = save_weights_each
-        # Initialize variables for the history
-        self._weights = None
-        self._best_epoch = None
+        self.best_epoch = None
         self.final_epoch = None
-        # Initialize variables for the snapshots
-        self.reloadable_history = []
-
-    @property
-    def best_epoch(self):
-        """ Epoch of the best fit """
-        return self._best_epoch
-
-    @best_epoch.setter
-    def best_epoch(self, epoch):
-        """ Saves the current weight """
-        self._weights = self._pdf_model.get_weights()
-        self._best_epoch = epoch
 
     def get_state(self, epoch):
         """ Get the FitState of the system for a given epoch """
@@ -278,29 +329,7 @@ def best_state(self):
             return None
         else:
             index = self.best_epoch
-            best_state = self._history[index]
-            return best_state
-
-    def best_vl(self):
-        """ Returns the chi2 of the best fit
-        if there was no best fit returns `INITIAL_CHI2`
-        if there was a problem, returns `TERRIBLE_CHI2` """
-        if not self._weights:
-            return TERRIBLE_CHI2
-        best_state = self.best_state()
-        if best_state:
-            return best_state.vl_chi2
-        else:
-            return INITIAL_CHI2
-
-    def best_tr(self):
-        """ Returns the training chi2 of the best fit
-        if there are no best fit, returns the last one """
-        best_state = self.best_state()
-        if best_state:
-            return best_state.tr_chi2
-        else:
-            return self._history[self.final_epoch].tr_chi2
+            return self._history[index]
 
     def save_best_replica(self, i, epoch = None):
         """ Save the state of replica ``i`` as a best fit so far.
@@ -309,14 +338,15 @@ def save_best_replica(self, i, epoch = None):
         """
         if epoch is None:
             epoch = self.final_epoch
-        chi2 = self._history[epoch].vl_chi2[i]
+        chi2 = self._history[epoch].vl_loss[i]
         self._replicas[i].register_best(chi2, epoch)
+        return False
 
-    def all_best_vl(self):
-        """ Returns the best validation chi2 for each replica """
+    def all_best_vl_loss(self):
+        """ Returns the best validation loss for each replica """
         return [i.best_vl for i in self._replicas]
 
-    def register(self, fitstate, epoch):
+    def register(self, epoch, training_info, validation_info):
         """ Save a new fitstate and updates the current final epoch
         Every `save_weights_each` (if set) saves a snapshot of the current best fit into
         the fitstate
@@ -329,14 +359,11 @@ def register(self, fitstate, epoch):
             `epoch`
                 the current epoch of the fit
         """
+        # Save all the information in a fitstate object
+        fitstate = FitState(training_info, validation_info)
         self.final_epoch = epoch
         self._history.append(fitstate)
-        if self._save_weights_each:
-            save_here = (epoch + 1) % self._save_weights_each
-            # TODO this must be done differently now
-#             if save_here == 0:
-#                 fitstate.register_weigths(self._weights, self.best_epoch)
-#                 self.reloadable_history.append(fitstate)
+        return fitstate
 
     def reload(self, weights=None):
         """ Reloads the best fit weights into the model
@@ -347,8 +374,6 @@ def reload(self, weights=None):
         if weights is None:
             for replica in self._replicas:
                 replica.reload()
-#         if weights:
-#             self._pdf_model.set_weights(weights)
 
     def rewind(self, step):
         """ Rewind the FitHistory object to the step `step` in the fit
@@ -395,28 +420,27 @@ def __init__(
         validation_model,
         all_data_dicts,
         pdf_models,
-        threshold_positivity=1e-6,
+        threshold_positivity=THRESHOLD_POS,
         total_epochs=0,
         stopping_patience=7000,
         threshold_chi2=10.0,
         dont_stop=False,
         save_weights_each=None,
     ):
-        # Parse the training, validation and positivity sets from all the input dictionaries
-        self._tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts)
+        # Save the validation object
+        self._validation = validation_model
 
-        # Create the Validation, Positivity and History objects
-        if vl_ndata is None:
-            self.validation = Validation(validation_model, self._tr_ndata, no_validation=True)
-        else:
-            self.validation = Validation(validation_model, vl_ndata)
-        self.positivity = Positivity(threshold_positivity, pos_sets)
+        # Create the History object
+        tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts)
+        self.history = FitHistory(pdf_models, tr_ndata, vl_ndata)
 
-        self.history = FitHistory(pdf_models)
+        # And the positivity checker
+        self._positivity = Positivity(threshold_positivity, pos_sets)
 
         # Initialize internal variables for the stopping
         self.n_replicas = len(pdf_models)
         self.threshold_chi2 = threshold_chi2
+
         self.dont_stop = dont_stop
         self.stop_now = False
         self.stopping_patience = stopping_patience
@@ -427,13 +451,10 @@ def __init__(
 
     @property
     def vl_chi2(self):
-        """ Validation chi2 """
-        return self.history.best_vl()
-
-    @property
-    def tr_chi2(self):
-        """ Training chi2 """
-        return self.history.best_tr()
+        """ Current validation chi2 """
+        validation_info = self._validation.compute_losses()
+        fitstate = FitState(None, validation_info)
+        return fitstate.vl_chi2
 
     @property
     def e_best_chi2(self):
@@ -465,8 +486,8 @@ def evaluate_training(self, training_model):
                 chi2 of the given ``training_model``
         """
         training_info = training_model.compute_losses()
-        tr_chi2, _ = parse_losses(training_info, self._tr_ndata)
-        return tr_chi2
+        fitstate = FitState(training_info, None)
+        return fitstate.tr_chi2
 
     def monitor_chi2(self, training_info, epoch, print_stats=False):
         """
@@ -483,7 +504,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         Parameters
         ----------
             training_info: dict
-                output of a .fit() call, dictionary of the total loss (summed over replicas) for 
+                output of a .fit() call, dictionary of the total loss (summed over replicas) for
                 each experiment
             epoch: int
                 index of the epoch
@@ -493,62 +514,44 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
             pass_ok: bool
                 true/false according to the status of the run
         """
-        # Step 1. Preprocess the event, count it towards the stopping degree
-        #         parse the training information and check whether it is a good point
-        tr_chi2, all_tr = parse_losses(training_info, self._tr_ndata)
-        
-        if np.isnan(tr_chi2):
+        # Step 1. Check whether the fit has NaN'd and stop it if so
+        if np.isnan(training_info["loss"]):
             log.warning(" > NaN found, stopping activated")
             self.stop_now = True
             # If we had a good model at any point, reload
             self.history.reload() # TODO
             return False
 
-        self.stopping_degree += self.count
-
-        # Step 2. Check the validation loss at this point
-        # each loss is an array of the loss per replica
-        vl_chi2, all_vl = self.validation.loss()
+        # Step 2. Compute the validation metrics
+        validation_info = self._validation.compute_losses()
 
-        # Step 3. Store information about the run and print stats if asked
-        fitstate = FitState(all_tr, all_vl, self.validation.state)
-        self.history.register(fitstate, epoch)
-
-        if print_stats:
-            self.print_current_stats(epoch, fitstate)
+        # Step 3. Register the current point in (the) history
+        fitstate = self.history.register(epoch, training_info, validation_info)
 
         # Step 4. Check whether this is a better fit
         #         this means improving vl_chi2 and passing positivity
-
-        # Get the values that pass validation
-        passes_val = vl_chi2 < self.threshold_chi2
-        passes_val &= vl_chi2 < self.history.all_best_vl()
+        passes_val = fitstate.vl_loss < self.threshold_chi2
+        passes_val &= fitstate.vl_loss < self.history.all_best_vl_loss()
         # And the ones that pass positivity
-        passes_pos = self.positivity(fitstate)
+        passes_pos = self._positivity(fitstate)
+        passes = passes_val & passes_pos
 
-        # Now loop over the valid indices to check whether the vl improved
-        # TODO: check whether this loop is hurting at all performance (shouldnt???)
-        for i in np.where(passes_val & passes_pos)[0]:
-            self.history.save_best_replica(i)
+        self.stopping_degree += self.count
 
-            # There is no stopping for now
-        return True
-        
-        # TODO for now we force all fits to get to the end
+        # Step 5. loop over the valid indices to check whether the vl improved
+        stop_here = all(passes)
+        for i in np.where(passes)[0]:
+            stop_here &= self.history.save_best_replica(i)
+            self.stopping_degree = 0
+            self.count = 1
 
+        if self.stopping_degree > self.stopping_patience:
+            stop_here = True
 
-        # For each of them, check whether the validation chi2 is better or not
-        if self.positivity(fitstate) and vl_chi2 < self.threshold_chi2:
-            if vl_chi2 < self.history.best_vl():
-                # Set the new best
-                self.history.best_epoch = epoch
-                # Reset stopping info
-                self.stopping_degree = 0
-                # Initialize the counter
-                self.count = 1
+        if print_stats:
+            self.print_current_stats(epoch, fitstate)
 
-        # If your patience has ended, prepare for stop
-        if self.stopping_degree > self.stopping_patience:
+        if stop_here:
             self.make_stop()
         return True
 
@@ -561,20 +564,21 @@ def make_stop(self):
 
     def print_current_stats(self, epoch, fitstate):
         """
-            Prints the last validation and training loss saved
+            Prints ``fitstate`` training and validation chi2s
         """
         epoch_index = epoch + 1
-        tr_loss = fitstate.tr_chi2
-        vl_loss = fitstate.vl_chi2
-        total_str = f"At epoch {epoch_index}/{self.total_epochs}, total loss: {tr_loss}\n"
-
-        partials = []
-        for experiment in self._tr_ndata:
-            chi2 = fitstate.all_tr_chi2[experiment]
-            partials.append(f"{experiment}: {chi2:.3f}")
-        total_str += ", ".join(partials)
-
-        total_str += f"\nValidation loss at this point: {vl_loss}"
+        tr_chi2 = fitstate.total_tr_chi2()
+        vl_chi2 = fitstate.total_vl_chi2()
+        total_str = f"At epoch {epoch_index}/{self.total_epochs}, total loss: {tr_chi2}\n"
+
+        # The partial chi2 makes no sense for more than one replica at once:
+        if self.n_replicas == 1:
+            partial_tr_chi2 = fitstate.total_partial_tr_chi2()
+            partials = []
+            for experiment, chi2 in partial_tr_chi2.items():
+                partials.append(f"{experiment}: {chi2:.3f}")
+            total_str += ", ".join(partials) + "\n"
+        total_str += f"Validation loss at this point: {vl_chi2}"
         log.info(total_str)
 
     def stop_here(self):
@@ -606,106 +610,52 @@ def positivity_status(self):
             return POS_BAD
 
     def get_next_replica(self):
-        """ Return the next ReplicaBest object"""
+        """ Return the next ReplicaState object"""
         if self.replica_iterator is None:
             self.replica_iterator = iter(self.history._replicas)
             self.ii = -1
         self.ii += 1
         return self.ii, next(self.replica_iterator)
 
-    def chi2exps_str(self, log_each=100):
+    def chi2exps_str(self, replica=0, log_each=100):
         """
         Returns a list of log-string with the status of the fit
         every `log_each` epochs
 
         Parameters
         ----------
-            `log_each`
+            replica: int
+                which replica are we writing the log for
+            log_each: int
                 every how many epochs to print the log
 
         Returns
         -------
-            `file_list`
-                a list of string to be printed as `chi2exps.log`
+            file_list: list(str)
+                a list of strings to be printed as `chi2exps.log`
         """
         final_epoch = self.history.final_epoch
         file_list = []
         for i in range(log_each - 1, final_epoch + 1, log_each):
             fitstate = self.history.get_state(i)
-            all_tr = fitstate.all_tr_chi2
-            all_vl = fitstate.all_vl_chi2
+            all_tr = fitstate.all_tr_chi2_for_replica(replica)
+            all_vl = fitstate.all_vl_chi2_for_replica(replica)
             # Here it is assumed the validation exp set is always a subset of the training exp set
             data_list = []
-            for exp in self._tr_ndata:
-                tr_loss = all_tr[exp]
+            for exp, tr_loss in all_tr.items():
                 vl_loss = all_vl.get(exp, 0.0)
                 data_str = f"{exp}: {tr_loss} {vl_loss}"
                 data_list.append(data_str)
             data = "\n".join(data_list)
             epoch_index = i + 1
-            total_tr_loss = fitstate.tr_chi2
-            total_vl_loss = fitstate.vl_chi2
             strout = f"""
 Epoch: {epoch_index}
 {data}
-Total: training = {total_tr_loss} validation = {total_vl_loss}
 """
             file_list.append(strout)
         return file_list
 
 
-class Validation:
-    """
-        Controls the NNPDF cross-validation algorithm
-
-        The cross-validation refers to the validation loss of the points of the dataset
-        not used in the fitting.
-        In general for any points considered here there will accompanying points from the
-        same dataset being included in the fitting.
-
-        Parameters
-        ----------
-            model: n3fit.backends.MetaModel
-                the model with the validation mask applied
-                (and compiled with the validation data and covmat)
-    """
-
-    def __init__(self, model, ndata_dict, verbose=False, no_validation=False):
-        self.model = model
-        self.state = None
-        self.verbose = verbose
-        self.ndata_dict = ndata_dict
-        self.n_val_exp = len(ndata_dict)
-        if no_validation:
-            self.suffix = "loss"
-        else:
-            self.suffix = "val_loss"
-
-    def _compute_validation_loss(self):
-        """
-        Evaluates the validation model and returns a tuple (`total_loss`, `vl_dict`)
-        with the information for the validation loss by experimenet normalized to the
-        number of points of each experiment
-
-        Returns
-        -------
-            total_loss: float
-                total vale for the validation loss
-            vl_dict: dict
-                dictionary containing a map of experiment names and their loss per replica
-        """
-        loss_dict = self.model.compute_losses()
-        self.state = loss_dict
-        return parse_losses(loss_dict, self.ndata_dict, suffix=self.suffix)
-
-    def loss(self):
-        """
-        Returns a tuple with the validation loss and a
-        dictionary for the validation loss per experiment
-        """
-        return self._compute_validation_loss()
-
-
 class Positivity:
     """
         Controls the positivity requirements.
@@ -754,4 +704,4 @@ def __call__(self, fitstate):
             Checks whether a given FitState object
             passes the positivity requirement
         """
-        return self.check_positivity(fitstate.info)
+        return self.check_positivity(fitstate.validation)

From 867cb934f7642f4da09c8b1ba9d18ab8dc1597fb Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 22 Dec 2020 15:31:52 +0100
Subject: [PATCH 12/27] several more fixes

---
 n3fit/src/n3fit/ModelTrainer.py |  7 ++--
 n3fit/src/n3fit/checks.py       | 10 ++++-
 n3fit/src/n3fit/performfit.py   |  3 --
 n3fit/src/n3fit/stopping.py     | 65 +++++++++------------------------
 4 files changed, 28 insertions(+), 57 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index fb2f75f63d..bcfb9f4bea 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -710,10 +710,9 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100):
             callbacks=self.callbacks + [callback_st, callback_pos, callback_integ],
         )
 
-        if stopping_object.positivity_pass():
-            return self.pass_status
-        else:
-            return self.failed_status
+        # TODO: this needs to be changed for hyperopt
+        return self.pass_status
+#         return self.failed_status
 
     def _hyperopt_override(self, params):
         """ Unrolls complicated hyperopt structures into very simple dictionaries"""
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index 217f367d8a..31fdae7929 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -355,12 +355,18 @@ def check_consistent_basis(fitting, theoryid):
 
 
 @make_argcheck
-def can_run_in_parallel(fitting, parallel_models=1):
+def can_run_in_parallel(fitting, replica, parallel_models=1):
     """ Checks whether a runcard which is trying to run several replicas at once (parallel_models =/= 1) is valid
     """
+    rp = len(replica)
+    genrep = fitting.get("genrep")
+    if rp > 1 and not genrep:
+        raise CheckError("Can't run more than one replica at once if no replicas are to be generated")
     if parallel_models == 1:
         return
-    if fitting.get("genrep"):
+    if genrep:
         raise CheckError("Replica generation is not supported yet for parallel models")
     if fitting["parameters"].get("layer_type") != "dense":
         raise CheckError("Parallelization has only been tested with layer_type=='dense'")
+    if rp > 1:
+        raise CheckError("Parallel mode cannot be used together with multireplica runs")
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index efa53f542d..aec1bd2fb0 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -241,7 +241,6 @@ def performfit(
             fitting["fitbasis"],
             nnseed,
             debug=debug,
-            save_weights_each=fitting.get("save_weights_each"),
             kfold_parameters=kfold_parameters,
             max_cores=maxcores,
             model_file=fitting.get("load"),
@@ -313,13 +312,11 @@ def performfit(
             """
         > > The stopping point has been at: {0} with a loss of {1}
                 which it got at {2}. Stopping degree {3}
-                Positivity state: {4}
                 """.format(
                 stopping_object.stop_epoch,
                 stopping_object.vl_chi2,
                 stopping_object.e_best_chi2,
                 stopping_object.stopping_degree,
-                stopping_object.positivity_status(),
             )
         )
 
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index 4da39007ad..aa4519c3ea 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -307,7 +307,6 @@ def __init__(self, pdf_models, tr_ndata, vl_ndata, save_weights_each=None):
             vl_ndata = tr_ndata
             vl_suffix = "loss"
         else:
-            vl_ndata = vl_ndata
             vl_suffix = "val_loss"
         # All instances of FitState should use these
         FitState.tr_ndata = tr_ndata
@@ -338,13 +337,14 @@ def save_best_replica(self, i, epoch = None):
         """
         if epoch is None:
             epoch = self.final_epoch
-        chi2 = self._history[epoch].vl_loss[i]
-        self._replicas[i].register_best(chi2, epoch)
+        self.best_epoch = epoch # TODO: makes sense for only one model in parallel
+        loss = self._history[epoch].vl_loss[i]
+        self._replicas[i].register_best(loss, epoch)
         return False
 
     def all_best_vl_loss(self):
         """ Returns the best validation loss for each replica """
-        return [i.best_vl for i in self._replicas]
+        return np.array([i.best_vl for i in self._replicas])
 
     def register(self, epoch, training_info, validation_info):
         """ Save a new fitstate and updates the current final epoch
@@ -354,9 +354,9 @@ def register(self, epoch, training_info, validation_info):
         Parameters
         ----------
             fitstate: FitState
-                FitState object 
+                FitState object
                 the fitstate of the object to save
-            `epoch`
+            epoch: int
                 the current epoch of the fit
         """
         # Save all the information in a fitstate object
@@ -365,25 +365,13 @@ def register(self, epoch, training_info, validation_info):
         self._history.append(fitstate)
         return fitstate
 
-    def reload(self, weights=None):
+    def reload(self):
         """ Reloads the best fit weights into the model
         if there are models to be reloaded
         A set of weights can be enforced as an optional argument
         """
-        # TODO the weights part is tricky
-        if weights is None:
-            for replica in self._replicas:
-                replica.reload()
-
-    def rewind(self, step):
-        """ Rewind the FitHistory object to the step `step` in the fit
-        """
-        fitstate = self.reloadable_history[step]
-        historic_weights = fitstate.weights
-        self.reload(weights=historic_weights)
-        self.best_epoch = fitstate.best_epoch
-        self.final_epoch = (step + 1) * self._save_weights_each - 1
-        # -1 because we are saving the epochs starting at 0
+        for replica in self._replicas:
+            replica.reload()
 
 
 class Stopping:
@@ -527,10 +515,15 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
 
         # Step 3. Register the current point in (the) history
         fitstate = self.history.register(epoch, training_info, validation_info)
+        if print_stats:
+            self.print_current_stats(epoch, fitstate)
 
         # Step 4. Check whether this is a better fit
         #         this means improving vl_chi2 and passing positivity
-        passes_val = fitstate.vl_loss < self.threshold_chi2
+        passes_val = True
+        # Don't start counting until the chi2 of the validation goes below a certain threshold
+        if self.count == 0:
+            passes_val &= fitstate.vl_chi2 < self.threshold_chi2
         passes_val &= fitstate.vl_loss < self.history.all_best_vl_loss()
         # And the ones that pass positivity
         passes_pos = self._positivity(fitstate)
@@ -548,9 +541,6 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         if self.stopping_degree > self.stopping_patience:
             stop_here = True
 
-        if print_stats:
-            self.print_current_stats(epoch, fitstate)
-
         if stop_here:
             self.make_stop()
         return True
@@ -569,7 +559,7 @@ def print_current_stats(self, epoch, fitstate):
         epoch_index = epoch + 1
         tr_chi2 = fitstate.total_tr_chi2()
         vl_chi2 = fitstate.total_vl_chi2()
-        total_str = f"At epoch {epoch_index}/{self.total_epochs}, total loss: {tr_chi2}\n"
+        total_str = f"At epoch {epoch_index}/{self.total_epochs}, total chi2: {tr_chi2}\n"
 
         # The partial chi2 makes no sense for more than one replica at once:
         if self.n_replicas == 1:
@@ -578,7 +568,7 @@ def print_current_stats(self, epoch, fitstate):
             for experiment, chi2 in partial_tr_chi2.items():
                 partials.append(f"{experiment}: {chi2:.3f}")
             total_str += ", ".join(partials) + "\n"
-        total_str += f"Validation loss at this point: {vl_chi2}"
+        total_str += f"Validation chi2 at this point: {vl_chi2}"
         log.info(total_str)
 
     def stop_here(self):
@@ -590,25 +580,6 @@ def stop_here(self):
         else:
             return self.stop_now
 
-    def positivity_pass(self):
-        """ Checks whether the positivity loss is below the requested threshold
-        If there is no best state, the positivity (obv) cannot pass
-        """
-        best_state = self.history.best_state()
-        if best_state is not None and self.positivity(best_state):
-            return True
-        else:
-            return False
-
-    def positivity_status(self):
-        """ Checks whether the positivity loss is below the requested threshold
-        If there is no best state, the positivity (obv) cannot pass
-        """
-        if self.positivity_pass():
-            return POS_OK
-        else:
-            return POS_BAD
-
     def get_next_replica(self):
         """ Return the next ReplicaState object"""
         if self.replica_iterator is None:
@@ -695,8 +666,6 @@ def check_positivity(self, history_object):
         for key in self.positivity_sets:
             key_loss = f"{key}_loss"
             positivity_pass &= history_object[key_loss] < self.threshold
-            if not all(positivity_pass):
-                return np.array(positivity_pass)
         return np.array(positivity_pass)
 
     def __call__(self, fitstate):

From 579429bc375462245bf1ecff86c386bac830e9d9 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 22 Dec 2020 15:55:17 +0100
Subject: [PATCH 13/27] many changes to stopping, remove deprecated options

---
 n3fit/src/n3fit/ModelTrainer.py |   5 -
 n3fit/src/n3fit/io/writer.py    |   2 +-
 n3fit/src/n3fit/performfit.py   |  12 --
 n3fit/src/n3fit/stopping.py     | 218 ++++++++++++++++----------------
 4 files changed, 108 insertions(+), 129 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index bcfb9f4bea..e307be3401 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -174,7 +174,6 @@ def __init__(
         pass_status="ok",
         failed_status="fail",
         debug=False,
-        save_weights_each=False,
         kfold_parameters=None,
         max_cores=None,
         model_file=None,
@@ -192,8 +191,6 @@ def __init__(
             pass_status: flag to signal a good run
             failed_status: flag to signal a bad run
             debug: flag to activate some debug options
-            save_weights_each: if set, save the state of the fit
-                                    every ``save_weights_each`` epochs
             model_file: str whether to save the models
             sum_rules: str whether sum rules should be enabled (All, MSR, VSR, False)
         """
@@ -212,7 +209,6 @@ def __init__(
         self.pass_status = pass_status
         self.failed_status = failed_status
         self.debug = debug
-        self.save_weights_each = save_weights_each
         self.all_datasets = []
         self.parallel_models = parallel_models
 
@@ -877,7 +873,6 @@ def hyperparametrizable(self, params):
                 pdf_models,
                 total_epochs=epochs,
                 stopping_patience=stopping_epochs,
-                save_weights_each=self.save_weights_each,
                 threshold_positivity=threshold_pos,
                 threshold_chi2=threshold_chi2
             )
diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index e852404dde..8c325b2910 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -72,7 +72,7 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         stop_epoch = self.stopping_object.epoch_of_the_stop
 
         # Get the replica status for this object
-        _, replica_status = self.stopping_object.get_next_replica()
+        replica_status = self.stopping_object.get_next_replica()
 
         # export PDF grid to file
         storefit(
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index aec1bd2fb0..4016690cff 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -357,17 +357,5 @@ def performfit(
                 # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3 though)
                 pdf_model.save_weights(str(model_file_path), save_format="h5")
 
-        # If the history of weights is active then loop over it
-        # rewind the state back to every step and write down the results
-#         for step in range(len(stopping_object.history.reloadable_history)):
-#             stopping_object.history.rewind(step)
-#             new_path = output_path / f"history_step_{step}/replica_{replica_number}"
-#             # We need to recompute the experimental chi2 for this point
-#             training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
-#             writer_wrapper.write_data(new_path, output_path.name, training_chi2, val_chi2, exp_chi2)
-
-        # So every time we want to capture output_path.name and addd a history_step_X
-        # parallel to the nnfit folder
-
         if tboard is not None:
             log.info("Tensorboard logging information is stored at %s", log_path)
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index aa4519c3ea..786e14ca17 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -134,30 +134,33 @@ def parse_losses(history_object, data, suffix="loss"):
 
 class FitState:
     """
-        Holds the state of the chi2 during the fit for all replicas
+    Holds the state of the chi2 during the fit for all replicas
 
-        It holds the necessary information to reload the fit
-        to a specific point in time if we are interested on reloading
-        (otherwise the relevant variables stay empty to save memory)
+    It holds the necessary information to reload the fit
+    to a specific point in time if we are interested on reloading
+    (otherwise the relevant variables stay empty to save memory)
 
-        Note: the training chi2 is computed before the update of the weights
-        so it is the chi2 that informed the updated corresponding to this state.
-        The validation chi2 instead is computed after the update of the weights.
+    Note: the training chi2 is computed before the update of the weights
+    so it is the chi2 that informed the updated corresponding to this state.
+    The validation chi2 instead is computed after the update of the weights.
 
-        Parameters
-        ----------
-            training_info: dict
-                all losses for the training model
-            validation_info: dict
-                all losses for the validation model
+    Parameters
+    ----------
+        training_info: dict
+            all losses for the training model
+        validation_info: dict
+            all losses for the validation model
     """
+
     vl_ndata = None
     tr_ndata = None
     vl_suffix = None
 
     def __init__(self, training_info, validation_info):
         if self.vl_ndata is None or self.tr_ndata is None or self.vl_suffix is None:
-            raise ValueError("FitState cannot be instantiated until vl_ndata, tr_ndata and vl_suffix are filled")
+            raise ValueError(
+                "FitState cannot be instantiated until vl_ndata, tr_ndata and vl_suffix are filled"
+            )
         self.training = training_info
         self.validation = validation_info
         self._parsed = False
@@ -186,7 +189,9 @@ def _parse_chi2(self):
         if self.training is not None:
             self._tr_chi2, self._tr_dict = parse_losses(self.training, self.tr_ndata)
         if self.validation is not None:
-            self._vl_chi2, self._vl_dict = parse_losses(self.validation, self.vl_ndata, suffix=self.vl_suffix)
+            self._vl_chi2, self._vl_dict = parse_losses(
+                self.validation, self.vl_ndata, suffix=self.vl_suffix
+            )
 
     @property
     def tr_chi2(self):
@@ -210,19 +215,19 @@ def all_vl_chi2(self):
 
     def all_tr_chi2_for_replica(self, r):
         """" Return the tr chi2 per dataset for a given replica """
-        return {k:np.take(i, r) for k,i in self.all_tr_chi2.items()}
+        return {k: np.take(i, r) for k, i in self.all_tr_chi2.items()}
 
     def all_vl_chi2_for_replica(self, r):
         """" Return the vl chi2 per dataset for a given replica """
-        return {k:np.take(i, r) for k,i in self.all_vl_chi2.items()}
+        return {k: np.take(i, r) for k, i in self.all_vl_chi2.items()}
 
     def total_partial_tr_chi2(self):
         """ Return the tr chi2 summed over replicas per experiment"""
-        return {k:np.sum(i) for k,i in self.all_tr_chi2.items()}
+        return {k: np.sum(i) for k, i in self.all_tr_chi2.items()}
 
     def total_partial_vl_chi2(self):
         """ Return the vl chi2 summed over replicas per experiment"""
-        return {k:np.sum(i) for k,i in self.all_tr_chi2.items()}
+        return {k: np.sum(i) for k, i in self.all_tr_chi2.items()}
 
     def total_tr_chi2(self):
         """ Return the total tr chi2 summed over replicas """
@@ -237,7 +242,7 @@ def __str__(self):
 
 
 class ReplicaState:
-    """ Extra complication which eventually will be merged with someone else
+    """Extra complication which eventually will be merged with someone else
     but it is here only for development."""
 
     def __init__(self, pdf_model):
@@ -280,28 +285,27 @@ def reload(self):
 
 class FitHistory:
     """
-        Keeps a list of FitState items holding the full history of the fit.
+    Keeps a list of FitState items holding the full history of the fit.
 
-        It also keeps track of the best epoch and the associated weights.
+    It also keeps track of the best epoch and the associated weights.
 
-        Can be iterated when there are snapshots of the fit being saved.
-        When iterated it will rewind the fit to each of the point in history
-        that have been saved.
+    Can be iterated when there are snapshots of the fit being saved.
+    When iterated it will rewind the fit to each of the point in history
+    that have been saved.
 
-        Parameters
-        ----------
-            pdf_models: n3fit.backends.MetaModel
-                list of PDF models being trained, used to saved the weights
-
-            save_weights_each: int
-                if given, it will save a snapshot of the fit every  `save_weights_each` epochs
+    Parameters
+    ----------
+        pdf_models: n3fit.backends.MetaModel
+            list of PDF models being trained, used to saved the weights
     """
-    def __init__(self, pdf_models, tr_ndata, vl_ndata, save_weights_each=None):
+
+    def __init__(self, pdf_models, tr_ndata, vl_ndata):
         # Create a ReplicaState object for all models
         # which will hold the best chi2 and weights per replica
         self._replicas = []
         for pdf_model in pdf_models:
             self._replicas.append(ReplicaState(pdf_model))
+        self._iter_replicas = iter(self._replicas)
 
         if vl_ndata is None:
             vl_ndata = tr_ndata
@@ -320,36 +324,30 @@ def __init__(self, pdf_models, tr_ndata, vl_ndata, save_weights_each=None):
 
     def get_state(self, epoch):
         """ Get the FitState of the system for a given epoch """
-        return self._history[epoch]
-
-    def best_state(self):
-        """ Return the FitState object corresponding to the best fit """
-        if self.best_epoch is None:
-            return None
-        else:
-            index = self.best_epoch
-            return self._history[index]
-
-    def save_best_replica(self, i, epoch = None):
-        """ Save the state of replica ``i`` as a best fit so far.
+        try:
+            return self._history[epoch]
+        except IndexError as e:
+            raise ValueError(
+                f"Tried to get obtain the state for epoch {epoch} when only {len(self._history)} epochs have been saved"
+            ) from e
+
+    def save_best_replica(self, i, epoch=None):
+        """Save the state of replica ``i`` as a best fit so far.
         If an epoch is given, save the best as the given epoch, otherwise
         use the last one
         """
         if epoch is None:
             epoch = self.final_epoch
-        self.best_epoch = epoch # TODO: makes sense for only one model in parallel
-        loss = self._history[epoch].vl_loss[i]
+        self.best_epoch = epoch  # TODO: makes sense for only one model in parallel
+        loss = self.get_state(epoch).vl_loss[i]
         self._replicas[i].register_best(loss, epoch)
-        return False
 
     def all_best_vl_loss(self):
         """ Returns the best validation loss for each replica """
         return np.array([i.best_vl for i in self._replicas])
 
     def register(self, epoch, training_info, validation_info):
-        """ Save a new fitstate and updates the current final epoch
-        Every `save_weights_each` (if set) saves a snapshot of the current best fit into
-        the fitstate
+        """Save a new fitstate and updates the current final epoch
 
         Parameters
         ----------
@@ -366,41 +364,42 @@ def register(self, epoch, training_info, validation_info):
         return fitstate
 
     def reload(self):
-        """ Reloads the best fit weights into the model
+        """Reloads the best fit weights into the model
         if there are models to be reloaded
         A set of weights can be enforced as an optional argument
         """
         for replica in self._replicas:
             replica.reload()
 
+    def __next__(self):
+        return next(self._iter_replicas)
+
 
 class Stopping:
     """
-        Driver of the stopping algorithm
+    Driver of the stopping algorithm
 
-        Note, if the total number of points in the validation dictionary is None, it is assumed
-        the validation_model actually corresponds to the training model.
+    Note, if the total number of points in the validation dictionary is None, it is assumed
+    the validation_model actually corresponds to the training model.
 
-        Parameters
-        ----------
-            validation_model: n3fit.backends.MetaModel
-               the model with the validation mask applied
-               (and compiled with the validation data and covmat)
-            all_data_dict: dict
-               list containg all dictionaries containing all information about
-               the experiments/validation/regularizers/etc to be parsed by Stopping
-            pdf_models: list(n3fit.backends.MetaModel)
-               list of pdf_models being trained
-            threshold_positivity: float
-               maximum value allowed for the sum of all positivity losses
-            total_epochs: int
-               total number of epochs
-            stopping_patience: int
-               how many epochs to wait for the validation loss to improve
-            dont_stop: bool
-               dont care about early stopping
-            save_weights_each: int
-                every how many epochs to save a snapshot of the fit
+    Parameters
+    ----------
+        validation_model: n3fit.backends.MetaModel
+           the model with the validation mask applied
+           (and compiled with the validation data and covmat)
+        all_data_dict: dict
+           list containg all dictionaries containing all information about
+           the experiments/validation/regularizers/etc to be parsed by Stopping
+        pdf_models: list(n3fit.backends.MetaModel)
+           list of pdf_models being trained
+        threshold_positivity: float
+           maximum value allowed for the sum of all positivity losses
+        total_epochs: int
+           total number of epochs
+        stopping_patience: int
+           how many epochs to wait for the validation loss to improve
+        dont_stop: bool
+           dont care about early stopping
     """
 
     def __init__(
@@ -413,14 +412,13 @@ def __init__(
         stopping_patience=7000,
         threshold_chi2=10.0,
         dont_stop=False,
-        save_weights_each=None,
     ):
         # Save the validation object
         self._validation = validation_model
 
         # Create the History object
         tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts)
-        self.history = FitHistory(pdf_models, tr_ndata, vl_ndata)
+        self._history = FitHistory(pdf_models, tr_ndata, vl_ndata)
 
         # And the positivity checker
         self._positivity = Positivity(threshold_positivity, pos_sets)
@@ -435,7 +433,6 @@ def __init__(
         self.stopping_degree = 0
         self.count = 0
         self.total_epochs = total_epochs
-        self.replica_iterator = None
 
     @property
     def vl_chi2(self):
@@ -446,6 +443,7 @@ def vl_chi2(self):
 
     @property
     def e_best_chi2(self):
+<<<<<<< HEAD
         """ Epoch of the best chi2, if there is no best epoch
         return the last epoch"""
         be = self.history.best_epoch
@@ -453,14 +451,18 @@ def e_best_chi2(self):
             return self.stop_epoch
         return be
 
+=======
+        """ Epoch of the best chi2 """
+        return self._history.best_epoch
+>>>>>>> 5fb6df564 (many changes to stopping, remove deprecated options)
 
     @property
     def stop_epoch(self):
         """ Epoch in which the fit is stopped """
-        return self.history.final_epoch + 1
+        return self._history.final_epoch + 1
 
     def evaluate_training(self, training_model):
-        """ Given the training model, evaluates the
+        """Given the training model, evaluates the
         model and parses the chi2 of the training datasets
 
         Parameters
@@ -505,16 +507,14 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         # Step 1. Check whether the fit has NaN'd and stop it if so
         if np.isnan(training_info["loss"]):
             log.warning(" > NaN found, stopping activated")
-            self.stop_now = True
-            # If we had a good model at any point, reload
-            self.history.reload() # TODO
+            self.make_stop()
             return False
 
         # Step 2. Compute the validation metrics
         validation_info = self._validation.compute_losses()
 
         # Step 3. Register the current point in (the) history
-        fitstate = self.history.register(epoch, training_info, validation_info)
+        fitstate = self._history.register(epoch, training_info, validation_info)
         if print_stats:
             self.print_current_stats(epoch, fitstate)
 
@@ -524,7 +524,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         # Don't start counting until the chi2 of the validation goes below a certain threshold
         if self.count == 0:
             passes_val &= fitstate.vl_chi2 < self.threshold_chi2
-        passes_val &= fitstate.vl_loss < self.history.all_best_vl_loss()
+        passes_val &= fitstate.vl_loss < self._history.all_best_vl_loss()
         # And the ones that pass positivity
         passes_pos = self._positivity(fitstate)
         passes = passes_val & passes_pos
@@ -532,12 +532,12 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         self.stopping_degree += self.count
 
         # Step 5. loop over the valid indices to check whether the vl improved
-        stop_here = all(passes)
         for i in np.where(passes)[0]:
-            stop_here &= self.history.save_best_replica(i)
+            self._history.save_best_replica(i)
             self.stopping_degree = 0
             self.count = 1
 
+        # By using the stopping degree we only stop when none of the replicas are improving anymore
         if self.stopping_degree > self.stopping_patience:
             stop_here = True
 
@@ -546,15 +546,15 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         return True
 
     def make_stop(self):
-        """ Convenience method to set the stop_now flag
+        """Convenience method to set the stop_now flag
         and reload the history to the point of the best model if any
         """
         self.stop_now = True
-        self.history.reload()
+        self._history.reload()
 
     def print_current_stats(self, epoch, fitstate):
         """
-            Prints ``fitstate`` training and validation chi2s
+        Prints ``fitstate`` training and validation chi2s
         """
         epoch_index = epoch + 1
         tr_chi2 = fitstate.total_tr_chi2()
@@ -572,7 +572,7 @@ def print_current_stats(self, epoch, fitstate):
         log.info(total_str)
 
     def stop_here(self):
-        """ Returns the stopping status
+        """Returns the stopping status
         If `dont_stop` is set returns always False (i.e., never stop)
         """
         if self.dont_stop:
@@ -582,11 +582,7 @@ def stop_here(self):
 
     def get_next_replica(self):
         """ Return the next ReplicaState object"""
-        if self.replica_iterator is None:
-            self.replica_iterator = iter(self.history._replicas)
-            self.ii = -1
-        self.ii += 1
-        return self.ii, next(self.replica_iterator)
+        return next(self._history)
 
     def chi2exps_str(self, replica=0, log_each=100):
         """
@@ -605,10 +601,10 @@ def chi2exps_str(self, replica=0, log_each=100):
             file_list: list(str)
                 a list of strings to be printed as `chi2exps.log`
         """
-        final_epoch = self.history.final_epoch
+        final_epoch = self._history.final_epoch
         file_list = []
         for i in range(log_each - 1, final_epoch + 1, log_each):
-            fitstate = self.history.get_state(i)
+            fitstate = self._history.get_state(i)
             all_tr = fitstate.all_tr_chi2_for_replica(replica)
             all_vl = fitstate.all_vl_chi2_for_replica(replica)
             # Here it is assumed the validation exp set is always a subset of the training exp set
@@ -629,19 +625,19 @@ def chi2exps_str(self, replica=0, log_each=100):
 
 class Positivity:
     """
-        Controls the positivity requirements.
+    Controls the positivity requirements.
 
-        In order to check the positivity passes will check the history of the fitting
-        as the fitting included positivity sets.
-        If the sum of all positivity sets losses is above a certain value the model is
-        not accepted and the training continues.
+    In order to check the positivity passes will check the history of the fitting
+    as the fitting included positivity sets.
+    If the sum of all positivity sets losses is above a certain value the model is
+    not accepted and the training continues.
 
-        Parameters
-        ----------
-            threshold_positivity: float
-                maximum value allowed for the sum of all positivity losses
-            positivity_sets: list
-                list of positivity datasets
+    Parameters
+    ----------
+        threshold_positivity: float
+            maximum value allowed for the sum of all positivity losses
+        positivity_sets: list
+            list of positivity datasets
     """
 
     def __init__(self, threshold, positivity_sets):
@@ -670,7 +666,7 @@ def check_positivity(self, history_object):
 
     def __call__(self, fitstate):
         """
-            Checks whether a given FitState object
-            passes the positivity requirement
+        Checks whether a given FitState object
+        passes the positivity requirement
         """
         return self.check_positivity(fitstate.validation)

From edbcb2d2780bdabd7cef1fa880df7ff5f1d5bad3 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 22 Dec 2020 16:16:13 +0100
Subject: [PATCH 14/27] hotfix

---
 n3fit/src/n3fit/stopping.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index 786e14ca17..d16b80adb6 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -363,6 +363,10 @@ def register(self, epoch, training_info, validation_info):
         self._history.append(fitstate)
         return fitstate
 
+    def stop_training_replica(self, i):
+        """ Stop training replica i """
+        replica = self._replicas[i]
+
     def reload(self):
         """Reloads the best fit weights into the model
         if there are models to be reloaded
@@ -426,12 +430,12 @@ def __init__(
         # Initialize internal variables for the stopping
         self.n_replicas = len(pdf_models)
         self.threshold_chi2 = threshold_chi2
+        self.stopping_degree = np.zeros(self.n_replicas)
+        self.count = np.zeros(self.n_replicas)
 
         self.dont_stop = dont_stop
         self.stop_now = False
         self.stopping_patience = stopping_patience
-        self.stopping_degree = 0
-        self.count = 0
         self.total_epochs = total_epochs
 
     @property
@@ -520,28 +524,28 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
 
         # Step 4. Check whether this is a better fit
         #         this means improving vl_chi2 and passing positivity
-        passes_val = True
         # Don't start counting until the chi2 of the validation goes below a certain threshold
-        if self.count == 0:
-            passes_val &= fitstate.vl_chi2 < self.threshold_chi2
-        passes_val &= fitstate.vl_loss < self._history.all_best_vl_loss()
+        # once we start counting, don't bother anymore
+        passes = self.count or ( fitstate.vl_chi2 < self.threshold_chi2 )
+        passes = passes and fitstate.vl_loss < self._history.all_best_vl_loss()
         # And the ones that pass positivity
-        passes_pos = self._positivity(fitstate)
-        passes = passes_val & passes_pos
+        passes = passes and self._positivity(fitstate)
 
         self.stopping_degree += self.count
 
         # Step 5. loop over the valid indices to check whether the vl improved
         for i in np.where(passes)[0]:
             self._history.save_best_replica(i)
-            self.stopping_degree = 0
-            self.count = 1
+            self.stopping_degree[i] = 0
+            self.count[i] = 1
 
-        # By using the stopping degree we only stop when none of the replicas are improving anymore
-        if self.stopping_degree > self.stopping_patience:
-            stop_here = True
+        stop_replicas = self.count and self.stopping_degree > self.stopping_patience
+        for i in np.where(stop_replicas)[0]:
+            self.count[i] = 0
+            self._history.stop_training_replica(i)
 
-        if stop_here:
+        # By using the stopping degree we only stop when none of the replicas are improving anymore
+        if min(self.stopping_degree) > self.stopping_patience:
             self.make_stop()
         return True
 

From 74935338518337d306bba8471c12deb9cf05f74d Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 22 Dec 2020 16:33:31 +0100
Subject: [PATCH 15/27] stop at different points for each replica

---
 n3fit/src/n3fit/stopping.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index d16b80adb6..95dba489c3 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -274,14 +274,20 @@ def positivity_status(self):
             return POS_BAD
 
     def register_best(self, chi2, epoch):
+        """ Register a new best state and some metadata about it """
         self._weights = self._pdf_model.get_weights()
         self._best_epoch = epoch
         self._best_vl_chi2 = chi2
 
     def reload(self):
+        """ Reload the weights of the best state """
         if self._weights:
             self._pdf_model.set_weights(self._weights)
 
+    def stop_training(self):
+        """ Stop training this replica """
+        self._pdf_model.trainable = False
+
 
 class FitHistory:
     """
@@ -319,9 +325,13 @@ def __init__(self, pdf_models, tr_ndata, vl_ndata):
 
         # Save a list of status for the entire fit
         self._history = []
-        self.best_epoch = None
         self.final_epoch = None
 
+    @property
+    def best_epoch(self):
+        """ Return the best epoch per replica """
+        return [i.best_epoch for i in self._replicas]
+
     def get_state(self, epoch):
         """ Get the FitState of the system for a given epoch """
         try:
@@ -338,7 +348,6 @@ def save_best_replica(self, i, epoch=None):
         """
         if epoch is None:
             epoch = self.final_epoch
-        self.best_epoch = epoch  # TODO: makes sense for only one model in parallel
         loss = self.get_state(epoch).vl_loss[i]
         self._replicas[i].register_best(loss, epoch)
 
@@ -365,7 +374,7 @@ def register(self, epoch, training_info, validation_info):
 
     def stop_training_replica(self, i):
         """ Stop training replica i """
-        replica = self._replicas[i]
+        self._replicas[i].stop_training()
 
     def reload(self):
         """Reloads the best fit weights into the model
@@ -430,8 +439,8 @@ def __init__(
         # Initialize internal variables for the stopping
         self.n_replicas = len(pdf_models)
         self.threshold_chi2 = threshold_chi2
-        self.stopping_degree = np.zeros(self.n_replicas)
-        self.count = np.zeros(self.n_replicas)
+        self.stopping_degree = np.zeros(self.n_replicas, dtype=np.int)
+        self.count = np.zeros(self.n_replicas, dtype=np.int)
 
         self.dont_stop = dont_stop
         self.stop_now = False
@@ -526,10 +535,10 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         #         this means improving vl_chi2 and passing positivity
         # Don't start counting until the chi2 of the validation goes below a certain threshold
         # once we start counting, don't bother anymore
-        passes = self.count or ( fitstate.vl_chi2 < self.threshold_chi2 )
-        passes = passes and fitstate.vl_loss < self._history.all_best_vl_loss()
+        passes = self.count | ( fitstate.vl_chi2 < self.threshold_chi2 )
+        passes &= fitstate.vl_loss < self._history.all_best_vl_loss()
         # And the ones that pass positivity
-        passes = passes and self._positivity(fitstate)
+        passes &= self._positivity(fitstate)
 
         self.stopping_degree += self.count
 
@@ -539,7 +548,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
             self.stopping_degree[i] = 0
             self.count[i] = 1
 
-        stop_replicas = self.count and self.stopping_degree > self.stopping_patience
+        stop_replicas = self.count & (self.stopping_degree > self.stopping_patience)
         for i in np.where(stop_replicas)[0]:
             self.count[i] = 0
             self._history.stop_training_replica(i)

From ab64c294e32af3bd237549230e104e54295be4e5 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@lairen.eu>
Date: Tue, 22 Dec 2020 17:21:39 +0100
Subject: [PATCH 16/27] 1-dataset runcards should also work

---
 n3fit/src/n3fit/backends/keras_backend/MetaModel.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index d30a3c2aef..c7be3f4acc 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -193,6 +193,9 @@ def compute_losses(self):
             @tf.function
             def losses_fun():
                 predictions = self(self._parse_input(None))
+                # If we only have one dataset the output changes
+                if len(out_names) == 2:
+                    predictions = [predictions]
                 total_loss = tf.reduce_sum(predictions, axis=0)
                 ret = [total_loss] + predictions
                 return dict(zip(out_names, ret))

From a875924d0c5dce0d855d84b773a3c9f65d036da3 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Mon, 4 Jan 2021 10:07:40 +0100
Subject: [PATCH 17/27] remove unnecesary middle layers

---
 .../n3fit/backends/keras_backend/MetaModel.py | 30 +++++------
 n3fit/src/n3fit/layers/losses.py              | 46 ++++++++++++----
 n3fit/src/n3fit/model_gen.py                  | 53 ++++++++++++-------
 n3fit/src/n3fit/performfit.py                 |  2 +
 n3fit/src/n3fit/stopping.py                   | 34 +++++++-----
 n3fit/src/n3fit/tests/test_losses.py          |  4 +-
 6 files changed, 110 insertions(+), 59 deletions(-)

diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index c7be3f4acc..99f4a640db 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -207,18 +207,6 @@ def losses_fun():
         # if it dissapears, equivalent to {k: i.numpy() for k, i in ret.items()}
         return tf_utils.to_numpy_or_python_type(ret)
 
-
-    def evaluate(self, x=None, y=None, **kwargs):
-        """
-        Wrapper around evaluate to take into account the case in which the data is already known
-        when the model is compiled.
-        """
-        x = self._parse_input(self.x_in)
-        if LEGACY and y is None:
-            y = self.target_tensors
-        result = super().evaluate(x=x, y=y, **kwargs)
-        return result
-
     def compile(
         self,
         optimizer_name="RMSprop",
@@ -286,16 +274,28 @@ def compile(
         super(MetaModel, self).compile(optimizer=opt, loss=loss)
 
     def make_test_function(self):
-        """ If the model has been compiled with target data, it creates
-        a specific evaluate function with the target data already evaluated.
-        Otherwise return the normal tensorflow behaviour.
         """
+        If the model has been compiled in the normal NNPDF way,
+        then the output of the prediction is already the loss, so we skip this part
+        by just summing over predictions.
+
+        Otherwise, just return the usual TF make_test_function
+        """
+
         if self.eval_fun is not None:
             return self.eval_fun
 
         if self.target_tensors is None:
             return super().make_test_function()
 
+        @tf.function
+        def eval_fun(*args):
+            predictions = self(self._parse_input(None))
+            return tf.reduce_sum(predictions)
+
+        self.eval_fun = eval_fun
+        return eval_fun
+
         # Recover the target tensors and their lengths, we cannot rely
         # directly on the output from the model as we might have target_tensors
         # with 0 data points (if the tr/vl mask covers the whole set)
diff --git a/n3fit/src/n3fit/layers/losses.py b/n3fit/src/n3fit/layers/losses.py
index f56dc0f461..2d89c387c9 100644
--- a/n3fit/src/n3fit/layers/losses.py
+++ b/n3fit/src/n3fit/layers/losses.py
@@ -5,11 +5,13 @@
 from n3fit.backends import MetaLayer
 from n3fit.backends import operations as op
 
-class L_invcovmat(MetaLayer):
+
+class LossInvcovmat(MetaLayer):
     """
     Loss function such that:
     L = \sum_{ij} (yt - yp)_{i} invcovmat_{ij} (yt - yp)_{j}
     """
+
     def __init__(self, invcovmat, y_true, **kwargs):
         self.invcovmat = op.numpy_to_tensor(invcovmat)
         self.y_true = op.numpy_to_tensor(y_true)
@@ -17,10 +19,32 @@ def __init__(self, invcovmat, y_true, **kwargs):
 
     def call(self, y_pred, **kwargs):
         tmp = self.y_true - y_pred
-        res = op.einsum('bri, ij, brj -> r', tmp, self.invcovmat, tmp)
+        res = op.einsum("bri, ij, brj -> r", tmp, self.invcovmat, tmp)
         return res
 
-class L_positivity(MetaLayer):
+
+class LossLagrange(MetaLayer):
+    def __init__(self, c=1.0, **kwargs):
+        self._initial_multiplier = c
+        super().__init__(**kwargs)
+
+    def build(self, input_shape):
+        multiplier = MetaLayer.init_constant(self._initial_multiplier)
+        self.kernel = self.builder_helper("lagMult", (1,), multiplier, trainable=False)
+        super().build(input_shape)
+
+    def apply_multiplier(self, y):
+        return self.kernel * y
+
+    def apply_loss(self, y):
+        return y
+
+    def call(self, y_pred, **kwargs):
+        y = self.apply_multiplier(y_pred)
+        return self.apply_loss(y)
+
+
+class LossPositivity(LossLagrange):
     """
     Returns L = elu(y_pred) (considers y_true as 0)
 
@@ -32,21 +56,23 @@ class L_positivity(MetaLayer):
     the lagrange multiplier is very big.
     In practice this function can produce results in the range (-alpha, inf)
     """
+
     def __init__(self, alpha=1e-7, **kwargs):
         self.alpha = alpha
         super().__init__(**kwargs)
 
-    def call(self, y_pred, **kwargs):
-        y = -y_pred
-        loss = op.backend_function("elu", y, alpha=self.alpha)
+    def apply_loss(self, y_pred):
+        loss = op.backend_function("elu", -y_pred, alpha=self.alpha)
         # Sum over the batch and the datapoints
-        return op.sum(loss, axis=[0,-1])
+        return op.sum(loss, axis=[0, -1])
+
 
-class L_integrability(MetaLayer):
+class LossIntegrability(LossLagrange):
     """
     Returns L = (y_pred)*(y_pred)
     """
-    def call(self, y_pred, **kwargs):
+
+    def apply_loss(self, y_pred):
         y = op.backend_function("square", y_pred)
         # Sum over the batch and the datapoints
-        return op.sum(y, axis=[0,-1])
+        return op.sum(y, axis=[0, -1])
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 113c148b65..38468ac130 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -17,7 +17,9 @@
 from n3fit.backends import base_layer_selector, regularizer_selector
 
 
-def observable_generator(spec_dict, positivity_initial=1.0, integrability=False):  # pylint: disable=too-many-locals
+def observable_generator(
+    spec_dict, positivity_initial=1.0, integrability=False
+):  # pylint: disable=too-many-locals
     """
     This function generates the observable model for each experiment.
     These are models which takes as input a PDF tensor (1 x size_of_xgrid x flavours) and outputs
@@ -148,7 +150,9 @@ def gen_concat():
         concat_vl = concat_ex
 
     # creating the experiment as a model turns out to bad for performance
-    def experiment_layer(pdf, model_obs=model_obs_ex, concat=concat_ex, rotation=None, datasets_out=None):
+    def experiment_layer(
+        pdf, model_obs=model_obs_ex, concat=concat_ex, rotation=None, datasets_out=None
+    ):
         """ By default works with the experiment observable """
         output_layers = []
         # First split the pdf layer into the different datasets if needed
@@ -177,22 +181,19 @@ def experiment_layer(pdf, model_obs=model_obs_ex, concat=concat_ex, rotation=Non
 
     # Now create the model for this experiment
     full_nx = sum(dataset_xsizes)
-    loss = lambda x,y: operations.sum(y)
 
-    if spec_dict["positivity"]:
-        out_mask = Mask(
-            c=positivity_initial,
-            axis=1,
-        )
+    def loss(y_true, y_pred):
+        return operations.sum(y_pred)
 
+    if spec_dict["positivity"]:
         if integrability:
-            loss_pos = losses.L_integrability(name=spec_name)
+            loss_pos = losses.LossIntegrability(name=spec_name, c=positivity_initial)
         else:
-            loss_pos = losses.L_positivity(name=spec_name)
+            loss_pos = losses.LossPositivity(name=spec_name, c=positivity_initial)
 
         def out_positivity(pdf_layer, datasets_out=None):
             exp_result = experiment_layer(pdf_layer)
-            return loss_pos(out_mask(exp_result))
+            return loss_pos(exp_result)
 
         layer_info = {
             "inputs": model_inputs,
@@ -207,9 +208,9 @@ def out_positivity(pdf_layer, datasets_out=None):
     invcovmat = spec_dict["invcovmat_true"]
 
     # Prepare the loss function
-    loss_tr = losses.L_invcovmat(invcovmat_tr, spec_dict["expdata"], name=tr_name)
-    loss_vl = losses.L_invcovmat(invcovmat_vl, spec_dict["expdata_vl"], name=vl_name)
-    loss_ex = losses.L_invcovmat(invcovmat, spec_dict["expdata_true"], name=ex_name)
+    loss_tr = losses.LossInvcovmat(invcovmat_tr, spec_dict["expdata"], name=tr_name)
+    loss_vl = losses.LossInvcovmat(invcovmat_vl, spec_dict["expdata_vl"], name=vl_name)
+    loss_ex = losses.LossInvcovmat(invcovmat, spec_dict["expdata_true"], name=ex_name)
 
     # Generate the loss function and rotations of the final data (if any)
     if spec_dict.get("data_transformation") is not None:
@@ -223,13 +224,21 @@ def out_positivity(pdf_layer, datasets_out=None):
 
     def out_tr(pdf_layer, datasets_out=None):
         exp_result = experiment_layer(
-            pdf_layer, model_obs=model_obs_tr, concat=concat_tr, datasets_out=datasets_out, rotation=obsrot_tr
+            pdf_layer,
+            model_obs=model_obs_tr,
+            concat=concat_tr,
+            datasets_out=datasets_out,
+            rotation=obsrot_tr,
         )
         return loss_tr(exp_result)
 
     def out_vl(pdf_layer, datasets_out=None):
         exp_result = experiment_layer(
-            pdf_layer, model_obs=model_obs_vl, concat=concat_vl, datasets_out=datasets_out, rotation=obsrot_vl
+            pdf_layer,
+            model_obs=model_obs_vl,
+            concat=concat_vl,
+            datasets_out=datasets_out,
+            rotation=obsrot_vl,
         )
         return loss_vl(exp_result)
 
@@ -497,7 +506,7 @@ def pdfNN_layer_generator(
 
     # Now we need a trainable network per model to be trained in parallel
     for i in range(parallel_models):
-        layer_seed = seed + i*number_of_layers
+        layer_seed = seed + i * number_of_layers
         if layer_type == "dense":
             reg = regularizer_selector(regularizer, **regularizer_args)
             list_of_pdf_layers = generate_dense_network(
@@ -514,10 +523,14 @@ def pdfNN_layer_generator(
             # TODO: this information should come from the basis information
             #       once the basis information is passed to this class
             list_of_pdf_layers = generate_dense_per_flavour_network(
-                inp, nodes, activations, initializer_name, seed=layer_seed, basis_size=last_layer_nodes,
+                inp,
+                nodes,
+                activations,
+                initializer_name,
+                seed=layer_seed,
+                basis_size=last_layer_nodes,
             )
 
-
         def dense_me(x):
             """Takes an input tensor `x` and applies all layers
             from the `list_of_pdf_layers` in order"""
@@ -531,7 +544,7 @@ def dense_me(x):
             return curr_fun
 
         # Preprocessing layer (will be multiplied to the last of the denses)
-        preproseed = seed + number_of_layers*(i+1)
+        preproseed = seed + number_of_layers * (i + 1)
         layer_preproc = Preprocessing(
             input_shape=(1,), name=f"pdf_prepro_{i}", flav_info=flav_info, seed=preproseed
         )
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 4016690cff..ec59f9e385 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -312,11 +312,13 @@ def performfit(
             """
         > > The stopping point has been at: {0} with a loss of {1}
                 which it got at {2}. Stopping degree {3}
+                Positivity status: {4}
                 """.format(
                 stopping_object.stop_epoch,
                 stopping_object.vl_chi2,
                 stopping_object.e_best_chi2,
                 stopping_object.stopping_degree,
+                stopping_object.positivity_status
             )
         )
 
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index 95dba489c3..a0e750e403 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -351,6 +351,10 @@ def save_best_replica(self, i, epoch=None):
         loss = self.get_state(epoch).vl_loss[i]
         self._replicas[i].register_best(loss, epoch)
 
+    def all_positivity_status(self):
+        """ Returns whether the positivity passed or not per replica """
+        return np.array([i.positivity_status for i in self._replicas])
+
     def all_best_vl_loss(self):
         """ Returns the best validation loss for each replica """
         return np.array([i.best_vl for i in self._replicas])
@@ -474,6 +478,12 @@ def stop_epoch(self):
         """ Epoch in which the fit is stopped """
         return self._history.final_epoch + 1
 
+    @property
+    def positivity_status(self):
+        """Returns POS_PASS if positivity passes or veto if it doesn't
+        for each replica"""
+        return self._history.all_positivity_status()
+
     def evaluate_training(self, training_model):
         """Given the training model, evaluates the
         model and parses the chi2 of the training datasets
@@ -535,7 +545,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         #         this means improving vl_chi2 and passing positivity
         # Don't start counting until the chi2 of the validation goes below a certain threshold
         # once we start counting, don't bother anymore
-        passes = self.count | ( fitstate.vl_chi2 < self.threshold_chi2 )
+        passes = self.count | (fitstate.vl_chi2 < self.threshold_chi2)
         passes &= fitstate.vl_loss < self._history.all_best_vl_loss()
         # And the ones that pass positivity
         passes &= self._positivity(fitstate)
@@ -659,17 +669,17 @@ def __init__(self, threshold, positivity_sets):
 
     def check_positivity(self, history_object):
         """
-        This function receives a history objects and loops over the
-        positivity_sets to check the value of the positivity loss.
-
-        If the positivity loss is above the threshold, the positivity fails
-        otherwise, it passes.
-        It returns an array booleans which are True if positivity passed
-
-        Parameters
-        ----------
-            history_object: dict
-                dictionary of entries in the form  {'name': loss}, output of a MetaModel .fit()
+                This function receives a history objects and loops over the
+                positivity_sets to check the value of the positivity loss.
+
+                If the positivity loss is above the threshold, the positivity fails
+                otherwise, it passes.
+                It returns an array booleans which are True if positivity passed
+        story_object[key_loss] < self.threshold
+                Parameters
+                ----------
+                    history_object: dict
+                        dictionary of entries in the form  {'name': loss}, output of a MetaModel .fit()
         """
         positivity_pass = True
         for key in self.positivity_sets:
diff --git a/n3fit/src/n3fit/tests/test_losses.py b/n3fit/src/n3fit/tests/test_losses.py
index 44a51e9fc3..784c9aeed7 100644
--- a/n3fit/src/n3fit/tests/test_losses.py
+++ b/n3fit/src/n3fit/tests/test_losses.py
@@ -13,7 +13,7 @@
 
 # Tests loss functions
 def test_l_invcovmat():
-    loss_f = losses.L_invcovmat(INVCOVMAT, ARR1)
+    loss_f = losses.LossInvcovmat(INVCOVMAT, ARR1)
     # Add a replica and batch dimension to T2
     result = loss_f(np.expand_dims(ARR2, [0, 1]))
     y = ARR1 - ARR2
@@ -24,7 +24,7 @@ def test_l_invcovmat():
 
 def test_l_positivity():
     alpha = 1e-7
-    loss_f = losses.L_positivity(alpha=alpha)
+    loss_f = losses.LossPositivity(alpha=alpha)
     result = loss_f(np.expand_dims(ARR2, [0, 1]))
 
     def elu_sum(yarr_in):

From f7ba7e4e28dbf30e90e39dee190c4a12ba8b9f1d Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 5 Jan 2021 13:37:23 +0100
Subject: [PATCH 18/27] rebase to test with json

---
 n3fit/src/n3fit/io/writer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index 8c325b2910..22f2ecd530 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -96,7 +96,7 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         # export all metadata from the fit to a single yaml file
         output_file = f"{replica_path_set}/{fitname}.json"
         json_dict = jsonfit(
-            replica_status, self.pdf_object, tr_chi2, true_chi2, stop_epoch, self.timings
+            replica_status, self.pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, self.timings
         )
         with open(output_file, "w") as fs:
             json.dump(json_dict, fs, indent=2, cls = SuperEncoder)
@@ -110,7 +110,7 @@ def default(self, o):
         return super().default(o)
 
 
-def jsonfit(replica_status, pdf_object, tr_chi2, true_chi2, epoch_stop, timing):
+def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, epoch_stop, timing):
     """Generates a dictionary containing all relevant metadata for the fit
 
     Parameters
@@ -136,7 +136,7 @@ def jsonfit(replica_status, pdf_object, tr_chi2, true_chi2, epoch_stop, timing):
     all_info["stop_epoch"] = epoch_stop
     all_info["best_epoch"] = replica_status.best_epoch
     all_info["erf_tr"] = tr_chi2
-    all_info["erf_vl"] = replica_status.best_vl
+    all_info["erf_vl"] = vl_chi2
     all_info["chi2"] = true_chi2
     all_info["pos_state"] = replica_status.positivity_status
     all_info["arc_lengths"] = pdf_object.compute_arclength().tolist()

From a559be12c772a976c38d43b3e426e0f7abf52d46 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 5 Jan 2021 19:35:48 +0100
Subject: [PATCH 19/27] changes to ensure everything (hyperopt, kfolding,
 diagonal covmat) works

---
 n3fit/runcards/Basic_runcard_parallel.yml     | 106 ++++++
 n3fit/runcards/DIS_diagonal_l2reg_example.yml |   2 +-
 n3fit/src/n3fit/ModelTrainer.py               | 321 +++++++-----------
 .../n3fit/backends/keras_backend/MetaModel.py |  96 ++----
 .../backends/keras_backend/operations.py      |   4 +-
 n3fit/src/n3fit/checks.py                     |   4 +-
 n3fit/src/n3fit/io/writer.py                  |   6 +-
 n3fit/src/n3fit/layers/Rotations.py           |   2 +-
 n3fit/src/n3fit/layers/losses.py              |  71 +++-
 n3fit/src/n3fit/model_gen.py                  |  52 +--
 n3fit/src/n3fit/msr.py                        |  17 +-
 n3fit/src/n3fit/performfit.py                 |   5 +-
 12 files changed, 355 insertions(+), 331 deletions(-)
 create mode 100644 n3fit/runcards/Basic_runcard_parallel.yml

diff --git a/n3fit/runcards/Basic_runcard_parallel.yml b/n3fit/runcards/Basic_runcard_parallel.yml
new file mode 100644
index 0000000000..52fdcae4a3
--- /dev/null
+++ b/n3fit/runcards/Basic_runcard_parallel.yml
@@ -0,0 +1,106 @@
+#
+# Configuration file for n3fit
+#
+
+############################################################
+description: Basic runcard
+
+############################################################
+# frac: training fraction
+# ewk: apply ewk k-factors
+# sys: systematics treatment (see systypes)
+dataset_inputs:
+- { dataset: SLACP, frac: 0.5}
+- { dataset: NMC,   frac: 0.5  }
+- { dataset: NMCPD, frac: 0.5 }
+- { dataset: CMSJETS11, frac: 0.5, sys: 10 }
+
+############################################################
+datacuts:
+  t0pdfset     : NNPDF31_nlo_as_0118 # PDF set to generate t0 covmat
+  q2min        : 3.49                # Q2 minimum
+  w2min        : 12.5                # W2 minimum
+  combocuts    : NNPDF31             # NNPDF3.0 final kin. cuts
+  jetptcut_tev : 0                   # jet pt cut for tevatron
+  jetptcut_lhc : 0                   # jet pt cut for lhc
+  wptcut_lhc   : 30.0                # Minimum pT for W pT diff distributions
+  jetycut_tev  : 1e30                # jet rap. cut for tevatron
+  jetycut_lhc  : 1e30                # jet rap. cut for lhc
+  dymasscut_min: 0                   # dy inv.mass. min cut
+  dymasscut_max: 1e30                # dy inv.mass. max cut
+  jetcfactcut  : 1e30                # jet cfact. cut
+
+############################################################
+theory:
+  theoryid: 53        # database id
+
+############################################################
+fitting:
+  trvlseed: 1
+  nnseed: 2
+  mcseed: 3
+
+  genrep: False     # true = generate MC replicas, false = use real data
+
+  parameters: # This defines the parameter dictionary that is passed to the Model Trainer
+    nodes_per_layer: [15, 10, 8]
+    activation_per_layer: ['sigmoid', 'sigmoid', 'linear']
+    initializer: 'glorot_normal'
+    optimizer:
+      optimizer_name: 'RMSprop'
+      learning_rate: 0.01
+      clipnorm: 1.0
+    epochs: 900
+    positivity:
+      multiplier: 1.05 # When any of the multiplier and/or the initial is not set
+      initial: # the poslambda will be used instead to compute these values per dataset
+      threshold: 1e-5
+    stopping_patience: 0.30 # percentage of the number of epochs
+    layer_type: 'dense'
+    dropout: 0.0
+    threshold_chi2: 5.0
+
+  # NN23(QED) = sng=0,g=1,v=2,t3=3,ds=4,sp=5,sm=6,(pht=7)
+  # EVOL(QED) = sng=0,g=1,v=2,v3=3,v8=4,t3=5,t8=6,(pht=7)
+  # EVOLS(QED)= sng=0,g=1,v=2,v8=4,t3=4,t8=5,ds=6,(pht=7)
+  # FLVR(QED) = g=0, u=1, ubar=2, d=3, dbar=4, s=5, sbar=6, (pht=7)
+  fitbasis: NN31IC # EVOL (7), EVOLQED (8), etc.
+  basis:
+      # remeber to change the name of PDF accordingly with fitbasis
+      # pos: True for NN squared
+      # mutsize: mutation size
+      # mutprob: mutation probability
+      # smallx, largex: preprocessing ranges
+      - { fl: sng, pos: False, mutsize: [15], mutprob: [0.05], smallx: [1.05,1.19], largex: [1.47,2.70], trainable: False }
+      - { fl: g,   pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.94,1.25], largex: [0.11,5.87], trainable: False }
+      - { fl: v,   pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.54,0.75], largex: [1.15,2.76], trainable: False }
+      - { fl: v3,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.21,0.57], largex: [1.35,3.08] }
+      - { fl: v8,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.52,0.76], largex: [0.77,3.56], trainable: True }
+      - { fl: t3,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [-0.37,1.52], largex: [1.74,3.39] }
+      - { fl: t8,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.56,1.29], largex: [1.45,3.03] }
+      - { fl: cp,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.12,1.19], largex: [1.83,6.70] }
+
+############################################################
+positivity:
+  posdatasets:
+    - { dataset: POSF2U,   poslambda: 1e6 }  # Positivity Lagrange Multiplier
+    - { dataset: POSFLL,   poslambda: 1e4 }
+
+############################################################
+integrability:
+  integdatasets:
+    - {dataset: INTEGXT3,   poslambda: 1e2}
+
+############################################################
+lhagrid:
+  nx  : 150
+  xmin: 1e-9
+  xmed: 0.1
+  xmax: 1.0
+  nq  : 50
+  qmax: 1e5
+
+############################################################
+debug: True
+maxcores: 8
+parallel_models: 2
diff --git a/n3fit/runcards/DIS_diagonal_l2reg_example.yml b/n3fit/runcards/DIS_diagonal_l2reg_example.yml
index f392199f07..7b56a2fdc9 100644
--- a/n3fit/runcards/DIS_diagonal_l2reg_example.yml
+++ b/n3fit/runcards/DIS_diagonal_l2reg_example.yml
@@ -76,7 +76,7 @@ fitting:
     optimizer:
       learning_rate: 1.0
       optimizer_name: 'Adadelta'
-    epochs: 40000
+    epochs: 4000
     positivity:
       multiplier: 1.09
       initial: 10.0
diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index e307be3401..755011bbfb 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -9,6 +9,7 @@
     between iterations while at the same time keeping the amount of redundant calls to a minimum
 """
 import logging
+from itertools import zip_longest
 import numpy as np
 import n3fit.model_gen as model_gen
 from n3fit.backends import MetaModel, clear_backend_state, operations, callbacks
@@ -29,101 +30,12 @@
 PUSH_INTEGRABILITY_EACH = 100
 
 
-def _assign_data_to_model(model, data_dict, fold_k=0):
-    """
-    Reads the data dictionary (``data_dict``) and assings the target data to the model
-    It returns a dictionary containing:
-    {
-        'model': the backend.MetaModel to be trained
-        'target_ndata': an array of target output
-        'ndata': the number of datapoints
-        'losses': the list of loss functions of the model
-    }
-
-    If kfolding is active applies the (``fold_k``) fold to the target data.
-    in this case ndata is a count of the non_zero entries of the fold
-
-    Note: this function is transitional. Eventually a validphys action should
-    provide an experiment object with a .target_data(fold_indx) method which
-    should return the necessary information:
-        - number of datapoints
-        - target data with the right entries set to 0*
-    *or masked away if it is able to also return a list of loss functions that will mask away
-    the corresponding entries of the prediction
-
-
-    Parameters
-    ----------
-        model: backend.MetaModel
-            model to be added to the dictionary
-        data_dict: dict
-            dictionary containing: {
-                'expdata' : list of experimental data which the model will target,
-                'folds' : a list (size=expdata) of lists (size=kfolds) with the folding masks)
-                'losses': a list of loss functions for the model
-                }
-        fold_k: int
-            when kfolding, index of the fold, so that for every experiment we apply the
-            folds[index_experiment][mask]
-
-    Returns
-    -------
-        ret: dict
-            dictionary containing the model and its associated ndata and loss
-    """
-    # Array with all data
-    all_data = data_dict["expdata"]
-    # Each element of this list correspond to the set of folds for one experiment
-    all_folds = data_dict["folds"]
-    n_exps = len(all_folds)
-    # Now set to 0 the data folded away
-    active_data = []
-    ndata = 0
-    for exp_data, exp_fold in zip(all_data, all_folds):
-        if exp_fold:
-            mask = exp_fold[fold_k]
-            active_data.append(exp_data * mask)
-            ndata += np.count_nonzero(mask)
-        else:
-            active_data.append(exp_data)
-            ndata += exp_data.size
-    # There might be special outputs (like positivitiy) that is not
-    # affected by the folding.
-    # They don't count for the chi2 (which is only for reporting)
-    active_data += all_data[n_exps:]
-    ret = {
-        "model": model,
-        "target_data": active_data,
-        "ndata": ndata,
-        "losses": data_dict["losses"],
-    }
-    return ret
-
-
-def _model_compilation(models, optimizer_params):
-    """
-        Compiles all models
-
-    Parameters
-    ----------
-        models: list(dict)
-            A ditionary defining the model
-        optimizer_params: dict
-            Optimizer parameters to be passes to the compile method of the model
-    """
-    for _, model_dict in models.items():
-        model = model_dict["model"]
-        target = model_dict["target_data"]
-        losses = model_dict["losses"]
-        model.compile(loss=losses, target_output=target, **optimizer_params)
-
-
-def _pdf_injection(pdf_layers, observables, datasets_out=None):
+def _pdf_injection(pdf_layers, observables, masks):
     """
     Takes as input a list of output layers and returns a corresponding list
     where all output layers call the pdf layer at self.pdf_layer
     """
-    return [f(x, datasets_out=datasets_out) for f, x in zip(observables, pdf_layers)]
+    return [f(x, mask=m) for f, x, m in zip_longest(observables, pdf_layers, masks)]
 
 
 def _LM_initial_and_multiplier(input_initial, input_multiplier, max_lambda, steps):
@@ -183,16 +95,34 @@ def __init__(
         """
         Parameters
         ----------
-            exp_info: list of dictionaries containing experiments
-            pos_info: list of dictionaries containing positivity sets
-            integ_info: list of dictionaries containing integrability sets
-            flavinfo: the object returned by fitting['basis']
-            nnseed: the seed used to initialise the Neural Network, will be passed to model_gen
-            pass_status: flag to signal a good run
-            failed_status: flag to signal a bad run
-            debug: flag to activate some debug options
-            model_file: str whether to save the models
-            sum_rules: str whether sum rules should be enabled (All, MSR, VSR, False)
+            exp_info: list
+                list of dictionaries containing experiments
+            pos_info: list
+                list of dictionaries containing positivity sets
+            integ_info: list
+                list of dictionaries containing integrability sets
+            flavinfo: list
+                the object returned by fitting['basis']
+            fitbasis: str
+                the name of the basis being fitted
+            nnseed: int
+                the seed used to initialise the Neural Network, will be passed to model_gen
+            pass_status: str
+                flag to signal a good run
+            failed_status: str
+                flag to signal a bad run
+            debug: bool
+                flag to activate some debug options
+            kfold_parameters: dict
+                parameters defining the kfolding method
+            max_cores: int
+                maximum number of cores the fitting can use to run
+            model_file: str
+                whether to save the models
+            sum_rules: str
+		whether sum rules should be enabled (All, MSR, VSR, False)
+            parallel_models: int
+                number of models to fit in parallel
         """
 
         # Save all input information
@@ -249,7 +179,6 @@ def __init__(
         self.training = {
             "output": [],
             "expdata": [],
-            "losses": [],
             "ndata": 0,
             "model": None,
             "posdatasets": [],
@@ -263,7 +192,6 @@ def __init__(
         self.validation = {
             "output": [],
             "expdata": [],
-            "losses": [],
             "ndata": 0,
             "model": None,
             "folds": [],
@@ -272,13 +200,10 @@ def __init__(
         self.experimental = {
             "output": [],
             "expdata": [],
-            "losses": [],
             "ndata": 0,
             "model": None,
             "folds": [],
         }
-        self.model_dicts = None
-
         self._fill_the_dictionaries()
 
         if self.validation["ndata"] == 0:
@@ -357,33 +282,37 @@ def _fill_the_dictionaries(self):
                 self.training["expdata"].append(integ_dict["expdata"])
                 self.training["integdatasets"].append(integ_dict["name"])
 
-    def _model_generation(self, pdf_models, partition):
+    def _model_generation(self, pdf_models, partition, partition_idx):
         """
         Fills the three dictionaries (``training``, ``validation``, ``experimental``)
-        with the ``model`` entry
-
+        with the ``model`` entry.
 
         Compiles the validation and experimental models with fakes optimizers and learning rate
         as they are never trained, but this is needed by some backends
-        in order to run evaluate on them
+        in order to run evaluate on them.
 
         Before entering this function the dictionaries contain a list of inputs
         and a list of outputs, but they are not connected.
         This function connects inputs with outputs by injecting the PDF.
         At this point we have a PDF model that takes an input (1, None, 1)
-        and outputs in return (1, none, 14)
+        and outputs in return (1, none, 14).
 
         The injection of the PDF is done by concatenating all inputs and calling
         pdf_model on it.
         This in turn generates an output_layer that needs to be splitted for every experiment
         as we have a set of observable "functions" that each take (1, exp_xgrid_size, 14)
         and output (1, masked_ndata) where masked_ndata can be the training/validation
-        or the experimental mask (in which cased masked_ndata == ndata)
+        or the experimental mask (in which cased masked_ndata == ndata).
+
+        Several models can be fitted at once by passing a list of models with a shared input
+        this function will give the same input to every model and will concatenate the output at the end
+        so that the final output of the model is (1, None, 14, n) (with n=number of parallel models)
+
 
         Parameters
         ----------
-            pdf_model: MetaModel
-                model producing pdf values
+            pdf_models: list(n3fit.backend.MetaModel)
+                a list of models that produce PDF values
 
         Returns
         -------
@@ -424,21 +353,28 @@ def _model_generation(self, pdf_models, partition):
 
         # If we are in a kfolding partition, select which datasets are out
         if partition:
-            kfold_datasets = partition["datasets"]
-            negate_k_datasets = [d for d in self.all_datasets if d not in kfold_datasets]
+            training_mask = [i[partition_idx] for i in self.training["folds"]]
+            validation_mask = [i[partition_idx] for i in self.validation["folds"]]
+            experimental_mask = [i[partition_idx] for i in self.experimental["folds"]]
         else:
-            kfold_datasets = None
-            negate_k_datasets = None
+            training_mask = validation_mask = experimental_mask = [None]
 
         # Training and validation leave out the kofld dataset
         # experiment leaves out the negation
-        output_tr = _pdf_injection(splitted_pdf, self.training["output"], kfold_datasets)
+        output_tr = _pdf_injection(splitted_pdf, self.training["output"], training_mask)
         training = MetaModel(full_model_input_dict, output_tr)
 
-        output_vl = _pdf_injection(splitted_pdf, self.validation["output"], kfold_datasets)
+        # For validation we don't have integrability
+        n_val = len(self.validation["output"])
+        output_vl = _pdf_injection(splitted_pdf[:n_val], self.validation["output"], validation_mask)
         validation = MetaModel(full_model_input_dict, output_vl)
 
-        output_ex = _pdf_injection(splitted_pdf, self.experimental["output"], negate_k_datasets)
+        # And for experimental we don't have positivity
+        # TODO: risky to rely that much in the order
+        n_exps = len(self.experimental["output"])
+        output_ex = _pdf_injection(
+            splitted_pdf[:n_exps], self.experimental["output"], experimental_mask
+        )
 
         experimental = MetaModel(full_model_input_dict, output_ex)
 
@@ -463,7 +399,7 @@ def _reset_observables(self):
         """
         self.input_list = []
         self.input_sizes = []
-        for key in ["output", "losses", "posmultipliers", "integmultipliers"]:
+        for key in ["output", "posmultipliers", "integmultipliers"]:
             self.training[key] = []
             self.validation[key] = []
             self.experimental[key] = []
@@ -519,10 +455,6 @@ def _generate_observables(
             self.validation["output"].append(exp_layer["output_vl"])
             self.experimental["output"].append(exp_layer["output"])
 
-            self.training["losses"].append(exp_layer["loss_tr"])
-            self.validation["losses"].append(exp_layer["loss_vl"])
-            self.experimental["losses"].append(exp_layer["loss"])
-
         # Generate the positivity penalty
         for pos_dict in self.pos_info:
             if not self.mode_hyperopt:
@@ -542,9 +474,7 @@ def _generate_observables(
 
             # The positivity should be on both training and validation models
             self.training["output"].append(pos_layer["output_tr"])
-            self.training["losses"].append(pos_layer["loss_tr"])
             self.validation["output"].append(pos_layer["output_tr"])
-            self.validation["losses"].append(pos_layer["loss_tr"])
 
             self.training["posmultipliers"].append(pos_multiplier)
             self.training["posinitials"].append(pos_initial)
@@ -571,7 +501,6 @@ def _generate_observables(
 
                 # The integrability all falls to the training
                 self.training["output"].append(integ_layer["output_tr"])
-                self.training["losses"].append(integ_layer["loss_tr"])
                 self.training["integmultipliers"].append(integ_multiplier)
                 self.training["integinitials"].append(integ_initial)
 
@@ -634,29 +563,10 @@ def _generate_pdf(
             regularizer=regularizer,
             regularizer_args=regularizer_args,
             impose_sumrule=self.impose_sumrule,
-            parallel_models=self.parallel_models
+            parallel_models=self.parallel_models,
         )
         return pdf_models
 
-    def _assign_data(self, models, fold_k=0):
-        """Assign to each model the data to compare with as well as the
-        number of data points in the model.
-        In the most general case training and validation get assigned the replic'd data
-        while experimental gets the actual data.
-        In the kfolding case (i.e, partition != None), they all receive the same data
-        but the folded data is set to 0 for training and validation
-        """
-        training = _assign_data_to_model(models["training"], self.training, fold_k)
-        validation = _assign_data_to_model(models["validation"], self.validation, fold_k)
-        experimental = _assign_data_to_model(models["experimental"], self.experimental, fold_k)
-
-        ret = {
-            "training": training,
-            "validation": validation,
-            "experimental": experimental,
-        }
-        return ret
-
     def _prepare_reporting(self, partition):
         """Parses the information received by the :py:class:`n3fit.ModelTrainer.ModelTrainer`
         to select the bits necessary for reporting the chi2.
@@ -706,9 +616,11 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100):
             callbacks=self.callbacks + [callback_st, callback_pos, callback_integ],
         )
 
-        # TODO: this needs to be changed for hyperopt
-        return self.pass_status
-#         return self.failed_status
+        # TODO: in order to use multireplica in hyperopt is is necessary to define what "passing" means
+        # for now consider the run as good if any replica passed
+        if any([bool(i) for i in stopping_object.e_best_chi2]):
+            return self.pass_status
+        return self.failed_status
 
     def _hyperopt_override(self, params):
         """ Unrolls complicated hyperopt structures into very simple dictionaries"""
@@ -719,8 +631,8 @@ def _hyperopt_override(self, params):
                 for key, value in item.items():
                     params[key] = value
 
-    def enable_tensorboard(self, logdir, weight_freq = 0, profiling=False):
-        """ Enables tensorboard callback for further runs of the fitting procedure 
+    def enable_tensorboard(self, logdir, weight_freq=0, profiling=False):
+        """Enables tensorboard callback for further runs of the fitting procedure
 
         Parameters
         ----------
@@ -731,7 +643,9 @@ def enable_tensorboard(self, logdir, weight_freq = 0, profiling=False):
             profiling: bool
                 flag to enable the tensorboard profiler
         """
-        callback_tb = callbacks.gen_tensorboard_callback(logdir, profiling=profiling, histogram_freq=weight_freq)
+        callback_tb = callbacks.gen_tensorboard_callback(
+            logdir, profiling=profiling, histogram_freq=weight_freq
+        )
         self.callbacks.append(callback_tb)
 
     def evaluate(self, stopping_object):
@@ -749,15 +663,13 @@ def evaluate(self, stopping_object):
             val_chi2 : chi2 of the validation set
             exp_chi2: chi2 of the experimental data (without replica or tr/vl split)
         """
-        if self.model_dicts is None:
+        if self.training["model"] is None:
             raise RuntimeError("Modeltrainer.evaluate was called before any training")
         # Needs to receive a `stopping_object` in order to select the part of the
         # training and the validation which are actually `chi2` and not part of the penalty
-        training = self.model_dicts["training"]
-        train_chi2 = stopping_object.evaluate_training(training["model"])
+        train_chi2 = stopping_object.evaluate_training(self.training["model"])
         val_chi2 = stopping_object.vl_chi2
-        experimental = self.model_dicts["experimental"]
-        exp_chi2 = experimental["model"].compute_losses()["loss"] / experimental["ndata"]
+        exp_chi2 = self.experimental["model"].compute_losses()["loss"] / self.experimental["ndata"]
         return train_chi2, val_chi2, exp_chi2
 
     def hyperparametrizable(self, params):
@@ -835,7 +747,7 @@ def hyperparametrizable(self, params):
 
             # Model generation joins all the different observable layers
             # together with pdf model generated above
-            models = self._model_generation(pdf_models, partition)
+            models = self._model_generation(pdf_models, partition, k)
 
             # Only after model generation, apply possible weight file
             # TODO: not sure whether it is a good idea that all of them start at the same point
@@ -850,17 +762,12 @@ def hyperparametrizable(self, params):
                 initial_values = self.training["posinitials"] + self.training["posinitials"]
                 models["training"].reset_layer_weights_to(pos_and_int, initial_values)
 
-            # Assign data to each model
-            # model dicts is similar to model but includes information about
-            # the target data and number of points
-            model_dicts = self._assign_data(models, k)
-
             # Generate the list containing reporting info necessary for chi2
             reporting = self._prepare_reporting(partition)
 
             if self.no_validation:
                 # Substitute the validation model with the training model
-                model_dicts["validation"] = model_dicts["training"]
+                models["validation"] = models["training"]
                 validation_model = models["training"]
             else:
                 validation_model = models["validation"]
@@ -874,11 +781,12 @@ def hyperparametrizable(self, params):
                 total_epochs=epochs,
                 stopping_patience=stopping_epochs,
                 threshold_positivity=threshold_pos,
-                threshold_chi2=threshold_chi2
+                threshold_chi2=threshold_chi2,
             )
 
             # Compile each of the models with the right parameters
-            _model_compilation(model_dicts, params["optimizer"])
+            for model in models.values():
+                model.compile(**params["optimizer"])
 
             passed = self._train_and_fit(
                 models["training"],
@@ -886,14 +794,18 @@ def hyperparametrizable(self, params):
                 epochs=epochs,
             )
 
-            # Save validation chi2
-            validation_loss = stopping_object.vl_chi2
+            if self.mode_hyperopt:
+                # TODO: currently only working for one single replica
+                # If doing a hyperparameter scan we need to keep track at this point of the loss function
+                validation_loss = stopping_object.vl_chi2
 
-            # Compute experimental loss
-            exp_loss_raw = np.take(models["experimental"].compute_losses()["loss"], -1)
-            experimental_loss = exp_loss_raw / model_dicts["experimental"]["ndata"]
+                # Compute experimental loss
+                exp_loss_raw = np.take(models["experimental"].compute_losses()["loss"], -1)
+                # And divide by the number of active points in this fold
+                # it would be nice to have a ndata_per_fold variable coming in the vp object...
+                ndata = np.sum([np.count_nonzero(i[k]) for i in self.experimental["folds"]])
+                experimental_loss = exp_loss_raw / ndata
 
-            if self.mode_hyperopt:
                 hyper_loss = experimental_loss
                 if passed != self.pass_status:
                     log.info("Hyperparameter combination fail to find a good fit, breaking")
@@ -902,46 +814,45 @@ def hyperparametrizable(self, params):
                 for penalty in self.hyper_penalties:
                     hyper_loss += penalty(pdf_model, stopping_object)
                 l_hyper.append(hyper_loss)
-                log.info("Fold %d finished, loss=%.1f, pass=%s", k+1, hyper_loss, passed)
+                log.info("Fold %d finished, loss=%.1f, pass=%s", k + 1, hyper_loss, passed)
                 if hyper_loss > self.hyper_threshold:
-                    log.info("Loss above threshold (%.1f > %.1f), breaking", hyper_loss, self.hyper_threshold)
+                    log.info(
+                        "Loss above threshold (%.1f > %.1f), breaking",
+                        hyper_loss,
+                        self.hyper_threshold,
+                    )
                     # Apply a penalty proportional to the number of folds that have not been computed
                     pen_mul = len(self.kpartitions) - k
                     l_hyper = [i * pen_mul for i in l_hyper]
                     break
 
-            # Save all losses
-            l_valid.append(validation_loss)
-            l_exper.append(experimental_loss)
-
-        dict_out = {
-            "status": passed,
-            "validation_loss": np.average(l_valid),
-            "experimental_loss": np.average(l_exper),
-        }
+                # Save all losses
+                l_valid.append(validation_loss)
+                l_exper.append(experimental_loss)
 
         if self.mode_hyperopt:
-            dict_out["loss"] = self.hyper_loss(l_hyper)
-            dict_out["kfold_meta"] = {
-                "validation_losses": l_valid,
-                "experimental_losses": l_exper,
-                "hyper_losses": l_hyper,
+            # Hyperopt needs a dictionary with information about the losses
+            # it is possible to store arbitrary information in the trial file by adding it to this dictionary
+            dict_out = {
+                "status": passed,
+                "loss": self.hyper_loss(l_hyper),
+                "validation_loss": np.average(l_valid),
+                "experimental_loss": np.average(l_exper),
+                "kfold_meta": {
+                    "validation_losses": l_valid,
+                    "experimental_losses": l_exper,
+                    "hyper_losses": l_hyper,
+                },
             }
-            # If we are using hyperopt we don't need to output any other information
             return dict_out
 
-        dict_out["loss"] = experimental_loss
-
-        # Add to the output dictionary things that are needed by performfit.py
-        # to generate the output pdf, check the arc-length, gather stats, etc
-        # some of them are already attributes of the class so they are redundant here
-        # but I think it's good to present them explicitly
-        dict_out["stopping_object"] = stopping_object
-        dict_out["experimental"] = self.experimental
-        dict_out["training"] = self.training
-        dict_out["pdf_models"] = pdf_models
-
-        # Only after the training has finished, we save all models for future reporting
-        self.model_dicts = model_dicts
+        # Keep a reference to the models after training for future reporting
+        self.training["model"] = models["training"]
+        self.experimental["model"] = models["experimental"]
+        self.validation["model"] = models["validation"]
 
+        # In a normal run, the only information we need to output is the stopping object
+        # (which contains metadata about the stopping)
+        # and the pdf models (which are used to generate the PDF grids and compute arclengths)
+        dict_out = {"status": passed, "stopping_object": stopping_object, "pdf_models": pdf_models}
         return dict_out
diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index 99f4a640db..23b3125cd5 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -10,14 +10,7 @@
 from tensorflow.keras.models import Model
 from tensorflow.keras import optimizers as Kopt
 from tensorflow.python.keras.utils import tf_utils
-from n3fit.backends.keras_backend.operations import numpy_to_tensor
-
-# Check the TF version to check if legacy-mode is needed (TF < 2.2)
-tf_version = tf.__version__.split('.')
-if int(tf_version[0]) == 2 and int(tf_version[1]) < 2:
-    LEGACY = True
-else:
-    LEGACY = False
+from n3fit.backends.keras_backend import operations as op
 
 # Define in this dictionary new optimizers as well as the arguments they accept
 # (with default values if needed be)
@@ -36,6 +29,12 @@
     v[1]["clipnorm"] = 1.0
 
 
+def _default_loss(y_true, y_pred):
+    """ Default loss to be used when the model is compiled with loss = Null
+    (for instance if the prediction of the model is already the loss """
+    return op.sum(y_pred)
+
+
 def _fill_placeholders(original_input, new_input=None):
     """
     Fills the placeholders of the original input with a new set of input
@@ -114,7 +113,7 @@ def __init__(self, input_tensors, output_tensors, **kwargs):
             # otherwise, put a placeholder None as it will come from the outside
             name = input_tensor.name.rsplit(":",1)[0]
             try:
-                self.x_in[name] = numpy_to_tensor(input_tensor.tensor_content)
+                self.x_in[name] = op.numpy_to_tensor(input_tensor.tensor_content)
                 self.tensors_in[name] = input_tensor
             except AttributeError:
                 self.x_in[name] = None
@@ -123,7 +122,6 @@ def __init__(self, input_tensors, output_tensors, **kwargs):
         self.all_inputs = input_list
         self.all_outputs = output_list
         self.target_tensors = None
-        self.eval_fun = None
         self.compute_losses_function = None
 
     def _parse_input(self, extra_input=None, pass_content=True):
@@ -159,7 +157,7 @@ def perform_fit(self, x=None, y=None, epochs=1, **kwargs):
         x = self._parse_input(self.x_in)
         if y is None:
             y = self.target_tensors
-        history = super().fit(x=x, y=y, epochs=epochs, **kwargs,)
+        history = self.fit(x=x, y=y, epochs=epochs, **kwargs)
         loss_dict = history.history
         return loss_dict
 
@@ -176,7 +174,7 @@ def compute_losses(self):
         The losses reported in the ``evaluate`` method for n3fit are, however, summed over replicas.
         Instead the loss we are interested in is usually the output of the model (i.e., predict)
 
-        This function then generates a dictionary of partial losses of the model separated per replica.
+        This function then generates a dict of partial losses of the model separated per replica.
         i.e., the output for experiment {'LHC_exp'} will be an array of Nrep elements.
 
         Returns
@@ -184,8 +182,9 @@ def compute_losses(self):
             dict
                 a dictionary with all partial losses of the model
         """
-        # TODO might not work for TF < 2.2, we might not care either
+        # TODO might not work for TF < 2.2
         if self.compute_losses_function is None:
+            # If it is the first time we are passing through, compile the function and save it
             out_names = [f"{i}_loss" for i in self.output_names]
             out_names.insert(0, "loss")
 
@@ -203,8 +202,11 @@ def losses_fun():
             self.compute_losses_function = losses_fun
 
         ret = self.compute_losses_function()
-        # undocumented TF function that converts all the tensors from the ret dictionary to numpy arrays
-        # if it dissapears, equivalent to {k: i.numpy() for k, i in ret.items()}
+
+        # The output of this function is to be used by python (and numpy) so we need to convert
+        # the tensorflow variable to python primitives or numpy arrays.
+        # Undocumented TF function that converts all the tensors from the ret dictionary to numpy arrays
+        # if it dissapears, equivalent for us to {k: i.numpy() for k, i in ret.items()}
         return tf_utils.to_numpy_or_python_type(ret)
 
     def compile(
@@ -248,6 +250,9 @@ def compile(
                 f"[MetaModel.select_initializer] optimizer not implemented: {optimizer_name}"
             ) from e
 
+        if loss is None:
+            loss = _default_loss
+
         opt_function = opt_tuple[0]
         opt_args = opt_tuple[1]
 
@@ -261,69 +266,16 @@ def compile(
         # Instantiate the optimizer
         opt = opt_function(**opt_args)
 
-        # If given target output, compile it together with the model for better performance
-        if target_output is not None:
+        # If given target output is None, target_output is unnecesary, save just a zero per output
+        if target_output is None:
+            self.target_tensors = [np.zeros((1,1)) for i in self.output_shape]
+        else:
             if not isinstance(target_output, list):
                 target_output = [target_output]
-            # Tensorize
             self.target_tensors = target_output
 
-        # Reset the evaluation function (if any)
-        self.eval_fun = None
-
         super(MetaModel, self).compile(optimizer=opt, loss=loss)
 
-    def make_test_function(self):
-        """
-        If the model has been compiled in the normal NNPDF way,
-        then the output of the prediction is already the loss, so we skip this part
-        by just summing over predictions.
-
-        Otherwise, just return the usual TF make_test_function
-        """
-
-        if self.eval_fun is not None:
-            return self.eval_fun
-
-        if self.target_tensors is None:
-            return super().make_test_function()
-
-        @tf.function
-        def eval_fun(*args):
-            predictions = self(self._parse_input(None))
-            return tf.reduce_sum(predictions)
-
-        self.eval_fun = eval_fun
-        return eval_fun
-
-        # Recover the target tensors and their lengths, we cannot rely
-        # directly on the output from the model as we might have target_tensors
-        # with 0 data points (if the tr/vl mask covers the whole set)
-        lens = []
-        tt = []
-        for target in self.target_tensors:
-            lens.append(target.size)
-            tt.append(numpy_to_tensor(target))
-        # Save target_tensors as tensors, as it might be useful for LEGACY
-        self.target_tensors = tt
-
-        # Get the name of the output layer
-        # and add the suffix _loss to match TF behaviour
-        out_names = [f"{i}_loss" for i in self.output_names]
-        out_names.insert(0, "loss")
-
-        @tf.function
-        def eval_fun(*args):
-            predictions = self(self._parse_input(None))
-            loss_list = [lfun(target, pred) for target, pred, lfun in zip(tt, predictions, self.loss)]
-            ret = [tf.reduce_sum(loss_list)] + loss_list
-            return dict(zip(out_names, ret))
-
-        # Save the function so we don't go through this again
-        self.eval_fun = eval_fun
-
-        return eval_fun
-
     def set_masks_to(self, names, val=0.0):
         """ Set all mask value to the selected value
         Masks in MetaModel should be named {name}_mask
diff --git a/n3fit/src/n3fit/backends/keras_backend/operations.py b/n3fit/src/n3fit/backends/keras_backend/operations.py
index 35f6a632fb..3e2308e19c 100644
--- a/n3fit/src/n3fit/backends/keras_backend/operations.py
+++ b/n3fit/src/n3fit/backends/keras_backend/operations.py
@@ -266,7 +266,6 @@ def pdf_masked_convolution(raw_pdf, basis_mask):
     return pdf_x_pdf
 
 
-
 def tensor_product(*args, **kwargs):
     """
     Computes the tensordot product between tensor_x and tensor_y
@@ -283,7 +282,6 @@ def einsum(equation, *args, **kwargs):
     return tf.einsum(equation, *args, **kwargs)
 
 
-
 @tf.function(experimental_relax_shapes=True)
 def op_log(o_tensor, **kwargs):
     """
@@ -321,7 +319,7 @@ def scatter_to_one(values, indices=[[1]], output_dim=14):
 @tf.function
 def backend_function(fun_name, *args, **kwargs):
     """
-    Calls the (``fun_name``) backend function
+    Wrapper to call non-explicitly implemented backend functions by name: (``fun_name``)
     see full `docs <https://keras.io/api/utils/backend_utils/>`_ for some possibilities
     """
     fun = getattr(K, fun_name)
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index 31fdae7929..69c5c43290 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -355,7 +355,7 @@ def check_consistent_basis(fitting, theoryid):
 
 
 @make_argcheck
-def can_run_in_parallel(fitting, replica, parallel_models=1):
+def can_run_in_parallel(fitting, replica, hyperopt, parallel_models=1):
     """ Checks whether a runcard which is trying to run several replicas at once (parallel_models =/= 1) is valid
     """
     rp = len(replica)
@@ -364,6 +364,8 @@ def can_run_in_parallel(fitting, replica, parallel_models=1):
         raise CheckError("Can't run more than one replica at once if no replicas are to be generated")
     if parallel_models == 1:
         return
+    if hyperopt:
+        raise CheckError("Running replicas in parallel with hyperopt is still not supported")
     if genrep:
         raise CheckError("Replica generation is not supported yet for parallel models")
     if fitting["parameters"].get("layer_type") != "dense":
diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index 22f2ecd530..1b28a01de8 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -110,7 +110,7 @@ def default(self, o):
         return super().default(o)
 
 
-def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, epoch_stop, timing):
+def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, timing):
     """Generates a dictionary containing all relevant metadata for the fit
 
     Parameters
@@ -126,6 +126,8 @@ def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, epoch_stop,
             chi2 for the validation
         true_chi2: float
             chi2 for the exp (unreplica'd data)
+        epoch_stop: int
+            epoch at which the stopping stopped (not the one for the best fit!)
         timing: dict
             dictionary of the timing of the different events that happened
     """
@@ -133,7 +135,7 @@ def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, epoch_stop,
     # Generate preprocessing information
     all_info["preprocessing"] = pdf_object.get_preprocessing_factors()
     # .fitinfo-like info
-    all_info["stop_epoch"] = epoch_stop
+    all_info["stop_epoch"] = stop_epoch
     all_info["best_epoch"] = replica_status.best_epoch
     all_info["erf_tr"] = tr_chi2
     all_info["erf_vl"] = vl_chi2
diff --git a/n3fit/src/n3fit/layers/Rotations.py b/n3fit/src/n3fit/layers/Rotations.py
index 0ccca32c0b..c5f08a1a49 100644
--- a/n3fit/src/n3fit/layers/Rotations.py
+++ b/n3fit/src/n3fit/layers/Rotations.py
@@ -66,7 +66,7 @@ class FkRotation(MetaLayer):
     # i.e., create the matrix and inherit from the Rotation layer above
     def __init__(self, output_dim=14, name="evolution", **kwargs):
         self.output_dim = output_dim
-        super().__init__(name, **kwargs)
+        super().__init__(name=name, **kwargs)
 
     def call(self, pdf_raw):
         # Transpose the PDF so that the flavour index is the first one
diff --git a/n3fit/src/n3fit/layers/losses.py b/n3fit/src/n3fit/layers/losses.py
index 2d89c387c9..78b7f3e0ea 100644
--- a/n3fit/src/n3fit/layers/losses.py
+++ b/n3fit/src/n3fit/layers/losses.py
@@ -1,7 +1,13 @@
 """
     Module containg the losses to be apply to the models as layers
-"""
 
+    The layer take the input from the model and acts on it producing a score function.
+    For instance, in the case of the chi2 (``LossInvcovmat``) the function takes only
+    the prediction of the model and, during instantiation, took the real data to compare with
+    and the covmat.
+
+"""
+import numpy as np
 from n3fit.backends import MetaLayer
 from n3fit.backends import operations as op
 
@@ -10,20 +16,57 @@ class LossInvcovmat(MetaLayer):
     """
     Loss function such that:
     L = \sum_{ij} (yt - yp)_{i} invcovmat_{ij} (yt - yp)_{j}
+
+    Takes as argument the inverse of the covmat and the target data.
+    It also takes an optional argument to mask part of the predictions
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from n3fit.layers import losses
+    >>> C = np.random.rand(5,5)
+    >>> data = np.random.rand(1, 1, 5)
+    >>> pred = np.random.rand(1, 1, 5)
+    >>> invC = np.linalg.inv( C @ C.T)
+    >>> loss_f = losses.LossInvcovmat(invC, data)
+    >>> loss_f(pred).shape == 1
+    True
     """
 
-    def __init__(self, invcovmat, y_true, **kwargs):
+    def __init__(self, invcovmat, y_true, mask=None, **kwargs):
+        # If we have a diagonal matrix, padd with 0s and hope it's not too heavy on memory
+        if len(invcovmat.shape) == 1:
+            invcovmat = np.diag(invcovmat)
         self.invcovmat = op.numpy_to_tensor(invcovmat)
         self.y_true = op.numpy_to_tensor(y_true)
+        if mask is None or all(mask):
+            self.mask = None
+        else:
+            mask = np.array(mask, dtype=np.float32).reshape(1,1,-1)
+            self.mask = op.numpy_to_tensor(mask)
         super().__init__(**kwargs)
 
     def call(self, y_pred, **kwargs):
         tmp = self.y_true - y_pred
+        if self.mask is not None:
+            tmp = op.op_multiply([tmp, self.mask])
         res = op.einsum("bri, ij, brj -> r", tmp, self.invcovmat, tmp)
         return res
 
 
 class LossLagrange(MetaLayer):
+    """
+    Abstract loss function to apply lagrange multipliers to a model.
+
+        L = \lambda * f(y)
+
+    The form of f(y) is given by modifying the ``apply_loss`` method.
+    It is possible to modify how the multiplication of the lambda factor is implemented
+    by modifying the ``apply_multiplier`` method.
+
+    The (non trainable) weight containing the multiplier is named ``lagMult``.
+    """
+
     def __init__(self, c=1.0, **kwargs):
         self._initial_multiplier = c
         super().__init__(**kwargs)
@@ -46,7 +89,7 @@ def call(self, y_pred, **kwargs):
 
 class LossPositivity(LossLagrange):
     """
-    Returns L = elu(y_pred) (considers y_true as 0)
+    Returns L = \lambda*elu(y_pred)
 
     The positivity loss is computed by inverting the sign of the
     datapoints and then applying the elu function, this function is
@@ -55,6 +98,19 @@ class LossPositivity(LossLagrange):
     This is done to avoid a big discontinuity in the derivative at 0 when
     the lagrange multiplier is very big.
     In practice this function can produce results in the range (-alpha, inf)
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from n3fit.layers import losses
+    >>> pred = np.random.rand(1, 1, 5)
+    >>> alpha = 1e-7
+    >>> c = 1e8
+    >>> loss_f = losses.LossPositivity(c=c, alpha=alpha)
+    >>> loss_f(pred) == -5*alpha
+    True
+    >>> loss_f(-pred) > c
+    True
     """
 
     def __init__(self, alpha=1e-7, **kwargs):
@@ -70,6 +126,15 @@ def apply_loss(self, y_pred):
 class LossIntegrability(LossLagrange):
     """
     Returns L = (y_pred)*(y_pred)
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from n3fit.layers import losses
+    >>> pred = np.random.rand(1, 1, 5)
+    >>> loss_f = losses.LossIntegrability(c=1e2)
+    >>> loss_f(pred) > 0
+    True
     """
 
     def apply_loss(self, y_pred):
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 38468ac130..4484b42472 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -134,11 +134,6 @@ def observable_generator(
     def gen_concat():
         return operations.as_layer(operations.concatenate, op_kwargs={"axis": 2})
 
-    # Tensorflow operations have ugly name,
-    # we want the final observables to be named just {spec_name} (with'val/exp' if needed)
-    tr_name = spec_name
-    vl_name = f"{spec_name}_val"
-    ex_name = f"{spec_name}_exp"
 
     concat_ex = gen_concat()
     # For data transformation all concatenations are the same
@@ -167,12 +162,7 @@ def experiment_layer(
         else:
             split_pdf = [pdf]
         # every obs gets its share of the split
-        for partial_pdf, obs in zip(split_pdf, model_obs):
-            obs_output = obs(partial_pdf)
-            if datasets_out and obs.name[4:] in datasets_out:
-                mask_out = Mask(c=0.0, name=f"zero_{obs.name}")
-                obs_output = mask_out(obs_output)
-            output_layers.append(obs_output)
+        output_layers = [obs(p_pdf) for p_pdf, obs in zip(split_pdf, model_obs)]
         # Concatenate all datasets as experiments are one single entity if needed
         ret = concat(output_layers)
         if rotation is not None:
@@ -182,77 +172,71 @@ def experiment_layer(
     # Now create the model for this experiment
     full_nx = sum(dataset_xsizes)
 
-    def loss(y_true, y_pred):
-        return operations.sum(y_pred)
-
     if spec_dict["positivity"]:
         if integrability:
             loss_pos = losses.LossIntegrability(name=spec_name, c=positivity_initial)
         else:
             loss_pos = losses.LossPositivity(name=spec_name, c=positivity_initial)
 
-        def out_positivity(pdf_layer, datasets_out=None):
+        def out_positivity(pdf_layer, mask=None, datasets_out=None):
             exp_result = experiment_layer(pdf_layer)
             return loss_pos(exp_result)
 
         layer_info = {
             "inputs": model_inputs,
             "output_tr": out_positivity,
-            "loss_tr": loss,
             "experiment_xsize": full_nx,
         }
         return layer_info
 
-    invcovmat_tr = spec_dict["invcovmat"]
-    invcovmat_vl = spec_dict["invcovmat_vl"]
-    invcovmat = spec_dict["invcovmat_true"]
-
-    # Prepare the loss function
-    loss_tr = losses.LossInvcovmat(invcovmat_tr, spec_dict["expdata"], name=tr_name)
-    loss_vl = losses.LossInvcovmat(invcovmat_vl, spec_dict["expdata_vl"], name=vl_name)
-    loss_ex = losses.LossInvcovmat(invcovmat, spec_dict["expdata_true"], name=ex_name)
+    # Prepare the loss function, important! the loss function must carry the name of the experiment!
 
     # Generate the loss function and rotations of the final data (if any)
     if spec_dict.get("data_transformation") is not None:
         # TODO: I'm asuming that the diagonal covmat will work ootb, check
         # The rotation is the last layer so it should carry The Name
-        obsrot_tr = ObsRotation(spec_dict.get("data_transformation"), name=tr_name)
-        obsrot_vl = ObsRotation(spec_dict.get("data_transformation_vl"), name=vl_name)
+        obsrot_tr = ObsRotation(spec_dict.get("data_transformation"))
+        obsrot_vl = ObsRotation(spec_dict.get("data_transformation_vl"))
     else:
         obsrot_tr = None
         obsrot_vl = None
 
-    def out_tr(pdf_layer, datasets_out=None):
+
+    # Prepare the inverse covmats for each of the loss functions 
+    # (that are only instantiated when the output layer is created)
+    invcovmat_tr = spec_dict["invcovmat"]
+    invcovmat_vl = spec_dict["invcovmat_vl"]
+    invcovmat = spec_dict["invcovmat_true"]
+
+    def out_tr(pdf_layer, mask=None, datasets_out=None):
+        loss_tr = losses.LossInvcovmat(invcovmat_tr, spec_dict["expdata"], mask, name=spec_name)
         exp_result = experiment_layer(
             pdf_layer,
             model_obs=model_obs_tr,
             concat=concat_tr,
-            datasets_out=datasets_out,
             rotation=obsrot_tr,
         )
         return loss_tr(exp_result)
 
-    def out_vl(pdf_layer, datasets_out=None):
+    def out_vl(pdf_layer, mask=None, datasets_out=None):
+        loss_vl = losses.LossInvcovmat(invcovmat_vl, spec_dict["expdata_vl"], mask, name=f"{spec_name}_val")
         exp_result = experiment_layer(
             pdf_layer,
             model_obs=model_obs_vl,
             concat=concat_vl,
-            datasets_out=datasets_out,
             rotation=obsrot_vl,
         )
         return loss_vl(exp_result)
 
-    def out_exp(pdf_layer, datasets_out=None):
+    def out_exp(pdf_layer, mask=None, datasets_out=None):
+        loss_ex = losses.LossInvcovmat(invcovmat, spec_dict["expdata_true"], mask, name=f"{spec_name}_exp")
         return loss_ex(experiment_layer(pdf_layer))
 
     layer_info = {
         "inputs": model_inputs,
         "output": out_exp,
-        "loss": loss,
         "output_tr": out_tr,
-        "loss_tr": loss,
         "output_vl": out_vl,
-        "loss_vl": loss,
         "experiment_xsize": full_nx,
     }
 
diff --git a/n3fit/src/n3fit/msr.py b/n3fit/src/n3fit/msr.py
index d009e26054..11a19b1bfc 100644
--- a/n3fit/src/n3fit/msr.py
+++ b/n3fit/src/n3fit/msr.py
@@ -39,11 +39,16 @@ def gen_integration_input(nx):
 
 def msr_impose(nx=int(2e3), basis_size=8, mode='All'):
     """
-        This function receives:
-            - fit_layer: the 8-basis layer of PDF which we fit
-            - final_layer: the 14-basis which is fed to the fktable
-        It uses pdf_fit to compute the sum rule and returns a modified version of
-        the final_pdf layer with a normalisation by which the sum rule is imposed
+        Generates a function that applies a normalization layer to the fit.
+        The normalization is computed from the direct output of the NN (so the 7,8-flavours basis)
+        and it is applied to the input of the fktable (i.e., to the 14-flavours fk-basis).
+
+        Parameters
+        ----------
+            nx: int
+                number of points for the integration grid
+            basis_size: int
+                number of flavours output of the NN
     """
     # 1. Generate the fake input which will be used to integrate
     xgrid, weights_array = gen_integration_input(nx)
@@ -61,7 +66,7 @@ def msr_impose(nx=int(2e3), basis_size=8, mode='All'):
     # 5. Make the xgrid array into a backend input layer so it can be given to the normalization
     xgrid_input = operations.numpy_to_input(xgrid)
 
-    # Now parepare a function that takes as input the 8-flavours output of the NN
+    # Now prepare a function that takes as input the 8-flavours output of the NN
     # and the 14-flavours after the fk rotation and returns a 14-flavours normalized output
     # note + TODO:
     # the idea was that the normalization should always be applied at the fktable 14-flavours
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index ec59f9e385..995d5bb6af 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -143,6 +143,8 @@ def performfit(
                 activate some debug options
             maxcores: int
                 maximum number of (logical) cores that the backend should be aware of
+            parallel_models: int
+                number of models to be run in parallel
     """
     from n3fit.backends import set_initial_state
 
@@ -303,9 +305,6 @@ def performfit(
         # After the fit is run we get a 'result' dictionary with the following items:
         stopping_object = result["stopping_object"]
         pdf_models = result["pdf_models"]
-        true_chi2 = result["loss"]
-        training = result["training"]
-        log.info("Total exp chi2: %s", true_chi2)
 
         # Where has the stopping point happened (this is only for debugging purposes)
         print(

From 96bf3b50237fc295b810799ce34fc2db1f8bdf76 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 5 Jan 2021 20:53:51 +0100
Subject: [PATCH 20/27] abstract out the generation of the experimental layers

---
 n3fit/src/n3fit/ModelTrainer.py               |   2 +-
 .../src/n3fit/hyper_optimization/penalties.py |   4 +-
 n3fit/src/n3fit/model_gen.py                  | 173 ++++++++++--------
 3 files changed, 97 insertions(+), 82 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index 755011bbfb..e8dc93084d 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -812,7 +812,7 @@ def hyperparametrizable(self, params):
                     # If the fit failed to fit, no need to add a penalty to the loss
                     break
                 for penalty in self.hyper_penalties:
-                    hyper_loss += penalty(pdf_model, stopping_object)
+                    hyper_loss += penalty(pdf_models[0], stopping_object)
                 l_hyper.append(hyper_loss)
                 log.info("Fold %d finished, loss=%.1f, pass=%s", k + 1, hyper_loss, passed)
                 if hyper_loss > self.hyper_threshold:
diff --git a/n3fit/src/n3fit/hyper_optimization/penalties.py b/n3fit/src/n3fit/hyper_optimization/penalties.py
index 7d3ef24708..b6bad1c8f3 100644
--- a/n3fit/src/n3fit/hyper_optimization/penalties.py
+++ b/n3fit/src/n3fit/hyper_optimization/penalties.py
@@ -83,11 +83,11 @@ def patience(pdf_model, stopping_object, alpha=1e-4):
     3.434143467595683
 
     """
-    epoch_best = stopping_object.e_best_chi2
+    epoch_best = np.take(stopping_object.e_best_chi2, 0)
     patience = stopping_object.stopping_patience
     max_epochs = stopping_object.total_epochs
     diff = abs(max_epochs - patience - epoch_best)
-    vl_loss = stopping_object.vl_chi2
+    vl_loss = np.take(stopping_object.vl_chi2, 0)
     return vl_loss * np.exp(alpha * diff)
 
 
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 4484b42472..00430e8a2f 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -7,6 +7,9 @@
         # pdfNN_layer_generator:
             Generates the PDF NN layer to be fitted
 """
+from dataclasses import dataclass
+import numpy as np
+
 import n3fit.msr as msr_constraints
 from n3fit.layers import DIS, DY, Mask, ObsRotation, losses
 from n3fit.layers import Preprocessing, FkRotation, FlavourToEvolution
@@ -17,6 +20,61 @@
 from n3fit.backends import base_layer_selector, regularizer_selector
 
 
+@dataclass
+class ObservableWrapper:
+    """Wrapper to generate the observable layer once the PDF model is prepared
+    It can take normal datasets or lagrande-multiplier-like datasets
+    (such as positivity or integrability)
+    """
+
+    name: str
+    observables: list
+    dataset_xsizes: list
+    invcovmat: np.array = None
+    multiplier: float = 1.0
+    integrability: bool = False
+    positivity: bool = False
+    data: np.array = None
+    rotation: ObsRotation = None  # only used for diagonal covmat
+
+    def _generate_loss(self, mask=None):
+        """Generates the corresponding loss function depending on the values the wrapper
+        was initialized with"""
+        if self.invcovmat is not None:
+            loss = losses.LossInvcovmat(self.invcovmat, self.data, mask, name=self.name)
+        elif self.positivity:
+            loss = losses.LossPositivity(name=self.name, c=self.multiplier)
+        elif self.integrability:
+            loss = losses.LossIntegrability(name=self.name, c=self.multiplier)
+        return loss
+
+    def _generate_experimental_layer(self, pdf):
+        """ Generates the experimental layer from the PDF """
+        # First split the layer into the different datasets (if needed!)
+        if len(self.dataset_xsizes) > 1:
+            splitting_layer = operations.as_layer(
+                operations.split,
+                op_args=[self.dataset_xsizes],
+                op_kwargs={"axis": 1},
+                name=f"{self.name}_split",
+            )
+            split_pdf = splitting_layer(pdf)
+        else:
+            split_pdf = [pdf]
+        # Every obs gets its share of the split
+        output_layers = [obs(p_pdf) for p_pdf, obs in zip(split_pdf, self.observables)]
+        # Concatenate all datasets (so that experiments are one single entity)
+        ret = operations.concatenate(output_layers, axis=2)
+        if self.rotation is not None:
+            ret = self.rotation(ret)
+        return ret
+
+    def __call__(self, pdf_layer, mask=None):
+        loss_f = self._generate_loss(mask)
+        experiment_prediction = self._generate_experimental_layer(pdf_layer)
+        return loss_f(experiment_prediction)
+
+
 def observable_generator(
     spec_dict, positivity_initial=1.0, integrability=False
 ):  # pylint: disable=too-many-locals
@@ -93,10 +151,7 @@ def observable_generator(
             operation_name,
             name=f"dat_{dataset_name}",
         )
-        if spec_dict["positivity"]:
-            obs_layer_ex = obs_layer_tr
-            obs_layer_vl = obs_layer_tr
-        else:
+        if not spec_dict["positivity"]:
             obs_layer_ex = Obs_Layer(
                 dataset_dict["fktables"],
                 dataset_dict["ex_fktables"],
@@ -109,6 +164,8 @@ def observable_generator(
                 operation_name,
                 name=f"val_{dataset_name}",
             )
+        else:
+            obs_layer_ex = obs_layer_vl = None
 
         # Data transformation might need access to the full array of output data
         # therefore the validation and training layers should point to the full exp
@@ -130,57 +187,17 @@ def observable_generator(
         model_obs_vl.append(obs_layer_vl)
         model_obs_ex.append(obs_layer_ex)
 
-    # Prepare a concatenation as experiments are one single entity formed by many datasets
-    def gen_concat():
-        return operations.as_layer(operations.concatenate, op_kwargs={"axis": 2})
-
-
-    concat_ex = gen_concat()
-    # For data transformation all concatenations are the same
-    if spec_dict.get("data_transformation") is None:
-        concat_tr = gen_concat()
-        concat_vl = gen_concat()
-    else:
-        concat_tr = concat_ex
-        concat_vl = concat_ex
-
-    # creating the experiment as a model turns out to bad for performance
-    def experiment_layer(
-        pdf, model_obs=model_obs_ex, concat=concat_ex, rotation=None, datasets_out=None
-    ):
-        """ By default works with the experiment observable """
-        output_layers = []
-        # First split the pdf layer into the different datasets if needed
-        if len(dataset_xsizes) > 1:
-            splitting_layer = operations.as_layer(
-                operations.split,
-                op_args=[dataset_xsizes],
-                op_kwargs={"axis": 1},
-                name=f"{spec_name}_split",
-            )
-            split_pdf = splitting_layer(pdf)
-        else:
-            split_pdf = [pdf]
-        # every obs gets its share of the split
-        output_layers = [obs(p_pdf) for p_pdf, obs in zip(split_pdf, model_obs)]
-        # Concatenate all datasets as experiments are one single entity if needed
-        ret = concat(output_layers)
-        if rotation is not None:
-            ret = rotation(ret)
-        return ret
-
-    # Now create the model for this experiment
     full_nx = sum(dataset_xsizes)
 
     if spec_dict["positivity"]:
-        if integrability:
-            loss_pos = losses.LossIntegrability(name=spec_name, c=positivity_initial)
-        else:
-            loss_pos = losses.LossPositivity(name=spec_name, c=positivity_initial)
-
-        def out_positivity(pdf_layer, mask=None, datasets_out=None):
-            exp_result = experiment_layer(pdf_layer)
-            return loss_pos(exp_result)
+        out_positivity = ObservableWrapper(
+            spec_name,
+            model_obs_tr,
+            dataset_xsizes,
+            multiplier=positivity_initial,
+            positivity=not integrability,
+            integrability=integrability,
+        )
 
         layer_info = {
             "inputs": model_inputs,
@@ -189,8 +206,6 @@ def out_positivity(pdf_layer, mask=None, datasets_out=None):
         }
         return layer_info
 
-    # Prepare the loss function, important! the loss function must carry the name of the experiment!
-
     # Generate the loss function and rotations of the final data (if any)
     if spec_dict.get("data_transformation") is not None:
         # TODO: I'm asuming that the diagonal covmat will work ootb, check
@@ -201,36 +216,36 @@ def out_positivity(pdf_layer, mask=None, datasets_out=None):
         obsrot_tr = None
         obsrot_vl = None
 
-
-    # Prepare the inverse covmats for each of the loss functions 
+    # Prepare the inverse covmats for each of the loss functions
     # (that are only instantiated when the output layer is created)
     invcovmat_tr = spec_dict["invcovmat"]
     invcovmat_vl = spec_dict["invcovmat_vl"]
     invcovmat = spec_dict["invcovmat_true"]
 
-    def out_tr(pdf_layer, mask=None, datasets_out=None):
-        loss_tr = losses.LossInvcovmat(invcovmat_tr, spec_dict["expdata"], mask, name=spec_name)
-        exp_result = experiment_layer(
-            pdf_layer,
-            model_obs=model_obs_tr,
-            concat=concat_tr,
-            rotation=obsrot_tr,
-        )
-        return loss_tr(exp_result)
-
-    def out_vl(pdf_layer, mask=None, datasets_out=None):
-        loss_vl = losses.LossInvcovmat(invcovmat_vl, spec_dict["expdata_vl"], mask, name=f"{spec_name}_val")
-        exp_result = experiment_layer(
-            pdf_layer,
-            model_obs=model_obs_vl,
-            concat=concat_vl,
-            rotation=obsrot_vl,
-        )
-        return loss_vl(exp_result)
-
-    def out_exp(pdf_layer, mask=None, datasets_out=None):
-        loss_ex = losses.LossInvcovmat(invcovmat, spec_dict["expdata_true"], mask, name=f"{spec_name}_exp")
-        return loss_ex(experiment_layer(pdf_layer))
+    out_tr = ObservableWrapper(
+        spec_name,
+        model_obs_tr,
+        dataset_xsizes,
+        invcovmat=invcovmat_tr,
+        data=spec_dict["expdata"],
+        rotation=obsrot_tr,
+    )
+    out_vl = ObservableWrapper(
+        f"{spec_name}_val",
+        model_obs_vl,
+        dataset_xsizes,
+        invcovmat=invcovmat_vl,
+        data=spec_dict["expdata_vl"],
+        rotation=obsrot_vl,
+    )
+    out_exp = ObservableWrapper(
+        f"{spec_name}_exp",
+        model_obs_ex,
+        dataset_xsizes,
+        invcovmat=invcovmat,
+        data=spec_dict["expdata_true"],
+        rotation=None,
+    )
 
     layer_info = {
         "inputs": model_inputs,

From 99eea82b94171e52769bf93b1e75af96eb9ef486 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Wed, 6 Jan 2021 13:39:57 +0100
Subject: [PATCH 21/27] possible fix for mac

---
 n3fit/src/n3fit/backends/keras_backend/MetaModel.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index 23b3125cd5..de6b855f3d 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -207,7 +207,13 @@ def losses_fun():
         # the tensorflow variable to python primitives or numpy arrays.
         # Undocumented TF function that converts all the tensors from the ret dictionary to numpy arrays
         # if it dissapears, equivalent for us to {k: i.numpy() for k, i in ret.items()}
-        return tf_utils.to_numpy_or_python_type(ret)
+        try:
+            dict_result = tf_utils.to_numpy_or_python_type(ret)
+        except AttributeError:
+            # For TF < 2.2
+            dict_result = {k: i.numpy() for k, i in ret.items()}
+        return dict_result
+
 
     def compile(
         self,

From 0c33dbcf03c3001aa54e5f5893f18b985aa308c9 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Thu, 7 Jan 2021 09:55:05 +0100
Subject: [PATCH 22/27] add some docs

---
 doc/sphinx/source/n3fit/runcard_detailed.rst | 72 ++++++++++++++------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst
index f59137c1af..89cf5bc07f 100644
--- a/doc/sphinx/source/n3fit/runcard_detailed.rst
+++ b/doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -9,8 +9,9 @@ In this section we fine-grain the explanation of the different parameters that e
 - :ref:`networkarch-label`
 - :ref:`optimizer-label`
 - :ref:`positivity-label`
-- :ref:`otheroptions-label`
 - :ref:`tensorboard-label`
+- :ref:`parallel-label`
+- :ref:`otheroptions-label`
 
 
 .. _preprocessing-label:
@@ -206,24 +207,6 @@ Threshold :math:`\chi2`
 - ``threshold_chi2``: sets a maximum validation :math:`\chi2` for the stopping to activate. Avoids (too) early stopping.
 
 
-Save and load weights of the model
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code-block:: yaml
-
-    fitting:
-        save: "weights.h5"
-        load: "weights.h5"
-
-- ``save``: saves the weights of the PDF model in the selected file in the replica folder.
-- ``load``: loads the weights of the PDF model from the selected file.
-
-Since the weights depend only on the architecture of the Neural Network,
-it is possible to save the weights of a Neural Network trained with one set of hyperparameters and experiments
-and load it in a different runcard and continue the training from there.
-
-While the load file is read as an absolute path, the file to save to will be found
-inside the replica folder.
 
 
 .. _tensorboard-label:
@@ -258,3 +241,54 @@ Logging details can be visualized in the browser with the following command:
 Logging details will include the value of the loss for each experiment over time,
 the values of the weights of the NN,
 as well as a detailed analysis of the amount of time that TensorFlow spent on each operation.
+
+          
+.. _parallel-label:
+
+Running fits in parallel
+------------------------
+
+It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models``
+flag in the runcard (by default the number of ``parallel_models`` is set to 1).
+Running in parallel can be quite hard on memory and it is only advantageous when
+fitting on a GPU, where one can find a speed up equal to the number of models run
+in parallel (each model being a different replica).
+
+At present it cannot be used together with the ``hyperopt`` module.
+
+
+.. _otheroptions-label:
+
+Other options
+-------------
+
+Threshold :math:`\chi2`
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    fitting:
+        parameters:
+            threshold_chi2: 4.0
+
+- ``threshold_chi2``: sets a maximum validation :math:`\chi2` for the stopping to activate. Avoids (too) early stopping.
+
+
+Save and load weights of the model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    fitting:
+        save: "weights.h5"
+        load: "weights.h5"
+
+- ``save``: saves the weights of the PDF model in the selected file in the replica folder.
+- ``load``: loads the weights of the PDF model from the selected file.
+
+Since the weights depend only on the architecture of the Neural Network,
+it is possible to save the weights of a Neural Network trained with one set of hyperparameters and experiments
+and load it in a different runcard and continue the training from there.
+
+While the load file is read as an absolute path, the file to save to will be found
+inside the replica folder.

From 40d590a79c4a5eb4c7b2e62a3ef4c189fa06cd67 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 26 Jan 2021 11:42:12 +0100
Subject: [PATCH 23/27] fix the problems with the rebasing

---
 .../n3fit/backends/keras_backend/MetaModel.py    |  7 ++++++-
 n3fit/src/n3fit/io/writer.py                     |  2 +-
 n3fit/src/n3fit/msr.py                           |  3 +--
 n3fit/src/n3fit/performfit.py                    |  5 +----
 n3fit/src/n3fit/stopping.py                      | 12 +-----------
 n3fit/src/n3fit/vpinterface.py                   | 16 ++++++++++++++--
 6 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index de6b855f3d..9b1692f997 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -4,7 +4,7 @@
     Extension of the backend Model class containing some wrappers in order to absorb other
     backend-dependent calls.
 """
-
+import re
 import numpy as np
 import tensorflow as tf
 from tensorflow.keras.models import Model
@@ -334,3 +334,8 @@ def apply_as_layer(self, x):
             # TF 2.0 seems to fail with ValueError when passing a dictionary as an input
             y = x.values()
             return super().__call__(y)
+
+    def get_layer_re(self, regex):
+        """ Get all layers matching the given regular expression """
+        check = lambda x: re.match(regex, x.name)
+        return list(filter(check, self.layers))
diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index 1b28a01de8..f8060119bc 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -69,7 +69,7 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         # Check the directory exist, if it doesn't, generate it
         os.makedirs(replica_path_set, exist_ok=True)
 
-        stop_epoch = self.stopping_object.epoch_of_the_stop
+        stop_epoch = self.stopping_object.stop_epoch
 
         # Get the replica status for this object
         replica_status = self.stopping_object.get_next_replica()
diff --git a/n3fit/src/n3fit/msr.py b/n3fit/src/n3fit/msr.py
index 11a19b1bfc..7bd2cd969f 100644
--- a/n3fit/src/n3fit/msr.py
+++ b/n3fit/src/n3fit/msr.py
@@ -79,12 +79,11 @@ def apply_normalization(layer_fitbasis, layer_pdf):
             layer_fitbasis: output of the NN
             layer_pdf: output for the fktable
         """
-
         pdf_integrand = operations.op_multiply([division_by_x(xgrid_input), layer_fitbasis(xgrid_input)])
         normalization = normalizer(integrator(pdf_integrand))
 
         def ultimate_pdf(x):
-            return operations.op_multiply([layer_pdf(x), normalization])
+            return layer_pdf(x)*normalization
 
         return ultimate_pdf
 
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 995d5bb6af..1f642a02db 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -246,11 +246,8 @@ def performfit(
             kfold_parameters=kfold_parameters,
             max_cores=maxcores,
             model_file=fitting.get("load"),
-<<<<<<< HEAD
-            sum_rules=fitting.get("sum_rules", True)
-=======
+            sum_rules=fitting.get("sum_rules", True),
             parallel_models=parallel_models
->>>>>>> 6e0fc5f2c (fit many models at once)
         )
 
         # This is just to give a descriptive name to the fit function
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index a0e750e403..45b5990406 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -460,18 +460,8 @@ def vl_chi2(self):
 
     @property
     def e_best_chi2(self):
-<<<<<<< HEAD
-        """ Epoch of the best chi2, if there is no best epoch
-        return the last epoch"""
-        be = self.history.best_epoch
-        if be is None:
-            return self.stop_epoch
-        return be
-
-=======
-        """ Epoch of the best chi2 """
+        """ Epoch of the best chi2, if there is no best epoch, return None"""
         return self._history.best_epoch
->>>>>>> 5fb6df564 (many changes to stopping, remove deprecated options)
 
     @property
     def stop_epoch(self):
diff --git a/n3fit/src/n3fit/vpinterface.py b/n3fit/src/n3fit/vpinterface.py
index dd5077e8c5..b4d4a975b6 100644
--- a/n3fit/src/n3fit/vpinterface.py
+++ b/n3fit/src/n3fit/vpinterface.py
@@ -18,12 +18,14 @@
 
 
 """
+import logging
 import numpy as np
 import numpy.linalg as la
 from validphys.core import PDF, MCStats
 from validphys.pdfbases import ALL_FLAVOURS, check_basis
 from validphys.arclength import integrability_number, arc_lengths
 
+log = logging.getLogger(__name__)
 # Order of the evolution basis output from n3fit
 EVOL_LIST = [
     "photon",
@@ -83,13 +85,23 @@ def get_nn_weights(self):
         """Outputs all weights of the NN as numpy.ndarrays """
         return self.model.get_weights()
 
-    def get_preprocessing_factors(self):
+    def get_preprocessing_factors(self, replica=None):
         """Loads the preprocessing alpha and beta arrays from the PDF trained model.
         If a ``fit_basis`` given in the format of ``n3fit`` runcards is given it will be used
         to generate a new dictionary with the names, the exponent and whether they are trainable
         otherwise outputs a Nx2 array where [:,0] are alphas and [:,1] betas
         """
-        preprocessing_layer = self.model.get_layer("pdf_prepro")
+        # If the replica is given, get the requested preprocessing layer
+        # otherwise, search for any pdf_prepro_X layer within the model
+        if replica is not None:
+            preprocessing_layer = self.model.get_layer(f"pdf_prepro_{replica}")
+        else:
+            preprocessing_layers = self.model.get_layer_re("pdf_prepro_\d")
+            if len(preprocessing_layers) != 1:
+                # We really don't want to fail at this point, but print a warning at least...
+                log.warning("More than one preprocessing layer found within the model!")
+            preprocessing_layer = preprocessing_layers[0]
+
         if self.fit_basis is not None:
             output_dictionaries = []
             for d in self.fit_basis:

From 117a9119db663fdb11fd9bb21b633ba73089b8eb Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 26 Jan 2021 12:24:24 +0100
Subject: [PATCH 24/27] fix the tests

---
 n3fit/src/n3fit/io/writer.py      |  2 +-
 n3fit/src/n3fit/stopping.py       | 31 ++++++++++++++++++++-----------
 n3fit/src/n3fit/tests/test_fit.py |  3 +++
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index f8060119bc..48dabd5c2a 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -81,7 +81,7 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
             replica_path_set,
             fitname,
             self.q2,
-            self.stopping_object.e_best_chi2,
+            replica_status.best_epoch,
             vl_chi2,
             tr_chi2,
             true_chi2,
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index 45b5990406..94fb7e8dee 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -249,6 +249,7 @@ def __init__(self, pdf_model):
         self._pdf_model = pdf_model
         self._weights = None
         self._best_epoch = None
+        self._stop_epoch = None
         self._best_vl_chi2 = INITIAL_CHI2
 
     def positivity_pass(self):
@@ -260,8 +261,14 @@ def positivity_pass(self):
 
     @property
     def best_epoch(self):
+        if self._best_epoch is None:
+            return self.stop_epoch
         return self._best_epoch
 
+    @property
+    def stop_epoch(self):
+        return self._stop_epoch
+
     @property
     def best_vl(self):
         return float(self._best_vl_chi2)
@@ -284,9 +291,11 @@ def reload(self):
         if self._weights:
             self._pdf_model.set_weights(self._weights)
 
-    def stop_training(self):
-        """ Stop training this replica """
-        self._pdf_model.trainable = False
+    def stop_training(self, epoch = None):
+        """ Stop training this replica if not stopped before """
+        if self._pdf_model.trainable:
+            self._pdf_model.trainable = False
+            self._stop_epoch = epoch
 
 
 class FitHistory:
@@ -376,16 +385,16 @@ def register(self, epoch, training_info, validation_info):
         self._history.append(fitstate)
         return fitstate
 
-    def stop_training_replica(self, i):
-        """ Stop training replica i """
-        self._replicas[i].stop_training()
+    def stop_training_replica(self, i, e):
+        """ Stop training replica i in epoch e"""
+        self._replicas[i].stop_training(e)
 
     def reload(self):
-        """Reloads the best fit weights into the model
-        if there are models to be reloaded
-        A set of weights can be enforced as an optional argument
+        """Reloads the best fit weights into the model if there are models to be reloaded
+        Ensure that all replicas have stopped at this point.
         """
         for replica in self._replicas:
+            replica.stop_training(self.final_epoch)
             replica.reload()
 
     def __next__(self):
@@ -460,7 +469,7 @@ def vl_chi2(self):
 
     @property
     def e_best_chi2(self):
-        """ Epoch of the best chi2, if there is no best epoch, return None"""
+        """ Epoch of the best chi2, if there is no best epoch, return last"""
         return self._history.best_epoch
 
     @property
@@ -551,7 +560,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
         stop_replicas = self.count & (self.stopping_degree > self.stopping_patience)
         for i in np.where(stop_replicas)[0]:
             self.count[i] = 0
-            self._history.stop_training_replica(i)
+            self._history.stop_training_replica(i, epoch)
 
         # By using the stopping degree we only stop when none of the replicas are improving anymore
         if min(self.stopping_degree) > self.stopping_patience:
diff --git a/n3fit/src/n3fit/tests/test_fit.py b/n3fit/src/n3fit/tests/test_fit.py
index 2c3f509405..b423589ec8 100644
--- a/n3fit/src/n3fit/tests/test_fit.py
+++ b/n3fit/src/n3fit/tests/test_fit.py
@@ -180,3 +180,6 @@ def test_weirdbasis(tmp_path, timing=30):
 #     with pytest.raises(sp.TimeoutExpired):
     with pytest.raises(sp.CalledProcessError):
         sp.run(f"{EXE} {quickcard} {REPLICA}".split(), cwd=tmp_path, timeout=timing, check=True)
+
+
+# test_performfit_and_timing(pathlib.Path("/tmp/random_path"))

From 702a325fd93aae9fd898fe04654cef0a4b40f87b Mon Sep 17 00:00:00 2001
From: Juacrumar <juacrumar@lairen.eu>
Date: Wed, 27 Jan 2021 15:54:18 +0100
Subject: [PATCH 25/27] Update n3fit/src/n3fit/model_gen.py

Co-authored-by: Roy Stegeman <roystegeman@live.nl>
---
 n3fit/src/n3fit/model_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 00430e8a2f..33aa2dffc0 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -23,7 +23,7 @@
 @dataclass
 class ObservableWrapper:
     """Wrapper to generate the observable layer once the PDF model is prepared
-    It can take normal datasets or lagrande-multiplier-like datasets
+    It can take normal datasets or Lagrange-multiplier-like datasets
     (such as positivity or integrability)
     """
 

From 35467800e07c84b7efe80e7e0947a83208d96adf Mon Sep 17 00:00:00 2001
From: Juacrumar <juacrumar@lairen.eu>
Date: Wed, 27 Jan 2021 16:05:39 +0100
Subject: [PATCH 26/27] Update n3fit/src/n3fit/model_gen.py

Co-authored-by: Roy Stegeman <roystegeman@live.nl>
---
 n3fit/src/n3fit/model_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 33aa2dffc0..4554eebdbc 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -450,7 +450,7 @@ def pdfNN_layer_generator(
 
     Returns
     -------
-        model_pdf: n3fit.backends.MetaModel
+        pdf_models: list with a number equal to `parallel_models` of type n3fit.backends.MetaModel
             a model f(x) = y where x is a tensor (1, xgrid, 1) and y a tensor (1, xgrid, out)
     """
     # Parse the input configuration

From b6e8543038f0e638dded2050345a7b474b0f051c Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Thu, 28 Jan 2021 11:51:44 +0100
Subject: [PATCH 27/27] apply comments

---
 n3fit/src/n3fit/ModelTrainer.py               |  1 -
 .../n3fit/backends/keras_backend/MetaModel.py |  2 +-
 n3fit/src/n3fit/checks.py                     |  2 +-
 n3fit/src/n3fit/layers/DIS.py                 |  8 +++----
 n3fit/src/n3fit/model_gen.py                  | 22 +++++++++----------
 n3fit/src/n3fit/msr.py                        | 10 ++++-----
 n3fit/src/n3fit/performfit.py                 |  2 +-
 n3fit/src/n3fit/tests/test_fit.py             |  3 ---
 8 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index e8dc93084d..b5401983e0 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -720,7 +720,6 @@ def hyperparametrizable(self, params):
         threshold_chi2 = params.get("threshold_chi2", CHI2_THRESHOLD)
 
         # Initialize the chi2 dictionaries
-        l_train = []
         l_valid = []
         l_exper = []
         l_hyper = []
diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index 9b1692f997..3bc7862cd1 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -157,7 +157,7 @@ def perform_fit(self, x=None, y=None, epochs=1, **kwargs):
         x = self._parse_input(self.x_in)
         if y is None:
             y = self.target_tensors
-        history = self.fit(x=x, y=y, epochs=epochs, **kwargs)
+        history = super().fit(x=x, y=y, epochs=epochs, **kwargs)
         loss_dict = history.history
         return loss_dict
 
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index 69c5c43290..8c99e631c7 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -355,7 +355,7 @@ def check_consistent_basis(fitting, theoryid):
 
 
 @make_argcheck
-def can_run_in_parallel(fitting, replica, hyperopt, parallel_models=1):
+def can_run_multiple_replicas(fitting, replica, hyperopt, parallel_models=1):
     """ Checks whether a runcard which is trying to run several replicas at once (parallel_models =/= 1) is valid
     """
     rp = len(replica)
diff --git a/n3fit/src/n3fit/layers/DIS.py b/n3fit/src/n3fit/layers/DIS.py
index 570eb11456..1782ece969 100644
--- a/n3fit/src/n3fit/layers/DIS.py
+++ b/n3fit/src/n3fit/layers/DIS.py
@@ -18,8 +18,8 @@ class DIS(Observable):
         the incoming pdf.
 
         The fktable is expected to be rank 3 (ndata, xgrid, flavours)
-        while the input pdf is also rank 3 where the first dimension is the batch dimension
-        (1, xgrid, flavours)
+        while the input pdf is rank 4 where the first dimension is the batch dimension
+        and the last dimension the number of replicas being fitted (1, xgrid, flavours, replicas)
     """
 
     def gen_mask(self, basis):
@@ -50,12 +50,12 @@ def call(self, pdf):
             Parameters
             ----------
                 pdf:  backend tensor
-                    rank 3 tensor (batch_size, xgrid, flavours)
+                    rank 4 tensor (batch_size, xgrid, flavours, replicas)
 
             Returns
             -------
                 result: backend tensor
-                    rank 1 tensor (batchsize, ndata)
+                    rank 3 tensor (batchsize, replicas, ndata)
         """
         # DIS never needs splitting
         if self.splitting is not None:
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 4554eebdbc..fa0c541cef 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -15,7 +15,7 @@
 from n3fit.layers import Preprocessing, FkRotation, FlavourToEvolution
 
 from n3fit.backends import MetaModel, Input
-from n3fit.backends import operations
+from n3fit.backends import operations as op
 from n3fit.backends import MetaLayer, Lambda
 from n3fit.backends import base_layer_selector, regularizer_selector
 
@@ -52,8 +52,8 @@ def _generate_experimental_layer(self, pdf):
         """ Generates the experimental layer from the PDF """
         # First split the layer into the different datasets (if needed!)
         if len(self.dataset_xsizes) > 1:
-            splitting_layer = operations.as_layer(
-                operations.split,
+            splitting_layer = op.as_layer(
+                op.split,
                 op_args=[self.dataset_xsizes],
                 op_kwargs={"axis": 1},
                 name=f"{self.name}_split",
@@ -64,7 +64,7 @@ def _generate_experimental_layer(self, pdf):
         # Every obs gets its share of the split
         output_layers = [obs(p_pdf) for p_pdf, obs in zip(split_pdf, self.observables)]
         # Concatenate all datasets (so that experiments are one single entity)
-        ret = operations.concatenate(output_layers, axis=2)
+        ret = op.concatenate(output_layers, axis=2)
         if self.rotation is not None:
             ret = self.rotation(ret)
         return ret
@@ -113,11 +113,9 @@ def observable_generator(
             a dictionary with:
             - `inputs`: input layer
             - `output`: output layer (unmasked)
-            - `loss` : loss function (unmasked)
             - `output_tr`: output layer (training)
-            - `loss_tr` : loss function (training)
             - `output_vl`: output layer (validation)
-            - `loss_vl` : loss function (validation)
+            - `experiment_xsize` : int (size of the output array) 
     """
     spec_name = spec_dict["name"]
     dataset_xsizes = []
@@ -484,7 +482,7 @@ def pdfNN_layer_generator(
     # If the input is of type (x, logx)
     # create a x --> (x, logx) layer to preppend to everything
     if inp == 2:
-        add_log = Lambda(lambda x: operations.concatenate([x, operations.op_log(x)], axis=-1))
+        add_log = Lambda(lambda x: op.concatenate([x, op.op_log(x)], axis=-1))
 
     # Evolution layer
     layer_evln = FkRotation(input_shape=(last_layer_nodes,), output_dim=out)
@@ -494,10 +492,10 @@ def pdfNN_layer_generator(
 
     # Normalization and sum rules
     if impose_sumrule:
-        sumrule_imposition, integrator_input = msr_constraints.msr_impose(mode=impose_sumrule)
+        sumrule_layer, integrator_input = msr_constraints.msr_impose(mode=impose_sumrule)
         model_input = [integrator_input, placeholder_input]
     else:
-        sumrule_imposition = lambda x: x
+        sumrule_layer = lambda x: x
         integrator_input = None
         model_input = [placeholder_input]
 
@@ -550,7 +548,7 @@ def dense_me(x):
 
         # Apply preprocessing and basis
         def layer_fitbasis(x):
-            ret = operations.op_multiply([dense_me(x), layer_preproc(x)])
+            ret = op.op_multiply([dense_me(x), layer_preproc(x)])
             if basis_rotation.is_identity():
                 # if we don't need to rotate basis we don't want spurious layers
                 return ret
@@ -561,7 +559,7 @@ def layer_pdf(x):
             return layer_evln(layer_fitbasis(x))
 
         # Final PDF
-        final_pdf = sumrule_imposition(layer_fitbasis, layer_pdf)
+        final_pdf = sumrule_layer(layer_fitbasis, layer_pdf)
 
         pdf_model = MetaModel(model_input, final_pdf(placeholder_input), name=f"PDF_{i}")
 
diff --git a/n3fit/src/n3fit/msr.py b/n3fit/src/n3fit/msr.py
index 7bd2cd969f..20c43804cc 100644
--- a/n3fit/src/n3fit/msr.py
+++ b/n3fit/src/n3fit/msr.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 from n3fit.layers import xDivide, MSR_Normalization, xIntegrator
-from n3fit.backends import operations
+from n3fit.backends import operations as op
 from n3fit.backends import MetaModel
 
 
@@ -64,7 +64,7 @@ def msr_impose(nx=int(2e3), basis_size=8, mode='All'):
     normalizer = MSR_Normalization(input_shape=(basis_size,), mode=mode)
 
     # 5. Make the xgrid array into a backend input layer so it can be given to the normalization
-    xgrid_input = operations.numpy_to_input(xgrid)
+    xgrid_input = op.numpy_to_input(xgrid)
 
     # Now prepare a function that takes as input the 8-flavours output of the NN
     # and the 14-flavours after the fk rotation and returns a 14-flavours normalized output
@@ -79,7 +79,7 @@ def apply_normalization(layer_fitbasis, layer_pdf):
             layer_fitbasis: output of the NN
             layer_pdf: output for the fktable
         """
-        pdf_integrand = operations.op_multiply([division_by_x(xgrid_input), layer_fitbasis(xgrid_input)])
+        pdf_integrand = op.op_multiply([division_by_x(xgrid_input), layer_fitbasis(xgrid_input)])
         normalization = normalizer(integrator(pdf_integrand))
 
         def ultimate_pdf(x):
@@ -99,12 +99,12 @@ def check_integration(ultimate_pdf, integration_input):
     """
     nx = int(1e4)
     xgrid, weights_array = gen_integration_input(nx)
-    xgrid_input = operations.numpy_to_input(xgrid)
+    xgrid_input = op.numpy_to_input(xgrid)
 
     multiplier = xDivide(output_dim=14, div_list=range(3, 9))
 
     def pdf_integrand(x):
-        res = operations.op_multiply([multiplier(x), ultimate_pdf(x)])
+        res = op.op_multiply([multiplier(x), ultimate_pdf(x)])
         return res
 
     modelito = MetaModel([xgrid_input, integration_input], pdf_integrand(xgrid_input))
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 1f642a02db..dd912d997e 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -75,7 +75,7 @@ def initialize_seeds(replica: list, trvlseed: int, nnseed: int, mcseed: int, gen
 
 # Action to be called by valid phys
 # All information defining the NN should come here in the "parameters" dict
-@n3fit.checks.can_run_in_parallel
+@n3fit.checks.can_run_multiple_replicas
 @n3fit.checks.check_consistent_basis
 @n3fit.checks.wrapper_check_NN
 @n3fit.checks.wrapper_hyperopt
diff --git a/n3fit/src/n3fit/tests/test_fit.py b/n3fit/src/n3fit/tests/test_fit.py
index b423589ec8..2c3f509405 100644
--- a/n3fit/src/n3fit/tests/test_fit.py
+++ b/n3fit/src/n3fit/tests/test_fit.py
@@ -180,6 +180,3 @@ def test_weirdbasis(tmp_path, timing=30):
 #     with pytest.raises(sp.TimeoutExpired):
     with pytest.raises(sp.CalledProcessError):
         sp.run(f"{EXE} {quickcard} {REPLICA}".split(), cwd=tmp_path, timeout=timing, check=True)
-
-
-# test_performfit_and_timing(pathlib.Path("/tmp/random_path"))