diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst
index f59137c1af..89cf5bc07f 100644
--- a/doc/sphinx/source/n3fit/runcard_detailed.rst
+++ b/doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -9,8 +9,9 @@ In this section we fine-grain the explanation of the different parameters that e
 - :ref:`networkarch-label`
 - :ref:`optimizer-label`
 - :ref:`positivity-label`
-- :ref:`otheroptions-label`
 - :ref:`tensorboard-label`
+- :ref:`parallel-label`
+- :ref:`otheroptions-label`
 
 
 .. _preprocessing-label:
@@ -206,24 +207,6 @@ Threshold :math:`\chi2`
 - ``threshold_chi2``: sets a maximum validation :math:`\chi2` for the stopping to activate. Avoids (too) early stopping.
 
 
-Save and load weights of the model
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code-block:: yaml
-
-    fitting:
-        save: "weights.h5"
-        load: "weights.h5"
-
-- ``save``: saves the weights of the PDF model in the selected file in the replica folder.
-- ``load``: loads the weights of the PDF model from the selected file.
-
-Since the weights depend only on the architecture of the Neural Network,
-it is possible to save the weights of a Neural Network trained with one set of hyperparameters and experiments
-and load it in a different runcard and continue the training from there.
-
-While the load file is read as an absolute path, the file to save to will be found
-inside the replica folder.
 
 
 .. _tensorboard-label:
@@ -258,3 +241,54 @@ Logging details can be visualized in the browser with the following command:
 Logging details will include the value of the loss for each experiment over time,
 the values of the weights of the NN,
 as well as a detailed analysis of the amount of time that TensorFlow spent on each operation.
+
+          
+.. _parallel-label:
+
+Running fits in parallel
+------------------------
+
+It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models``
+flag in the runcard (by default the number of ``parallel_models`` is set to 1).
+Running in parallel can be quite hard on memory and it is only advantageous when
+fitting on a GPU, where one can find a speed up equal to the number of models run
+in parallel (each model being a different replica).
+
+At present it cannot be used together with the ``hyperopt`` module.
+
+
+.. _otheroptions-label:
+
+Other options
+-------------
+
+Threshold :math:`\chi2`
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    fitting:
+        parameters:
+            threshold_chi2: 4.0
+
+- ``threshold_chi2``: sets a maximum validation :math:`\chi2` for the stopping to activate. Avoids (too) early stopping.
+
+
+Save and load weights of the model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    fitting:
+        save: "weights.h5"
+        load: "weights.h5"
+
+- ``save``: saves the weights of the PDF model in the selected file in the replica folder.
+- ``load``: loads the weights of the PDF model from the selected file.
+
+Since the weights depend only on the architecture of the Neural Network,
+it is possible to save the weights of a Neural Network trained with one set of hyperparameters and experiments
+and load it in a different runcard and continue the training from there.
+
+While the load file is read as an absolute path, the file to save to will be found
+inside the replica folder.
diff --git a/n3fit/runcards/Basic_runcard_parallel.yml b/n3fit/runcards/Basic_runcard_parallel.yml
new file mode 100644
index 0000000000..52fdcae4a3
--- /dev/null
+++ b/n3fit/runcards/Basic_runcard_parallel.yml
@@ -0,0 +1,106 @@
+#
+# Configuration file for n3fit
+#
+
+############################################################
+description: Basic runcard
+
+############################################################
+# frac: training fraction
+# ewk: apply ewk k-factors
+# sys: systematics treatment (see systypes)
+dataset_inputs:
+- { dataset: SLACP, frac: 0.5}
+- { dataset: NMC,   frac: 0.5  }
+- { dataset: NMCPD, frac: 0.5 }
+- { dataset: CMSJETS11, frac: 0.5, sys: 10 }
+
+############################################################
+datacuts:
+  t0pdfset     : NNPDF31_nlo_as_0118 # PDF set to generate t0 covmat
+  q2min        : 3.49                # Q2 minimum
+  w2min        : 12.5                # W2 minimum
+  combocuts    : NNPDF31             # NNPDF3.0 final kin. cuts
+  jetptcut_tev : 0                   # jet pt cut for tevatron
+  jetptcut_lhc : 0                   # jet pt cut for lhc
+  wptcut_lhc   : 30.0                # Minimum pT for W pT diff distributions
+  jetycut_tev  : 1e30                # jet rap. cut for tevatron
+  jetycut_lhc  : 1e30                # jet rap. cut for lhc
+  dymasscut_min: 0                   # dy inv.mass. min cut
+  dymasscut_max: 1e30                # dy inv.mass. max cut
+  jetcfactcut  : 1e30                # jet cfact. cut
+
+############################################################
+theory:
+  theoryid: 53        # database id
+
+############################################################
+fitting:
+  trvlseed: 1
+  nnseed: 2
+  mcseed: 3
+
+  genrep: False     # true = generate MC replicas, false = use real data
+
+  parameters: # This defines the parameter dictionary that is passed to the Model Trainer
+    nodes_per_layer: [15, 10, 8]
+    activation_per_layer: ['sigmoid', 'sigmoid', 'linear']
+    initializer: 'glorot_normal'
+    optimizer:
+      optimizer_name: 'RMSprop'
+      learning_rate: 0.01
+      clipnorm: 1.0
+    epochs: 900
+    positivity:
+      multiplier: 1.05 # When any of the multiplier and/or the initial is not set
+      initial: # the poslambda will be used instead to compute these values per dataset
+      threshold: 1e-5
+    stopping_patience: 0.30 # percentage of the number of epochs
+    layer_type: 'dense'
+    dropout: 0.0
+    threshold_chi2: 5.0
+
+  # NN23(QED) = sng=0,g=1,v=2,t3=3,ds=4,sp=5,sm=6,(pht=7)
+  # EVOL(QED) = sng=0,g=1,v=2,v3=3,v8=4,t3=5,t8=6,(pht=7)
+  # EVOLS(QED)= sng=0,g=1,v=2,v8=4,t3=4,t8=5,ds=6,(pht=7)
+  # FLVR(QED) = g=0, u=1, ubar=2, d=3, dbar=4, s=5, sbar=6, (pht=7)
+  fitbasis: NN31IC # EVOL (7), EVOLQED (8), etc.
+  basis:
+      # remeber to change the name of PDF accordingly with fitbasis
+      # pos: True for NN squared
+      # mutsize: mutation size
+      # mutprob: mutation probability
+      # smallx, largex: preprocessing ranges
+      - { fl: sng, pos: False, mutsize: [15], mutprob: [0.05], smallx: [1.05,1.19], largex: [1.47,2.70], trainable: False }
+      - { fl: g,   pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.94,1.25], largex: [0.11,5.87], trainable: False }
+      - { fl: v,   pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.54,0.75], largex: [1.15,2.76], trainable: False }
+      - { fl: v3,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.21,0.57], largex: [1.35,3.08] }
+      - { fl: v8,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.52,0.76], largex: [0.77,3.56], trainable: True }
+      - { fl: t3,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [-0.37,1.52], largex: [1.74,3.39] }
+      - { fl: t8,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.56,1.29], largex: [1.45,3.03] }
+      - { fl: cp,  pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.12,1.19], largex: [1.83,6.70] }
+
+############################################################
+positivity:
+  posdatasets:
+    - { dataset: POSF2U,   poslambda: 1e6 }  # Positivity Lagrange Multiplier
+    - { dataset: POSFLL,   poslambda: 1e4 }
+
+############################################################
+integrability:
+  integdatasets:
+    - {dataset: INTEGXT3,   poslambda: 1e2}
+
+############################################################
+lhagrid:
+  nx  : 150
+  xmin: 1e-9
+  xmed: 0.1
+  xmax: 1.0
+  nq  : 50
+  qmax: 1e5
+
+############################################################
+debug: True
+maxcores: 8
+parallel_models: 2
diff --git a/n3fit/runcards/DIS_diagonal_l2reg_example.yml b/n3fit/runcards/DIS_diagonal_l2reg_example.yml
index f392199f07..7b56a2fdc9 100644
--- a/n3fit/runcards/DIS_diagonal_l2reg_example.yml
+++ b/n3fit/runcards/DIS_diagonal_l2reg_example.yml
@@ -76,7 +76,7 @@ fitting:
     optimizer:
       learning_rate: 1.0
       optimizer_name: 'Adadelta'
-    epochs: 40000
+    epochs: 4000
     positivity:
       multiplier: 1.09
       initial: 10.0
diff --git a/n3fit/src/n3fit/ModelTrainer.py b/n3fit/src/n3fit/ModelTrainer.py
index 1fa73109b8..b5401983e0 100644
--- a/n3fit/src/n3fit/ModelTrainer.py
+++ b/n3fit/src/n3fit/ModelTrainer.py
@@ -9,6 +9,7 @@
     between iterations while at the same time keeping the amount of redundant calls to a minimum
 """
 import logging
+from itertools import zip_longest
 import numpy as np
 import n3fit.model_gen as model_gen
 from n3fit.backends import MetaModel, clear_backend_state, operations, callbacks
@@ -29,101 +30,12 @@
 PUSH_INTEGRABILITY_EACH = 100
 
 
-def _assign_data_to_model(model, data_dict, fold_k=0):
-    """
-    Reads the data dictionary (``data_dict``) and assings the target data to the model
-    It returns a dictionary containing:
-    {
-        'model': the backend.MetaModel to be trained
-        'target_ndata': an array of target output
-        'ndata': the number of datapoints
-        'losses': the list of loss functions of the model
-    }
-
-    If kfolding is active applies the (``fold_k``) fold to the target data.
-    in this case ndata is a count of the non_zero entries of the fold
-
-    Note: this function is transitional. Eventually a validphys action should
-    provide an experiment object with a .target_data(fold_indx) method which
-    should return the necessary information:
-        - number of datapoints
-        - target data with the right entries set to 0*
-    *or masked away if it is able to also return a list of loss functions that will mask away
-    the corresponding entries of the prediction
-
-
-    Parameters
-    ----------
-        model: backend.MetaModel
-            model to be added to the dictionary
-        data_dict: dict
-            dictionary containing: {
-                'expdata' : list of experimental data which the model will target,
-                'folds' : a list (size=expdata) of lists (size=kfolds) with the folding masks)
-                'losses': a list of loss functions for the model
-                }
-        fold_k: int
-            when kfolding, index of the fold, so that for every experiment we apply the
-            folds[index_experiment][mask]
-
-    Returns
-    -------
-        ret: dict
-            dictionary containing the model and its associated ndata and loss
-    """
-    # Array with all data
-    all_data = data_dict["expdata"]
-    # Each element of this list correspond to the set of folds for one experiment
-    all_folds = data_dict["folds"]
-    n_exps = len(all_folds)
-    # Now set to 0 the data folded away
-    active_data = []
-    ndata = 0
-    for exp_data, exp_fold in zip(all_data, all_folds):
-        if exp_fold:
-            mask = exp_fold[fold_k]
-            active_data.append(exp_data * mask)
-            ndata += np.count_nonzero(mask)
-        else:
-            active_data.append(exp_data)
-            ndata += exp_data.size
-    # There might be special outputs (like positivitiy) that is not
-    # affected by the folding.
-    # They don't count for the chi2 (which is only for reporting)
-    active_data += all_data[n_exps:]
-    ret = {
-        "model": model,
-        "target_data": active_data,
-        "ndata": ndata,
-        "losses": data_dict["losses"],
-    }
-    return ret
-
-
-def _model_compilation(models, optimizer_params):
-    """
-        Compiles all models
-
-    Parameters
-    ----------
-        models: list(dict)
-            A ditionary defining the model
-        optimizer_params: dict
-            Optimizer parameters to be passes to the compile method of the model
-    """
-    for _, model_dict in models.items():
-        model = model_dict["model"]
-        target = model_dict["target_data"]
-        losses = model_dict["losses"]
-        model.compile(loss=losses, target_output=target, **optimizer_params)
-
-
-def _pdf_injection(pdf_layers, observables, datasets_out=None):
+def _pdf_injection(pdf_layers, observables, masks):
     """
     Takes as input a list of output layers and returns a corresponding list
     where all output layers call the pdf layer at self.pdf_layer
     """
-    return [f(x, datasets_out=datasets_out) for f, x in zip(observables, pdf_layers)]
+    return [f(x, mask=m) for f, x, m in zip_longest(observables, pdf_layers, masks)]
 
 
 def _LM_initial_and_multiplier(input_initial, input_multiplier, max_lambda, steps):
@@ -174,27 +86,43 @@ def __init__(
         pass_status="ok",
         failed_status="fail",
         debug=False,
-        save_weights_each=False,
         kfold_parameters=None,
         max_cores=None,
-        model_file=None, 
+        model_file=None,
         sum_rules=True,
+        parallel_models=1
     ):
         """
         Parameters
         ----------
-            exp_info: list of dictionaries containing experiments
-            pos_info: list of dictionaries containing positivity sets
-            integ_info: list of dictionaries containing integrability sets
-            flavinfo: the object returned by fitting['basis']
-            nnseed: the seed used to initialise the Neural Network, will be passed to model_gen
-            pass_status: flag to signal a good run
-            failed_status: flag to signal a bad run
-            debug: flag to activate some debug options
-            save_weights_each: if set, save the state of the fit
-                                    every ``save_weights_each`` epochs
-            model_file: str whether to save the models
-            sum_rules: str whether sum rules should be enabled (All, MSR, VSR, False)
+            exp_info: list
+                list of dictionaries containing experiments
+            pos_info: list
+                list of dictionaries containing positivity sets
+            integ_info: list
+                list of dictionaries containing integrability sets
+            flavinfo: list
+                the object returned by fitting['basis']
+            fitbasis: str
+                the name of the basis being fitted
+            nnseed: int
+                the seed used to initialise the Neural Network, will be passed to model_gen
+            pass_status: str
+                flag to signal a good run
+            failed_status: str
+                flag to signal a bad run
+            debug: bool
+                flag to activate some debug options
+            kfold_parameters: dict
+                parameters defining the kfolding method
+            max_cores: int
+                maximum number of cores the fitting can use to run
+            model_file: str
+                whether to save the models
+            sum_rules: str
+		whether sum rules should be enabled (All, MSR, VSR, False)
+            parallel_models: int
+                number of models to fit in parallel
         """
 
         # Save all input information
@@ -211,8 +139,8 @@ def __init__(
         self.pass_status = pass_status
         self.failed_status = failed_status
         self.debug = debug
-        self.save_weights_each = save_weights_each
         self.all_datasets = []
+        self.parallel_models = parallel_models
 
         # Initialise internal variables which define behaviour
         if debug:
@@ -251,7 +179,6 @@ def __init__(
         self.training = {
             "output": [],
             "expdata": [],
-            "losses": [],
             "ndata": 0,
             "model": None,
             "posdatasets": [],
@@ -265,7 +192,6 @@ def __init__(
         self.validation = {
             "output": [],
             "expdata": [],
-            "losses": [],
             "ndata": 0,
             "model": None,
             "folds": [],
@@ -274,13 +200,10 @@ def __init__(
         self.experimental = {
             "output": [],
             "expdata": [],
-            "losses": [],
             "ndata": 0,
             "model": None,
             "folds": [],
         }
-        self.model_dicts = None
-
         self._fill_the_dictionaries()
 
         if self.validation["ndata"] == 0:
@@ -359,33 +282,37 @@ def _fill_the_dictionaries(self):
                 self.training["expdata"].append(integ_dict["expdata"])
                 self.training["integdatasets"].append(integ_dict["name"])
 
-    def _model_generation(self, pdf_model, partition):
+    def _model_generation(self, pdf_models, partition, partition_idx):
         """
         Fills the three dictionaries (``training``, ``validation``, ``experimental``)
-        with the ``model`` entry
-
+        with the ``model`` entry.
 
         Compiles the validation and experimental models with fakes optimizers and learning rate
         as they are never trained, but this is needed by some backends
-        in order to run evaluate on them
+        in order to run evaluate on them.
 
         Before entering this function the dictionaries contain a list of inputs
         and a list of outputs, but they are not connected.
         This function connects inputs with outputs by injecting the PDF.
         At this point we have a PDF model that takes an input (1, None, 1)
-        and outputs in return (1, none, 14)
+        and outputs in return (1, none, 14).
 
         The injection of the PDF is done by concatenating all inputs and calling
         pdf_model on it.
         This in turn generates an output_layer that needs to be splitted for every experiment
         as we have a set of observable "functions" that each take (1, exp_xgrid_size, 14)
         and output (1, masked_ndata) where masked_ndata can be the training/validation
-        or the experimental mask (in which cased masked_ndata == ndata)
+        or the experimental mask (in which cased masked_ndata == ndata).
+
+        Several models can be fitted at once by passing a list of models with a shared input
+        this function will give the same input to every model and will concatenate the output at the end
+        so that the final output of the model is (1, None, 14, n) (with n=number of parallel models)
+
 
         Parameters
         ----------
-            pdf_model: MetaModel
-                model producing pdf values
+            pdf_models: list(n3fit.backend.MetaModel)
+                a list of models that produce PDF values
 
         Returns
         -------
@@ -398,13 +325,23 @@ def _model_generation(self, pdf_model, partition):
         input_arr = np.concatenate(self.input_list, axis=1)
         input_layer = operations.numpy_to_input(input_arr.T)
 
-        # The input to the full model is expected to be the input to the PDF
-        # by reutilizing `pdf_model.parse_input` we ensure any auxiliary input is also accunted fro
-        full_model_input_dict = pdf_model._parse_input([input_layer], pass_content=False)
+        # The trainable part of the model is a concatenation of all PDF models
+        # where each model corresponds to a different replica
+        all_replicas_pdf = []
+
+        for pdf_model in pdf_models:
+            # The input to the full model is expected to be the input to the PDF
+            # by reutilizing `pdf_model.parse_input` we ensure any auxiliary input is also accunted fro
+            full_model_input_dict = pdf_model._parse_input([input_layer], pass_content=False)
+
+            # The output of the pdf on input_layer will be thus a concatenation
+            # of the PDF values for all experiments
+            full_pdf = pdf_model.apply_as_layer([input_layer])
+
+            all_replicas_pdf.append(full_pdf)
+
+        full_pdf_per_replica = operations.stack(all_replicas_pdf, axis=-1)
 
-        # The output of the pdf on input_layer will be thus a concatenation
-        # of the PDF values for all experiments
-        full_pdf = pdf_model.apply_as_layer([input_layer])
         # The input layer is a concatenation of all experiments
         # we need now to split the output on a different array per experiment
         sp_ar = [self.input_sizes]
@@ -412,25 +349,32 @@ def _model_generation(self, pdf_model, partition):
         splitting_layer = operations.as_layer(
             operations.split, op_args=sp_ar, op_kwargs=sp_kw, name="pdf_split"
         )
-        splitted_pdf = splitting_layer(full_pdf)
+        splitted_pdf = splitting_layer(full_pdf_per_replica)
 
         # If we are in a kfolding partition, select which datasets are out
         if partition:
-            kfold_datasets = partition["datasets"]
-            negate_k_datasets = [d for d in self.all_datasets if d not in kfold_datasets]
+            training_mask = [i[partition_idx] for i in self.training["folds"]]
+            validation_mask = [i[partition_idx] for i in self.validation["folds"]]
+            experimental_mask = [i[partition_idx] for i in self.experimental["folds"]]
         else:
-            kfold_datasets = None
-            negate_k_datasets = None
+            training_mask = validation_mask = experimental_mask = [None]
 
         # Training and validation leave out the kofld dataset
         # experiment leaves out the negation
-        output_tr = _pdf_injection(splitted_pdf, self.training["output"], kfold_datasets)
+        output_tr = _pdf_injection(splitted_pdf, self.training["output"], training_mask)
         training = MetaModel(full_model_input_dict, output_tr)
 
-        output_vl = _pdf_injection(splitted_pdf, self.validation["output"], kfold_datasets)
+        # For validation we don't have integrability
+        n_val = len(self.validation["output"])
+        output_vl = _pdf_injection(splitted_pdf[:n_val], self.validation["output"], validation_mask)
         validation = MetaModel(full_model_input_dict, output_vl)
 
-        output_ex = _pdf_injection(splitted_pdf, self.experimental["output"], negate_k_datasets)
+        # And for experimental we don't have positivity
+        # TODO: risky to rely that much in the order
+        n_exps = len(self.experimental["output"])
+        output_ex = _pdf_injection(
+            splitted_pdf[:n_exps], self.experimental["output"], experimental_mask
+        )
 
         experimental = MetaModel(full_model_input_dict, output_ex)
 
@@ -455,7 +399,7 @@ def _reset_observables(self):
         """
         self.input_list = []
         self.input_sizes = []
-        for key in ["output", "losses", "posmultipliers", "integmultipliers"]:
+        for key in ["output", "posmultipliers", "integmultipliers"]:
             self.training[key] = []
             self.validation[key] = []
             self.experimental[key] = []
@@ -511,10 +455,6 @@ def _generate_observables(
             self.validation["output"].append(exp_layer["output_vl"])
             self.experimental["output"].append(exp_layer["output"])
 
-            self.training["losses"].append(exp_layer["loss_tr"])
-            self.validation["losses"].append(exp_layer["loss_vl"])
-            self.experimental["losses"].append(exp_layer["loss"])
-
         # Generate the positivity penalty
         for pos_dict in self.pos_info:
             if not self.mode_hyperopt:
@@ -534,9 +474,7 @@ def _generate_observables(
 
             # The positivity should be on both training and validation models
             self.training["output"].append(pos_layer["output_tr"])
-            self.training["losses"].append(pos_layer["loss_tr"])
             self.validation["output"].append(pos_layer["output_tr"])
-            self.validation["losses"].append(pos_layer["loss_tr"])
 
             self.training["posmultipliers"].append(pos_multiplier)
             self.training["posinitials"].append(pos_initial)
@@ -563,7 +501,6 @@ def _generate_observables(
 
                 # The integrability all falls to the training
                 self.training["output"].append(integ_layer["output_tr"])
-                self.training["losses"].append(integ_layer["loss_tr"])
                 self.training["integmultipliers"].append(integ_multiplier)
                 self.training["integinitials"].append(integ_initial)
 
@@ -610,11 +547,11 @@ def _generate_pdf(
             pdf_model: MetaModel
                 pdf model
         """
-        log.info("Generating PDF model")
+        log.info("Generating PDF models")
 
         # Set the parameters of the NN
         # Generate the NN layers
-        pdf_model = model_gen.pdfNN_layer_generator(
+        pdf_models = model_gen.pdfNN_layer_generator(
             nodes=nodes_per_layer,
             activations=activation_per_layer,
             layer_type=layer_type,
@@ -626,27 +563,9 @@ def _generate_pdf(
             regularizer=regularizer,
             regularizer_args=regularizer_args,
             impose_sumrule=self.impose_sumrule,
+            parallel_models=self.parallel_models,
         )
-        return pdf_model
-
-    def _assign_data(self, models, fold_k=0):
-        """Assign to each model the data to compare with as well as the
-        number of data points in the model.
-        In the most general case training and validation get assigned the replic'd data
-        while experimental gets the actual data.
-        In the kfolding case (i.e, partition != None), they all receive the same data
-        but the folded data is set to 0 for training and validation
-        """
-        training = _assign_data_to_model(models["training"], self.training, fold_k)
-        validation = _assign_data_to_model(models["validation"], self.validation, fold_k)
-        experimental = _assign_data_to_model(models["experimental"], self.experimental, fold_k)
-
-        ret = {
-            "training": training,
-            "validation": validation,
-            "experimental": experimental,
-        }
-        return ret
+        return pdf_models
 
     def _prepare_reporting(self, partition):
         """Parses the information received by the :py:class:`n3fit.ModelTrainer.ModelTrainer`
@@ -697,10 +616,11 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100):
             callbacks=self.callbacks + [callback_st, callback_pos, callback_integ],
         )
 
-        if stopping_object.positivity_pass():
+        # TODO: in order to use multireplica in hyperopt is is necessary to define what "passing" means
+        # for now consider the run as good if any replica passed
+        if any([bool(i) for i in stopping_object.e_best_chi2]):
             return self.pass_status
-        else:
-            return self.failed_status
+        return self.failed_status
 
     def _hyperopt_override(self, params):
         """ Unrolls complicated hyperopt structures into very simple dictionaries"""
@@ -711,8 +631,8 @@ def _hyperopt_override(self, params):
                 for key, value in item.items():
                     params[key] = value
 
-    def enable_tensorboard(self, logdir, weight_freq = 0, profiling=False):
-        """ Enables tensorboard callback for further runs of the fitting procedure 
+    def enable_tensorboard(self, logdir, weight_freq=0, profiling=False):
+        """Enables tensorboard callback for further runs of the fitting procedure
 
         Parameters
         ----------
@@ -723,7 +643,9 @@ def enable_tensorboard(self, logdir, weight_freq = 0, profiling=False):
             profiling: bool
                 flag to enable the tensorboard profiler
         """
-        callback_tb = callbacks.gen_tensorboard_callback(logdir, profiling=profiling, histogram_freq=weight_freq)
+        callback_tb = callbacks.gen_tensorboard_callback(
+            logdir, profiling=profiling, histogram_freq=weight_freq
+        )
         self.callbacks.append(callback_tb)
 
     def evaluate(self, stopping_object):
@@ -741,15 +663,13 @@ def evaluate(self, stopping_object):
             val_chi2 : chi2 of the validation set
             exp_chi2: chi2 of the experimental data (without replica or tr/vl split)
         """
-        if self.model_dicts is None:
+        if self.training["model"] is None:
             raise RuntimeError("Modeltrainer.evaluate was called before any training")
         # Needs to receive a `stopping_object` in order to select the part of the
         # training and the validation which are actually `chi2` and not part of the penalty
-        training = self.model_dicts["training"]
-        experimental = self.model_dicts["experimental"]
-        train_chi2 = stopping_object.evaluate_training(training["model"])
-        val_chi2, _ = stopping_object.validation.loss()
-        exp_chi2 = experimental["model"].compute_losses()["loss"] / experimental["ndata"]
+        train_chi2 = stopping_object.evaluate_training(self.training["model"])
+        val_chi2 = stopping_object.vl_chi2
+        exp_chi2 = self.experimental["model"].compute_losses()["loss"] / self.experimental["ndata"]
         return train_chi2, val_chi2, exp_chi2
 
     def hyperparametrizable(self, params):
@@ -800,7 +720,6 @@ def hyperparametrizable(self, params):
         threshold_chi2 = params.get("threshold_chi2", CHI2_THRESHOLD)
 
         # Initialize the chi2 dictionaries
-        l_train = []
         l_valid = []
         l_exper = []
         l_hyper = []
@@ -814,7 +733,7 @@ def hyperparametrizable(self, params):
                 seed = np.random.randint(0, pow(2, 31))
 
             # Generate the pdf model
-            pdf_model = self._generate_pdf(
+            pdf_models = self._generate_pdf(
                 params["nodes_per_layer"],
                 params["activation_per_layer"],
                 params["initializer"],
@@ -827,12 +746,14 @@ def hyperparametrizable(self, params):
 
             # Model generation joins all the different observable layers
             # together with pdf model generated above
-            models = self._model_generation(pdf_model, partition)
+            models = self._model_generation(pdf_models, partition, k)
 
             # Only after model generation, apply possible weight file
+            # TODO: not sure whether it is a good idea that all of them start at the same point
             if self.model_file:
                 log.info("Applying model file %s", self.model_file)
-                pdf_model.load_weights(self.model_file)
+                for pdf_model in pdf_models:
+                    pdf_model.load_weights(self.model_file)
 
             if k > 0:
                 # Reset the positivity and integrability multipliers
@@ -840,17 +761,12 @@ def hyperparametrizable(self, params):
                 initial_values = self.training["posinitials"] + self.training["posinitials"]
                 models["training"].reset_layer_weights_to(pos_and_int, initial_values)
 
-            # Assign data to each model
-            # model dicts is similar to model but includes information about
-            # the target data and number of points
-            model_dicts = self._assign_data(models, k)
-
             # Generate the list containing reporting info necessary for chi2
             reporting = self._prepare_reporting(partition)
 
             if self.no_validation:
                 # Substitute the validation model with the training model
-                model_dicts["validation"] = model_dicts["training"]
+                models["validation"] = models["training"]
                 validation_model = models["training"]
             else:
                 validation_model = models["validation"]
@@ -860,16 +776,16 @@ def hyperparametrizable(self, params):
             stopping_object = Stopping(
                 validation_model,
                 reporting,
-                pdf_model,
+                pdf_models,
                 total_epochs=epochs,
                 stopping_patience=stopping_epochs,
-                save_weights_each=self.save_weights_each,
                 threshold_positivity=threshold_pos,
-                threshold_chi2=threshold_chi2
+                threshold_chi2=threshold_chi2,
             )
 
             # Compile each of the models with the right parameters
-            _model_compilation(model_dicts, params["optimizer"])
+            for model in models.values():
+                model.compile(**params["optimizer"])
 
             passed = self._train_and_fit(
                 models["training"],
@@ -877,66 +793,65 @@ def hyperparametrizable(self, params):
                 epochs=epochs,
             )
 
-            # Save validation and training chi2
-            training_loss = stopping_object.tr_chi2
-            validation_loss = stopping_object.vl_chi2
+            if self.mode_hyperopt:
+                # TODO: currently only working for one single replica
+                # If doing a hyperparameter scan we need to keep track at this point of the loss function
+                validation_loss = stopping_object.vl_chi2
 
-            # Compute experimental loss
-            exp_loss_raw = models["experimental"].compute_losses()["loss"]
-            experimental_loss = exp_loss_raw / model_dicts["experimental"]["ndata"]
+                # Compute experimental loss
+                exp_loss_raw = np.take(models["experimental"].compute_losses()["loss"], -1)
+                # And divide by the number of active points in this fold
+                # it would be nice to have a ndata_per_fold variable coming in the vp object...
+                ndata = np.sum([np.count_nonzero(i[k]) for i in self.experimental["folds"]])
+                experimental_loss = exp_loss_raw / ndata
 
-            if self.mode_hyperopt:
                 hyper_loss = experimental_loss
                 if passed != self.pass_status:
                     log.info("Hyperparameter combination fail to find a good fit, breaking")
                     # If the fit failed to fit, no need to add a penalty to the loss
                     break
                 for penalty in self.hyper_penalties:
-                    hyper_loss += penalty(pdf_model, stopping_object)
+                    hyper_loss += penalty(pdf_models[0], stopping_object)
                 l_hyper.append(hyper_loss)
-                log.info("Fold %d finished, loss=%.1f, pass=%s", k+1, hyper_loss, passed)
+                log.info("Fold %d finished, loss=%.1f, pass=%s", k + 1, hyper_loss, passed)
                 if hyper_loss > self.hyper_threshold:
-                    log.info("Loss above threshold (%.1f > %.1f), breaking", hyper_loss, self.hyper_threshold)
+                    log.info(
+                        "Loss above threshold (%.1f > %.1f), breaking",
+                        hyper_loss,
+                        self.hyper_threshold,
+                    )
                     # Apply a penalty proportional to the number of folds that have not been computed
                     pen_mul = len(self.kpartitions) - k
                     l_hyper = [i * pen_mul for i in l_hyper]
                     break
 
-            # Save all losses
-            l_train.append(training_loss)
-            l_valid.append(validation_loss)
-            l_exper.append(experimental_loss)
-
-        dict_out = {
-            "status": passed,
-            "training_loss": np.average(l_train),
-            "validation_loss": np.average(l_valid),
-            "experimental_loss": np.average(l_exper),
-        }
+                # Save all losses
+                l_valid.append(validation_loss)
+                l_exper.append(experimental_loss)
 
         if self.mode_hyperopt:
-            dict_out["loss"] = self.hyper_loss(l_hyper)
-            dict_out["kfold_meta"] = {
-                "training_losses": l_train,
-                "validation_losses": l_valid,
-                "experimental_losses": l_exper,
-                "hyper_losses": l_hyper,
+            # Hyperopt needs a dictionary with information about the losses
+            # it is possible to store arbitrary information in the trial file by adding it to this dictionary
+            dict_out = {
+                "status": passed,
+                "loss": self.hyper_loss(l_hyper),
+                "validation_loss": np.average(l_valid),
+                "experimental_loss": np.average(l_exper),
+                "kfold_meta": {
+                    "validation_losses": l_valid,
+                    "experimental_losses": l_exper,
+                    "hyper_losses": l_hyper,
+                },
             }
-            # If we are using hyperopt we don't need to output any other information
             return dict_out
 
-        dict_out["loss"] = experimental_loss
-
-        # Add to the output dictionary things that are needed by performfit.py
-        # to generate the output pdf, check the arc-length, gather stats, etc
-        # some of them are already attributes of the class so they are redundant here
-        # but I think it's good to present them explicitly
-        dict_out["stopping_object"] = stopping_object
-        dict_out["experimental"] = self.experimental
-        dict_out["training"] = self.training
-        dict_out["pdf_model"] = pdf_model
-
-        # Only after the training has finished, we save all models for future reporting
-        self.model_dicts = model_dicts
+        # Keep a reference to the models after training for future reporting
+        self.training["model"] = models["training"]
+        self.experimental["model"] = models["experimental"]
+        self.validation["model"] = models["validation"]
 
+        # In a normal run, the only information we need to output is the stopping object
+        # (which contains metadata about the stopping)
+        # and the pdf models (which are used to generate the PDF grids and compute arclengths)
+        dict_out = {"status": passed, "stopping_object": stopping_object, "pdf_models": pdf_models}
         return dict_out
diff --git a/n3fit/src/n3fit/backends/__init__.py b/n3fit/src/n3fit/backends/__init__.py
index 3370ae75a8..726a752eda 100644
--- a/n3fit/src/n3fit/backends/__init__.py
+++ b/n3fit/src/n3fit/backends/__init__.py
@@ -12,7 +12,6 @@
     regularizer_selector,
     Concatenate,
 )
-from n3fit.backends.keras_backend import losses
 from n3fit.backends.keras_backend import operations
 from n3fit.backends.keras_backend import constraints
 from n3fit.backends.keras_backend import callbacks
diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
index fe17eaec99..3bc7862cd1 100644
--- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
+++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -4,19 +4,13 @@
     Extension of the backend Model class containing some wrappers in order to absorb other
     backend-dependent calls.
 """
-
+import re
 import numpy as np
 import tensorflow as tf
 from tensorflow.keras.models import Model
 from tensorflow.keras import optimizers as Kopt
-from n3fit.backends.keras_backend.operations import numpy_to_tensor
-
-# Check the TF version to check if legacy-mode is needed (TF < 2.2)
-tf_version = tf.__version__.split('.')
-if int(tf_version[0]) == 2 and int(tf_version[1]) < 2:
-    LEGACY = True
-else:
-    LEGACY = False
+from tensorflow.python.keras.utils import tf_utils
+from n3fit.backends.keras_backend import operations as op
 
 # Define in this dictionary new optimizers as well as the arguments they accept
 # (with default values if needed be)
@@ -35,6 +29,12 @@
     v[1]["clipnorm"] = 1.0
 
 
+def _default_loss(y_true, y_pred):
+    """ Default loss to be used when the model is compiled with loss = Null
+    (for instance if the prediction of the model is already the loss """
+    return op.sum(y_pred)
+
+
 def _fill_placeholders(original_input, new_input=None):
     """
     Fills the placeholders of the original input with a new set of input
@@ -113,7 +113,7 @@ def __init__(self, input_tensors, output_tensors, **kwargs):
             # otherwise, put a placeholder None as it will come from the outside
             name = input_tensor.name.rsplit(":",1)[0]
             try:
-                self.x_in[name] = numpy_to_tensor(input_tensor.tensor_content)
+                self.x_in[name] = op.numpy_to_tensor(input_tensor.tensor_content)
                 self.tensors_in[name] = input_tensor
             except AttributeError:
                 self.x_in[name] = None
@@ -122,7 +122,7 @@ def __init__(self, input_tensors, output_tensors, **kwargs):
         self.all_inputs = input_list
         self.all_outputs = output_list
         self.target_tensors = None
-        self.eval_fun = None
+        self.compute_losses_function = None
 
     def _parse_input(self, extra_input=None, pass_content=True):
         """ Returns the input tensors the model was compiled with.
@@ -157,7 +157,7 @@ def perform_fit(self, x=None, y=None, epochs=1, **kwargs):
         x = self._parse_input(self.x_in)
         if y is None:
             y = self.target_tensors
-        history = super().fit(x=x, y=y, epochs=epochs, **kwargs,)
+        history = super().fit(x=x, y=y, epochs=epochs, **kwargs)
         loss_dict = history.history
         return loss_dict
 
@@ -169,45 +169,51 @@ def predict(self, x=None, **kwargs):
 
     def compute_losses(self):
         """
-        This function is the fast-equivalent to the model ``evaluate(x,y)`` method.
+        This function is equivalent to the model ``evaluate(x,y)`` method of most TensorFlow models
+        which return a dictionary of losses per output layer.
+        The losses reported in the ``evaluate`` method for n3fit are, however, summed over replicas.
+        Instead the loss we are interested in is usually the output of the model (i.e., predict)
 
-        On first call it calls ``.evaluate(return_dict=True, verbose=0)`` to force
-        the initialization of the test function.
-        Subsequent calls of this method will (when applicable)
-        directly call the internal evaluation function ``eval_fun``.
-        This bypasses the pre- and post- evaluation steps, resulting in a ~10% speed up
-        with respect to ``.evaluate(...)``
+        This function then generates a dict of partial losses of the model separated per replica.
+        i.e., the output for experiment {'LHC_exp'} will be an array of Nrep elements.
 
         Returns
         -------
             dict
                 a dictionary with all partial losses of the model
         """
-        if self.eval_fun is None:
-            # We still need to perform some initialization
-            if LEGACY:
-                # For TF < 2.2 we need to generate the test_function ourselves
-                self.make_test_function()
-            else:
-                return self.evaluate(return_dict=True, verbose=False)
-        if LEGACY:
-            # For tF < 2.2 we need to force the output to be a float
-            ret = self.eval_fun()
-            ret['loss'] = ret['loss'].numpy()
-            return ret
-        else:
-            return self.eval_fun()
+        # TODO might not work for TF < 2.2
+        if self.compute_losses_function is None:
+            # If it is the first time we are passing through, compile the function and save it
+            out_names = [f"{i}_loss" for i in self.output_names]
+            out_names.insert(0, "loss")
+
+            # Compile a evaluation function
+            @tf.function
+            def losses_fun():
+                predictions = self(self._parse_input(None))
+                # If we only have one dataset the output changes
+                if len(out_names) == 2:
+                    predictions = [predictions]
+                total_loss = tf.reduce_sum(predictions, axis=0)
+                ret = [total_loss] + predictions
+                return dict(zip(out_names, ret))
+
+            self.compute_losses_function = losses_fun
+
+        ret = self.compute_losses_function()
+
+        # The output of this function is to be used by python (and numpy) so we need to convert
+        # the tensorflow variable to python primitives or numpy arrays.
+        # Undocumented TF function that converts all the tensors from the ret dictionary to numpy arrays
+        # if it dissapears, equivalent for us to {k: i.numpy() for k, i in ret.items()}
+        try:
+            dict_result = tf_utils.to_numpy_or_python_type(ret)
+        except AttributeError:
+            # For TF < 2.2
+            dict_result = {k: i.numpy() for k, i in ret.items()}
+        return dict_result
 
-    def evaluate(self, x=None, y=None, **kwargs):
-        """
-        Wrapper around evaluate to take into account the case in which the data is already known
-        when the model is compiled.
-        """
-        x = self._parse_input(self.x_in)
-        if LEGACY and y is None:
-            y = self.target_tensors
-        result = super().evaluate(x=x, y=y, **kwargs)
-        return result
 
     def compile(
         self,
@@ -250,6 +256,9 @@ def compile(
                 f"[MetaModel.select_initializer] optimizer not implemented: {optimizer_name}"
             ) from e
 
+        if loss is None:
+            loss = _default_loss
+
         opt_function = opt_tuple[0]
         opt_args = opt_tuple[1]
 
@@ -263,60 +272,16 @@ def compile(
         # Instantiate the optimizer
         opt = opt_function(**opt_args)
 
-        # If given target output, compile it together with the model for better performance
-        if target_output is not None:
+        # If given target output is None, target_output is unnecesary, save just a zero per output
+        if target_output is None:
+            self.target_tensors = [np.zeros((1,1)) for i in self.output_shape]
+        else:
             if not isinstance(target_output, list):
                 target_output = [target_output]
-            # Tensorize
             self.target_tensors = target_output
 
-        # Reset the evaluation function (if any)
-        self.eval_fun = None
-
         super(MetaModel, self).compile(optimizer=opt, loss=loss)
 
-    def make_test_function(self):
-        """ If the model has been compiled with target data, it creates
-        a specific evaluate function with the target data already evaluated.
-        Otherwise return the normal tensorflow behaviour.
-        """
-        if self.eval_fun is not None:
-            return self.eval_fun
-
-        if self.target_tensors is None:
-            return super().make_test_function()
-
-        # Recover the target tensors and their lengths, we cannot rely
-        # directly on the output from the model as we might have target_tensors
-        # with 0 data points (if the tr/vl mask covers the whole set)
-        lens = []
-        tt = []
-        for target in self.target_tensors:
-            lens.append(target.size)
-            tt.append(numpy_to_tensor(target))
-        # Save target_tensors as tensors, as it might be useful for LEGACY
-        self.target_tensors = tt
-
-        # Get the name of the output layer
-        # and add the suffix _loss to match TF behaviour
-        out_names = [f"{i}_loss" for i in self.output_names]
-        out_names.insert(0, "loss")
-
-        @tf.function
-        def eval_fun(*args):
-            predictions = self(self._parse_input(None))
-            # Concatenate the output to split them again as a list
-            ypred = tf.concat(predictions, axis=-1)
-            predspl = tf.split(ypred, lens, axis=-1)
-            loss_list = [lfun(target, pred) for target, pred, lfun in zip(tt, predspl, self.loss)]
-            ret = [tf.reduce_sum(loss_list)] + loss_list
-            return dict(zip(out_names, ret))
-
-        # Save the function so we don't go through this again
-        self.eval_fun = eval_fun
-
-        return eval_fun
-
     def set_masks_to(self, names, val=0.0):
         """ Set all mask value to the selected value
         Masks in MetaModel should be named {name}_mask
@@ -369,3 +334,8 @@ def apply_as_layer(self, x):
             # TF 2.0 seems to fail with ValueError when passing a dictionary as an input
             y = x.values()
             return super().__call__(y)
+
+    def get_layer_re(self, regex):
+        """ Get all layers matching the given regular expression """
+        check = lambda x: re.match(regex, x.name)
+        return list(filter(check, self.layers))
diff --git a/n3fit/src/n3fit/backends/keras_backend/losses.py b/n3fit/src/n3fit/backends/keras_backend/losses.py
deleted file mode 100644
index c4b427dd68..0000000000
--- a/n3fit/src/n3fit/backends/keras_backend/losses.py
+++ /dev/null
@@ -1,74 +0,0 @@
-"""
-    Module containing a list of loss functions availables to the fitting code
-"""
-
-import tensorflow as tf
-from tensorflow.keras import backend as K
-
-
-def l_invcovmat(invcovmat_np):
-    """
-    Returns a loss function such that:
-    L = \sum_{ij} (yt - yp)_{i} invcovmat_{ij} (yt - yp)_{j}
-    """
-    invcovmat = K.constant(invcovmat_np)
-
-    def true_loss(y_true, y_pred):
-        # (yt - yp) * covmat * (yt - yp)
-        tmp = y_true - y_pred
-        right_dot = tf.tensordot(invcovmat, K.transpose(tmp), axes=1)
-        res = tf.tensordot(tmp, right_dot, axes=1)
-        return tf.reshape(res, (-1,))
-
-    return true_loss
-
-
-def l_positivity(alpha=1e-7):
-    """
-    Returns L = elu(y_pred) (considers y_true as 0)
-
-    The positivity loss is computed by inverting the sign of the
-    datapoints and then applying the elu function, this function is
-        f(x) = x if x > 0
-        f(x) = alpha * (e^{x} - 1) if x < 0
-    This is done to avoid a big discontinuity in the derivative at 0 when
-    the lagrange multiplier is very big.
-    In practice this function can produce results in the range (-alpha, inf)
-    """
-
-    def true_loss(y_true, y_pred):
-        y = -y_pred
-        loss = K.elu(y, alpha=alpha)
-        res = K.sum(loss)
-        return tf.reshape(res, (-1,))
-
-    return true_loss
-
-
-def l_integrability():
-    """
-    Returns (y_pred)*(y_pred)
-    """
-
-    def true_loss(y_true, y_pred):
-        loss = K.square(y_pred)
-        res = K.sum(loss, keepdims=True)
-        return tf.reshape(res, (-1,))
-
-    return true_loss
-
-
-def l_diaginvcovmat(diaginvcovmat_np):
-    """
-    Returns a loss function such that:
-    L = sum_{i} (yt - yp)_{i} invcovmat_{ii} (yt - yp)_{i}
-    diaginvcovmat_np should be 1d
-    """
-    invcovmat = K.constant(diaginvcovmat_np)
-
-    def true_loss(y_true, y_pred):
-        tmp = y_true - y_pred
-        res = tf.tensordot(invcovmat, K.transpose(tmp * tmp), axes=1)
-        return tf.reshape(res, (-1,))
-
-    return true_loss
diff --git a/n3fit/src/n3fit/backends/keras_backend/operations.py b/n3fit/src/n3fit/backends/keras_backend/operations.py
index 5bad7ef570..3e2308e19c 100644
--- a/n3fit/src/n3fit/backends/keras_backend/operations.py
+++ b/n3fit/src/n3fit/backends/keras_backend/operations.py
@@ -234,6 +234,13 @@ def concatenate(tensor_list, axis=-1, target_shape=None, name=None):
         return concatenated_tensor
 
 
+def stack(tensor_list, axis=0, **kwargs):
+    """ Stack a list of tensors
+    see full `docs <https://www.tensorflow.org/api_docs/python/tf/stack>`_
+    """
+    return tf.stack(tensor_list, axis=axis, **kwargs)
+
+
 # Mathematical operations
 def pdf_masked_convolution(raw_pdf, basis_mask):
     """ Computes a masked convolution of two equal pdfs
@@ -243,7 +250,7 @@ def pdf_masked_convolution(raw_pdf, basis_mask):
     Parameters
     ----------
         pdf: tf.tensor
-            rank 3 (batchsize, xgrid, flavours)
+            rank 4 (batchsize, xgrid, flavours, replicas)
         basis_mask: tf.tensor
             rank  2 tensor (flavours, flavours)
             mask to apply to the pdf convolution
@@ -254,11 +261,8 @@ def pdf_masked_convolution(raw_pdf, basis_mask):
             rank3 (len(mask_true), xgrid, xgrid)
     """
     pdf = tf.squeeze(raw_pdf, axis=0)  # remove the batchsize
-    luminosity = tensor_product(pdf, pdf, axes=0)
-    # (xgrid, flavour, xgrid, flavour)
-    # reshape to put the flavour indices at the beginning to apply mask
-    lumi_tmp = K.permute_dimensions(luminosity, (3, 1, 2, 0))
-    pdf_x_pdf = boolean_mask(lumi_tmp, basis_mask)
+    luminosity = tf.einsum('air,bjr->jibar', pdf, pdf)
+    pdf_x_pdf = boolean_mask(luminosity, basis_mask)
     return pdf_x_pdf
 
 
@@ -270,6 +274,13 @@ def tensor_product(*args, **kwargs):
     return tf.tensordot(*args, **kwargs)
 
 
+def einsum(equation, *args, **kwargs):
+    """
+    Computes the tensor product using einsum
+    See full `docs <https://www.tensorflow.org/api_docs/python/tf/einsum>`_
+    """
+    return tf.einsum(equation, *args, **kwargs)
+
 
 @tf.function(experimental_relax_shapes=True)
 def op_log(o_tensor, **kwargs):
@@ -295,6 +306,7 @@ def split(*args, **kwargs):
     """
     return tf.split(*args, **kwargs)
 
+
 def scatter_to_one(values, indices=[[1]], output_dim=14):
     """
     Like scatter_nd initialized to one instead of zero
@@ -302,3 +314,13 @@ def scatter_to_one(values, indices=[[1]], output_dim=14):
     """
     ones = np.ones(output_dim, dtype=np.float32)
     return tf.tensor_scatter_nd_update(ones, indices, values)
+
+
+@tf.function
+def backend_function(fun_name, *args, **kwargs):
+    """
+    Wrapper to call non-explicitly implemented backend functions by name: (``fun_name``)
+    see full `docs <https://keras.io/api/utils/backend_utils/>`_ for some possibilities
+    """
+    fun = getattr(K, fun_name)
+    return fun(*args, **kwargs)
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index b96c2d0117..8c99e631c7 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -352,3 +352,23 @@ def check_consistent_basis(fitting, theoryid):
         raise CheckError(f"{theoryid} (intrinsic charm) is incompatible with basis {fitbasis}")
     if not theoryid.get_description()["IC"] and has_c:
         raise CheckError(f"{theoryid} (perturbative charm) is incompatible with basis {fitbasis}")
+
+
+@make_argcheck
+def can_run_multiple_replicas(fitting, replica, hyperopt, parallel_models=1):
+    """ Checks whether a runcard which is trying to run several replicas at once (parallel_models =/= 1) is valid
+    """
+    rp = len(replica)
+    genrep = fitting.get("genrep")
+    if rp > 1 and not genrep:
+        raise CheckError("Can't run more than one replica at once if no replicas are to be generated")
+    if parallel_models == 1:
+        return
+    if hyperopt:
+        raise CheckError("Running replicas in parallel with hyperopt is still not supported")
+    if genrep:
+        raise CheckError("Replica generation is not supported yet for parallel models")
+    if fitting["parameters"].get("layer_type") != "dense":
+        raise CheckError("Parallelization has only been tested with layer_type=='dense'")
+    if rp > 1:
+        raise CheckError("Parallel mode cannot be used together with multireplica runs")
diff --git a/n3fit/src/n3fit/hyper_optimization/penalties.py b/n3fit/src/n3fit/hyper_optimization/penalties.py
index 7d3ef24708..b6bad1c8f3 100644
--- a/n3fit/src/n3fit/hyper_optimization/penalties.py
+++ b/n3fit/src/n3fit/hyper_optimization/penalties.py
@@ -83,11 +83,11 @@ def patience(pdf_model, stopping_object, alpha=1e-4):
     3.434143467595683
 
     """
-    epoch_best = stopping_object.e_best_chi2
+    epoch_best = np.take(stopping_object.e_best_chi2, 0)
     patience = stopping_object.stopping_patience
     max_epochs = stopping_object.total_epochs
     diff = abs(max_epochs - patience - epoch_best)
-    vl_loss = stopping_object.vl_chi2
+    vl_loss = np.take(stopping_object.vl_chi2, 0)
     return vl_loss * np.exp(alpha * diff)
 
 
diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
index 7ae1fb24a0..48dabd5c2a 100644
--- a/n3fit/src/n3fit/io/writer.py
+++ b/n3fit/src/n3fit/io/writer.py
@@ -69,6 +69,11 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
         # Check the directory exist, if it doesn't, generate it
         os.makedirs(replica_path_set, exist_ok=True)
 
+        stop_epoch = self.stopping_object.stop_epoch
+
+        # Get the replica status for this object
+        replica_status = self.stopping_object.get_next_replica()
+
         # export PDF grid to file
         storefit(
             self.pdf_object,
@@ -76,7 +81,7 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
             replica_path_set,
             fitname,
             self.q2,
-            self.stopping_object.e_best_chi2,
+            replica_status.best_epoch,
             vl_chi2,
             tr_chi2,
             true_chi2,
@@ -84,26 +89,33 @@ def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2):
             integrability_numbers,
             allchi2_lines,
             preproc_lines,
-            self.stopping_object.positivity_status(),
+            replica_status.positivity_status,
             self.timings,
         )
 
-        # TODO: compute the chi2s directly from the stopping object
         # export all metadata from the fit to a single yaml file
         output_file = f"{replica_path_set}/{fitname}.json"
         json_dict = jsonfit(
-            self.stopping_object, self.pdf_object, tr_chi2, vl_chi2, true_chi2, self.timings
+            replica_status, self.pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, self.timings
         )
         with open(output_file, "w") as fs:
-            json.dump(json_dict, fs, indent=2)
+            json.dump(json_dict, fs, indent=2, cls = SuperEncoder)
+
+
+class SuperEncoder(json.JSONEncoder):
+    """ Custom json encoder to get around the fact that np.float32 =/= float """
+    def default(self, o):
+        if isinstance(o, np.float32):
+            return float(o)
+        return super().default(o)
 
 
-def jsonfit(stopping_object, pdf_object, tr_chi2, vl_chi2, true_chi2, timing):
+def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, timing):
     """Generates a dictionary containing all relevant metadata for the fit
 
     Parameters
     ----------
-        stopping_object: n3fit.stopping.Stopping
+        replica_status: n3fit.stopping.ReplicaBest
             a stopping.Validation object
         pdf_object: n3fit.vpinterface.N3PDF
             N3PDF object constructed from the pdf_model
@@ -114,6 +126,8 @@ def jsonfit(stopping_object, pdf_object, tr_chi2, vl_chi2, true_chi2, timing):
             chi2 for the validation
         true_chi2: float
             chi2 for the exp (unreplica'd data)
+        epoch_stop: int
+            epoch at which the stopping stopped (not the one for the best fit!)
         timing: dict
             dictionary of the timing of the different events that happened
     """
@@ -121,12 +135,12 @@ def jsonfit(stopping_object, pdf_object, tr_chi2, vl_chi2, true_chi2, timing):
     # Generate preprocessing information
     all_info["preprocessing"] = pdf_object.get_preprocessing_factors()
     # .fitinfo-like info
-    all_info["stop_epoch"] = stopping_object.stop_epoch
-    all_info["best_epoch"] = stopping_object.e_best_chi2
+    all_info["stop_epoch"] = stop_epoch
+    all_info["best_epoch"] = replica_status.best_epoch
     all_info["erf_tr"] = tr_chi2
     all_info["erf_vl"] = vl_chi2
     all_info["chi2"] = true_chi2
-    all_info["pos_state"] = stopping_object.positivity_status()
+    all_info["pos_state"] = replica_status.positivity_status
     all_info["arc_lengths"] = pdf_object.compute_arclength().tolist()
     all_info["integrability"] = pdf_object.integrability_numbers().tolist()
     all_info["timing"] = timing
diff --git a/n3fit/src/n3fit/layers/DIS.py b/n3fit/src/n3fit/layers/DIS.py
index 570eb11456..1782ece969 100644
--- a/n3fit/src/n3fit/layers/DIS.py
+++ b/n3fit/src/n3fit/layers/DIS.py
@@ -18,8 +18,8 @@ class DIS(Observable):
         the incoming pdf.
 
         The fktable is expected to be rank 3 (ndata, xgrid, flavours)
-        while the input pdf is also rank 3 where the first dimension is the batch dimension
-        (1, xgrid, flavours)
+        while the input pdf is rank 4 where the first dimension is the batch dimension
+        and the last dimension the number of replicas being fitted (1, xgrid, flavours, replicas)
     """
 
     def gen_mask(self, basis):
@@ -50,12 +50,12 @@ def call(self, pdf):
             Parameters
             ----------
                 pdf:  backend tensor
-                    rank 3 tensor (batch_size, xgrid, flavours)
+                    rank 4 tensor (batch_size, xgrid, flavours, replicas)
 
             Returns
             -------
                 result: backend tensor
-                    rank 1 tensor (batchsize, ndata)
+                    rank 3 tensor (batchsize, replicas, ndata)
         """
         # DIS never needs splitting
         if self.splitting is not None:
diff --git a/n3fit/src/n3fit/layers/DY.py b/n3fit/src/n3fit/layers/DY.py
index 0cacff7897..5d714731ea 100644
--- a/n3fit/src/n3fit/layers/DY.py
+++ b/n3fit/src/n3fit/layers/DY.py
@@ -30,12 +30,12 @@ def call(self, pdf_raw):
         Parameters
         ----------
             pdf_in: tensor
-                rank 3 tensor (batchsize, xgrid, flavours)
+                rank 4 tensor (batchsize, xgrid, flavours, replicas)
 
         Returns
         -------
             results: tensor
-                rank 2 tensor (batchsize, ndata)
+                rank 3 tensor (batchsize, replicas, ndata)
         """
         # Hadronic observables might need splitting of the input pdf in the x dimension
         # so we have 3 different paths for this layer
@@ -60,5 +60,5 @@ def call(self, pdf_raw):
                 results.append(res)
 
         # the masked convolution removes the batch dimension
-        ret = self.operation(results)
+        ret = op.transpose(self.operation(results))
         return op.batchit(ret)
diff --git a/n3fit/src/n3fit/layers/Rotations.py b/n3fit/src/n3fit/layers/Rotations.py
index 0933d0d0d0..c5f08a1a49 100644
--- a/n3fit/src/n3fit/layers/Rotations.py
+++ b/n3fit/src/n3fit/layers/Rotations.py
@@ -64,9 +64,9 @@ class FkRotation(MetaLayer):
     # TODO: Generate a rotation matrix in the input and just do tf.tensordot in call
     # the matrix should be: (8, 14) so that we can just do tf.tensordot(pdf, rotmat, axes=1)
     # i.e., create the matrix and inherit from the Rotation layer above
-    def __init__(self, output_dim=14, **kwargs):
+    def __init__(self, output_dim=14, name="evolution", **kwargs):
         self.output_dim = output_dim
-        super().__init__(**kwargs, name="evolution")
+        super().__init__(name=name, **kwargs)
 
     def call(self, pdf_raw):
         # Transpose the PDF so that the flavour index is the first one
diff --git a/n3fit/src/n3fit/layers/losses.py b/n3fit/src/n3fit/layers/losses.py
new file mode 100644
index 0000000000..78b7f3e0ea
--- /dev/null
+++ b/n3fit/src/n3fit/layers/losses.py
@@ -0,0 +1,143 @@
+"""
+    Module containg the losses to be apply to the models as layers
+
+    The layer take the input from the model and acts on it producing a score function.
+    For instance, in the case of the chi2 (``LossInvcovmat``) the function takes only
+    the prediction of the model and, during instantiation, took the real data to compare with
+    and the covmat.
+
+"""
+import numpy as np
+from n3fit.backends import MetaLayer
+from n3fit.backends import operations as op
+
+
+class LossInvcovmat(MetaLayer):
+    """
+    Loss function such that:
+    L = \sum_{ij} (yt - yp)_{i} invcovmat_{ij} (yt - yp)_{j}
+
+    Takes as argument the inverse of the covmat and the target data.
+    It also takes an optional argument to mask part of the predictions
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from n3fit.layers import losses
+    >>> C = np.random.rand(5,5)
+    >>> data = np.random.rand(1, 1, 5)
+    >>> pred = np.random.rand(1, 1, 5)
+    >>> invC = np.linalg.inv( C @ C.T)
+    >>> loss_f = losses.LossInvcovmat(invC, data)
+    >>> loss_f(pred).shape == 1
+    True
+    """
+
+    def __init__(self, invcovmat, y_true, mask=None, **kwargs):
+        # If we have a diagonal matrix, padd with 0s and hope it's not too heavy on memory
+        if len(invcovmat.shape) == 1:
+            invcovmat = np.diag(invcovmat)
+        self.invcovmat = op.numpy_to_tensor(invcovmat)
+        self.y_true = op.numpy_to_tensor(y_true)
+        if mask is None or all(mask):
+            self.mask = None
+        else:
+            mask = np.array(mask, dtype=np.float32).reshape(1,1,-1)
+            self.mask = op.numpy_to_tensor(mask)
+        super().__init__(**kwargs)
+
+    def call(self, y_pred, **kwargs):
+        tmp = self.y_true - y_pred
+        if self.mask is not None:
+            tmp = op.op_multiply([tmp, self.mask])
+        res = op.einsum("bri, ij, brj -> r", tmp, self.invcovmat, tmp)
+        return res
+
+
+class LossLagrange(MetaLayer):
+    """
+    Abstract loss function to apply lagrange multipliers to a model.
+
+        L = \lambda * f(y)
+
+    The form of f(y) is given by modifying the ``apply_loss`` method.
+    It is possible to modify how the multiplication of the lambda factor is implemented
+    by modifying the ``apply_multiplier`` method.
+
+    The (non trainable) weight containing the multiplier is named ``lagMult``.
+    """
+
+    def __init__(self, c=1.0, **kwargs):
+        self._initial_multiplier = c
+        super().__init__(**kwargs)
+
+    def build(self, input_shape):
+        multiplier = MetaLayer.init_constant(self._initial_multiplier)
+        self.kernel = self.builder_helper("lagMult", (1,), multiplier, trainable=False)
+        super().build(input_shape)
+
+    def apply_multiplier(self, y):
+        return self.kernel * y
+
+    def apply_loss(self, y):
+        return y
+
+    def call(self, y_pred, **kwargs):
+        y = self.apply_multiplier(y_pred)
+        return self.apply_loss(y)
+
+
+class LossPositivity(LossLagrange):
+    """
+    Returns L = \lambda*elu(y_pred)
+
+    The positivity loss is computed by inverting the sign of the
+    datapoints and then applying the elu function, this function is
+        f(x) = x if x > 0
+        f(x) = alpha * (e^{x} - 1) if x < 0
+    This is done to avoid a big discontinuity in the derivative at 0 when
+    the lagrange multiplier is very big.
+    In practice this function can produce results in the range (-alpha, inf)
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from n3fit.layers import losses
+    >>> pred = np.random.rand(1, 1, 5)
+    >>> alpha = 1e-7
+    >>> c = 1e8
+    >>> loss_f = losses.LossPositivity(c=c, alpha=alpha)
+    >>> loss_f(pred) == -5*alpha
+    True
+    >>> loss_f(-pred) > c
+    True
+    """
+
+    def __init__(self, alpha=1e-7, **kwargs):
+        self.alpha = alpha
+        super().__init__(**kwargs)
+
+    def apply_loss(self, y_pred):
+        loss = op.backend_function("elu", -y_pred, alpha=self.alpha)
+        # Sum over the batch and the datapoints
+        return op.sum(loss, axis=[0, -1])
+
+
+class LossIntegrability(LossLagrange):
+    """
+    Returns L = (y_pred)*(y_pred)
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from n3fit.layers import losses
+    >>> pred = np.random.rand(1, 1, 5)
+    >>> loss_f = losses.LossIntegrability(c=1e2)
+    >>> loss_f(pred) > 0
+    True
+    """
+
+    def apply_loss(self, y_pred):
+        y = op.backend_function("square", y_pred)
+        # Sum over the batch and the datapoints
+        return op.sum(y, axis=[0, -1])
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 03857e4fbd..fa0c541cef 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -7,18 +7,77 @@
         # pdfNN_layer_generator:
             Generates the PDF NN layer to be fitted
 """
+from dataclasses import dataclass
+import numpy as np
+
 import n3fit.msr as msr_constraints
-from n3fit.layers import DIS, DY, Mask, ObsRotation
+from n3fit.layers import DIS, DY, Mask, ObsRotation, losses
 from n3fit.layers import Preprocessing, FkRotation, FlavourToEvolution
 
 from n3fit.backends import MetaModel, Input
-from n3fit.backends import operations
-from n3fit.backends import losses
+from n3fit.backends import operations as op
 from n3fit.backends import MetaLayer, Lambda
 from n3fit.backends import base_layer_selector, regularizer_selector
 
 
-def observable_generator(spec_dict, positivity_initial=1.0, integrability=False):  # pylint: disable=too-many-locals
+@dataclass
+class ObservableWrapper:
+    """Wrapper to generate the observable layer once the PDF model is prepared
+    It can take normal datasets or Lagrange-multiplier-like datasets
+    (such as positivity or integrability)
+    """
+
+    name: str
+    observables: list
+    dataset_xsizes: list
+    invcovmat: np.array = None
+    multiplier: float = 1.0
+    integrability: bool = False
+    positivity: bool = False
+    data: np.array = None
+    rotation: ObsRotation = None  # only used for diagonal covmat
+
+    def _generate_loss(self, mask=None):
+        """Generates the corresponding loss function depending on the values the wrapper
+        was initialized with"""
+        if self.invcovmat is not None:
+            loss = losses.LossInvcovmat(self.invcovmat, self.data, mask, name=self.name)
+        elif self.positivity:
+            loss = losses.LossPositivity(name=self.name, c=self.multiplier)
+        elif self.integrability:
+            loss = losses.LossIntegrability(name=self.name, c=self.multiplier)
+        return loss
+
+    def _generate_experimental_layer(self, pdf):
+        """ Generates the experimental layer from the PDF """
+        # First split the layer into the different datasets (if needed!)
+        if len(self.dataset_xsizes) > 1:
+            splitting_layer = op.as_layer(
+                op.split,
+                op_args=[self.dataset_xsizes],
+                op_kwargs={"axis": 1},
+                name=f"{self.name}_split",
+            )
+            split_pdf = splitting_layer(pdf)
+        else:
+            split_pdf = [pdf]
+        # Every obs gets its share of the split
+        output_layers = [obs(p_pdf) for p_pdf, obs in zip(split_pdf, self.observables)]
+        # Concatenate all datasets (so that experiments are one single entity)
+        ret = op.concatenate(output_layers, axis=2)
+        if self.rotation is not None:
+            ret = self.rotation(ret)
+        return ret
+
+    def __call__(self, pdf_layer, mask=None):
+        loss_f = self._generate_loss(mask)
+        experiment_prediction = self._generate_experimental_layer(pdf_layer)
+        return loss_f(experiment_prediction)
+
+
+def observable_generator(
+    spec_dict, positivity_initial=1.0, integrability=False
+):  # pylint: disable=too-many-locals
     """
     This function generates the observable model for each experiment.
     These are models which takes as input a PDF tensor (1 x size_of_xgrid x flavours) and outputs
@@ -54,11 +113,9 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
             a dictionary with:
             - `inputs`: input layer
             - `output`: output layer (unmasked)
-            - `loss` : loss function (unmasked)
             - `output_tr`: output layer (training)
-            - `loss_tr` : loss function (training)
             - `output_vl`: output layer (validation)
-            - `loss_vl` : loss function (validation)
+            - `experiment_xsize` : int (size of the output array) 
     """
     spec_name = spec_dict["name"]
     dataset_xsizes = []
@@ -92,10 +149,7 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
             operation_name,
             name=f"dat_{dataset_name}",
         )
-        if spec_dict["positivity"]:
-            obs_layer_ex = obs_layer_tr
-            obs_layer_vl = obs_layer_tr
-        else:
+        if not spec_dict["positivity"]:
             obs_layer_ex = Obs_Layer(
                 dataset_dict["fktables"],
                 dataset_dict["ex_fktables"],
@@ -108,6 +162,8 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
                 operation_name,
                 name=f"val_{dataset_name}",
             )
+        else:
+            obs_layer_ex = obs_layer_vl = None
 
         # Data transformation might need access to the full array of output data
         # therefore the validation and training layers should point to the full exp
@@ -129,119 +185,71 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         model_obs_vl.append(obs_layer_vl)
         model_obs_ex.append(obs_layer_ex)
 
-    # Prepare a concatenation as experiments are one single entity formed by many datasets
-    def gen_concat(name):
-        return operations.as_layer(operations.concatenate, op_kwargs={"axis": 1}, name=name)
-
-    # Tensorflow operations have ugly name,
-    # we want the final observables to be named just {spec_name} (with'val/exp' if needed)
-    tr_name = spec_name
-    vl_name = f"{spec_name}_val"
-    ex_name = f"{spec_name}_exp"
-    concat_ex = gen_concat(ex_name)
-    # For data transformation all concatenations are the same
-    if spec_dict.get("data_transformation") is None:
-        concat_tr = gen_concat(tr_name)
-        concat_vl = gen_concat(vl_name)
-    else:
-        concat_tr = concat_ex
-        concat_vl = concat_ex
-
-    # creating the experiment as a model turns out to bad for performance
-    def experiment_layer(pdf, model_obs=model_obs_ex, concat=concat_ex, rotation=None, datasets_out=None):
-        """ By default works with the experiment observable """
-        output_layers = []
-        # First split the pdf layer into the different datasets if needed
-        if len(dataset_xsizes) > 1:
-            splitting_layer = operations.as_layer(
-                operations.split,
-                op_args=[dataset_xsizes],
-                op_kwargs={"axis": 1},
-                name=f"{spec_name}_split",
-            )
-            split_pdf = splitting_layer(pdf)
-        else:
-            split_pdf = [pdf]
-        # every obs gets its share of the split
-        for partial_pdf, obs in zip(split_pdf, model_obs):
-            obs_output = obs(partial_pdf)
-            if datasets_out and obs.name[4:] in datasets_out:
-                mask_out = Mask(c=0.0, name=f"zero_{obs.name}")
-                obs_output = mask_out(obs_output)
-            output_layers.append(obs_output)
-        # Concatenate all datasets as experiments are one single entity if needed
-        ret = concat(output_layers)
-        if rotation is not None:
-            ret = rotation(ret)
-        return ret
-
-    # Now create the model for this experiment
     full_nx = sum(dataset_xsizes)
 
     if spec_dict["positivity"]:
-        out_mask = Mask(
-            c=positivity_initial,
-            axis=1,
-            name=spec_name,
+        out_positivity = ObservableWrapper(
+            spec_name,
+            model_obs_tr,
+            dataset_xsizes,
+            multiplier=positivity_initial,
+            positivity=not integrability,
+            integrability=integrability,
         )
 
-        def out_positivity(pdf_layer, datasets_out=None):
-            exp_result = experiment_layer(pdf_layer)
-            return out_mask(exp_result)
-
-        if integrability:
-            loss = losses.l_integrability()
-        else:
-            loss = losses.l_positivity()
-
         layer_info = {
             "inputs": model_inputs,
             "output_tr": out_positivity,
-            "loss_tr": loss,
             "experiment_xsize": full_nx,
         }
         return layer_info
 
-    invcovmat_tr = spec_dict["invcovmat"]
-    invcovmat_vl = spec_dict["invcovmat_vl"]
-    invcovmat = spec_dict["invcovmat_true"]
-
     # Generate the loss function and rotations of the final data (if any)
     if spec_dict.get("data_transformation") is not None:
+        # TODO: I'm asuming that the diagonal covmat will work ootb, check
         # The rotation is the last layer so it should carry The Name
-        obsrot_tr = ObsRotation(spec_dict.get("data_transformation"), name=tr_name)
-        obsrot_vl = ObsRotation(spec_dict.get("data_transformation_vl"), name=vl_name)
-        loss_tr = losses.l_diaginvcovmat(invcovmat_tr)
-        loss_vl = losses.l_diaginvcovmat(invcovmat_vl)
+        obsrot_tr = ObsRotation(spec_dict.get("data_transformation"))
+        obsrot_vl = ObsRotation(spec_dict.get("data_transformation_vl"))
     else:
         obsrot_tr = None
         obsrot_vl = None
-        loss_tr = losses.l_invcovmat(invcovmat_tr)
-        # TODO At this point we need to intercept the data and compile the loss with it
-        # then the validation must have a list of None as an output
-        loss_vl = losses.l_invcovmat(invcovmat_vl)
-    loss = losses.l_invcovmat(invcovmat)
-
-    def out_tr(pdf_layer, datasets_out=None):
-        exp_result = experiment_layer(
-            pdf_layer, model_obs=model_obs_tr, concat=concat_tr, datasets_out=datasets_out, rotation=obsrot_tr
-        )
-        return exp_result
 
-    def out_vl(pdf_layer, datasets_out=None):
-        exp_result = experiment_layer(
-            pdf_layer, model_obs=model_obs_vl, concat=concat_vl, datasets_out=datasets_out, rotation=obsrot_vl
-        )
-        return exp_result
+    # Prepare the inverse covmats for each of the loss functions
+    # (that are only instantiated when the output layer is created)
+    invcovmat_tr = spec_dict["invcovmat"]
+    invcovmat_vl = spec_dict["invcovmat_vl"]
+    invcovmat = spec_dict["invcovmat_true"]
+
+    out_tr = ObservableWrapper(
+        spec_name,
+        model_obs_tr,
+        dataset_xsizes,
+        invcovmat=invcovmat_tr,
+        data=spec_dict["expdata"],
+        rotation=obsrot_tr,
+    )
+    out_vl = ObservableWrapper(
+        f"{spec_name}_val",
+        model_obs_vl,
+        dataset_xsizes,
+        invcovmat=invcovmat_vl,
+        data=spec_dict["expdata_vl"],
+        rotation=obsrot_vl,
+    )
+    out_exp = ObservableWrapper(
+        f"{spec_name}_exp",
+        model_obs_ex,
+        dataset_xsizes,
+        invcovmat=invcovmat,
+        data=spec_dict["expdata_true"],
+        rotation=None,
+    )
 
     layer_info = {
         "inputs": model_inputs,
-        "output": experiment_layer,
-        "loss": loss,
+        "output": out_exp,
         "output_tr": out_tr,
-        "loss_tr": loss_tr,
         "output_vl": out_vl,
-        "loss_vl": loss_vl,
         "experiment_xsize": full_nx,
     }
 
@@ -360,6 +368,7 @@ def pdfNN_layer_generator(
     regularizer=None,
     regularizer_args=None,
     impose_sumrule=False,
+    parallel_models=1,
 ):  # pylint: disable=too-many-locals
     """
     Generates the PDF model which takes as input a point in x (from 0 to 1)
@@ -439,9 +448,10 @@ def pdfNN_layer_generator(
 
     Returns
     -------
-        model_pdf: n3fit.backends.MetaModel
+        pdf_models: list with a number equal to `parallel_models` of type n3fit.backends.MetaModel
             a model f(x) = y where x is a tensor (1, xgrid, 1) and y a tensor (1, xgrid, out)
     """
+    # Parse the input configuration
     if nodes is None:
         nodes = [15, 8]
     ln = len(nodes)
@@ -462,84 +472,97 @@ def pdfNN_layer_generator(
         raise ValueError(
             "Number of activation functions does not match number of layers @ model_gen.py"
         )
-    # The number of nodes in the last layer is equal to the number of fitted flavours (== len(flav_info))
     last_layer_nodes = nodes[-1]
 
-    if layer_type == "dense":
-        reg = regularizer_selector(regularizer, **regularizer_args)
-        list_of_pdf_layers = generate_dense_network(
-            inp,
-            nodes,
-            activations,
-            initializer_name,
-            seed=seed,
-            dropout_rate=dropout,
-            regularizer=reg,
-        )
-    elif layer_type == "dense_per_flavour":
-        # Define the basis size attending to the last layer in the network
-        # TODO: this information should come from the basis information
-        #       once the basis information is passed to this class
-        list_of_pdf_layers = generate_dense_per_flavour_network(
-            inp, nodes, activations, initializer_name, seed=seed, basis_size=last_layer_nodes,
-        )
+    # Generate the generic layers
+
+    # Prepare the input for the PDF model
+    placeholder_input = Input(shape=(None, 1), batch_size=1)
 
     # If the input is of type (x, logx)
     # create a x --> (x, logx) layer to preppend to everything
     if inp == 2:
-        add_log = Lambda(lambda x: operations.concatenate([x, operations.op_log(x)], axis=-1))
-
-    def dense_me(x):
-        """Takes an input tensor `x` and applies all layers
-        from the `list_of_pdf_layers` in order"""
-        if inp == 1:
-            curr_fun = list_of_pdf_layers[0](x)
-        else:
-            curr_fun = list_of_pdf_layers[0](add_log(x))
-
-        for dense_layer in list_of_pdf_layers[1:]:
-            curr_fun = dense_layer(curr_fun)
-        return curr_fun
-
-    # Preprocessing layer (will be multiplied to the last of the denses)
-    preproseed = seed + number_of_layers
-    layer_preproc = Preprocessing(
-        input_shape=(1,),
-        name="pdf_prepro",
-        flav_info=flav_info,
-        seed=preproseed,
-        output_dim=last_layer_nodes
-    )
-    # Basis rotation
-    basis_rotation = FlavourToEvolution(flav_info=flav_info, fitbasis=fitbasis)
+        add_log = Lambda(lambda x: op.concatenate([x, op.op_log(x)], axis=-1))
 
     # Evolution layer
     layer_evln = FkRotation(input_shape=(last_layer_nodes,), output_dim=out)
 
+    # Basis rotation
+    basis_rotation = FlavourToEvolution(flav_info=flav_info, fitbasis=fitbasis)
 
-    # Apply preprocessing and basis
-    def layer_fitbasis(x):
-        ret = operations.op_multiply([dense_me(x), layer_preproc(x)])
-        if basis_rotation.is_identity():
-            # if we don't need to rotate basis we don't want spurious layers
-            return ret
-        return basis_rotation(ret)
-
-    # Rotation layer, changes from the 8-basis to the 14-basis
-    def layer_pdf(x):
-        return layer_evln(layer_fitbasis(x))
-
-    # Prepare the input for the PDF model
-    placeholder_input = Input(shape=(None, 1), batch_size=1)
-
-    # Impose sumrule if necessary
+    # Normalization and sum rules
     if impose_sumrule:
-        layer_pdf, integrator_input = msr_constraints.msr_impose(layer_fitbasis, layer_pdf, mode=impose_sumrule)
+        sumrule_layer, integrator_input = msr_constraints.msr_impose(mode=impose_sumrule)
         model_input = [integrator_input, placeholder_input]
     else:
+        sumrule_layer = lambda x: x
         integrator_input = None
         model_input = [placeholder_input]
 
-    pdf_model = MetaModel(model_input, layer_pdf(placeholder_input), name="PDF")
+    pdf_models = []
+
+    # Now we need a trainable network per model to be trained in parallel
+    for i in range(parallel_models):
+        layer_seed = seed + i * number_of_layers
+        if layer_type == "dense":
+            reg = regularizer_selector(regularizer, **regularizer_args)
+            list_of_pdf_layers = generate_dense_network(
+                inp,
+                nodes,
+                activations,
+                initializer_name,
+                seed=seed,
+                dropout_rate=dropout,
+                regularizer=reg,
+            )
+        elif layer_type == "dense_per_flavour":
+            # Define the basis size attending to the last layer in the network
+            # TODO: this information should come from the basis information
+            #       once the basis information is passed to this class
+            list_of_pdf_layers = generate_dense_per_flavour_network(
+                inp,
+                nodes,
+                activations,
+                initializer_name,
+                seed=layer_seed,
+                basis_size=last_layer_nodes,
+            )
+
+        def dense_me(x):
+            """Takes an input tensor `x` and applies all layers
+            from the `list_of_pdf_layers` in order"""
+            if inp == 1:
+                curr_fun = list_of_pdf_layers[0](x)
+            else:
+                curr_fun = list_of_pdf_layers[0](add_log(x))
+
+            for dense_layer in list_of_pdf_layers[1:]:
+                curr_fun = dense_layer(curr_fun)
+            return curr_fun
+
+        # Preprocessing layer (will be multiplied to the last of the denses)
+        preproseed = seed + number_of_layers * (i + 1)
+        layer_preproc = Preprocessing(
+            input_shape=(1,), name=f"pdf_prepro_{i}", flav_info=flav_info, seed=preproseed
+        )
+
+        # Apply preprocessing and basis
+        def layer_fitbasis(x):
+            ret = op.op_multiply([dense_me(x), layer_preproc(x)])
+            if basis_rotation.is_identity():
+                # if we don't need to rotate basis we don't want spurious layers
+                return ret
+            return basis_rotation(ret)
+
+        # Rotation layer, changes from the 8-basis to the 14-basis
+        def layer_pdf(x):
+            return layer_evln(layer_fitbasis(x))
+
+        # Final PDF
+        final_pdf = sumrule_layer(layer_fitbasis, layer_pdf)
+
+        pdf_model = MetaModel(model_input, final_pdf(placeholder_input), name=f"PDF_{i}")
+
+        pdf_models.append(pdf_model)
 
-    return pdf_model
+    return pdf_models
diff --git a/n3fit/src/n3fit/msr.py b/n3fit/src/n3fit/msr.py
index 1f552888f1..20c43804cc 100644
--- a/n3fit/src/n3fit/msr.py
+++ b/n3fit/src/n3fit/msr.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 from n3fit.layers import xDivide, MSR_Normalization, xIntegrator
-from n3fit.backends import operations
+from n3fit.backends import operations as op
 from n3fit.backends import MetaModel
 
 
@@ -37,51 +37,57 @@ def gen_integration_input(nx):
     return xgrid, weights_array
 
 
-def msr_impose(fit_layer, final_pdf_layer, mode='All', verbose=False):
+def msr_impose(nx=int(2e3), basis_size=8, mode='All'):
     """
-        This function receives:
-            - fit_layer: the 8-basis layer of PDF which we fit
-            - final_layer: the 14-basis which is fed to the fktable
-        It uses pdf_fit to compute the sum rule and returns a modified version of
-        the final_pdf layer with a normalisation by which the sum rule is imposed
+        Generates a function that applies a normalization layer to the fit.
+        The normalization is computed from the direct output of the NN (so the 7,8-flavours basis)
+        and it is applied to the input of the fktable (i.e., to the 14-flavours fk-basis).
+
+        Parameters
+        ----------
+            nx: int
+                number of points for the integration grid
+            basis_size: int
+                number of flavours output of the NN
     """
     # 1. Generate the fake input which will be used to integrate
-    nx = int(2e3)
     xgrid, weights_array = gen_integration_input(nx)
 
     # 2. Prepare the pdf for integration
     #    for that we need to multiply several flavours with 1/x
     division_by_x = xDivide()
 
-    def pdf_integrand(x):
-        res = operations.op_multiply([division_by_x(x), fit_layer(x)])
-        return res
-
     # 3. Now create the integration layer (the layer that will simply integrate, given some weight
     integrator = xIntegrator(weights_array, input_shape=(nx,))
 
     # 4. Now create the normalization by selecting the right integrations
-    normalizer = MSR_Normalization(input_shape=(8,), mode=mode)
-
-    # 5. Make the xgrid numpy array into a backend input layer so it can be given
-    xgrid_input = operations.numpy_to_input(xgrid)
-    normalization = normalizer(integrator(pdf_integrand(xgrid_input)))
-
-    def ultimate_pdf(x):
-        return operations.op_multiply_dim([final_pdf_layer(x), normalization])
-
-    if verbose:
-        #         only_int = integrator(pdf_integrand(xgrid_input))
-        #         modelito = MetaModel(xgrid_input, only_int)
-        #         result = modelito.predict(x = None, steps = 1)
+    normalizer = MSR_Normalization(input_shape=(basis_size,), mode=mode)
+
+    # 5. Make the xgrid array into a backend input layer so it can be given to the normalization
+    xgrid_input = op.numpy_to_input(xgrid)
+
+    # Now prepare a function that takes as input the 8-flavours output of the NN
+    # and the 14-flavours after the fk rotation and returns a 14-flavours normalized output
+    # note + TODO:
+    # the idea was that the normalization should always be applied at the fktable 14-flavours
+    # and always computed at the output of the NN (in case one would like to compute it differently)
+    # don't think it is a good idea anymore and should be changed to act only on the output to the fktable
+    # but will be dealt with in the future.
+                            # fitlayer        #final_pdf
+    def apply_normalization(layer_fitbasis, layer_pdf):
+        """
+            layer_fitbasis: output of the NN
+            layer_pdf: output for the fktable
+        """
+        pdf_integrand = op.op_multiply([division_by_x(xgrid_input), layer_fitbasis(xgrid_input)])
+        normalization = normalizer(integrator(pdf_integrand))
 
-        print(" > > Generating model for the inyection layer which imposes MSR")
-        check_integration(ultimate_pdf, xgrid_input)
+        def ultimate_pdf(x):
+            return layer_pdf(x)*normalization
 
-    # Save a reference to xgrid in ultimate_pdf, very useful for debugging
-    ultimate_pdf.ref_xgrid = xgrid_input
+        return ultimate_pdf
 
-    return ultimate_pdf, xgrid_input
+    return apply_normalization, xgrid_input
 
 
 def check_integration(ultimate_pdf, integration_input):
@@ -93,12 +99,12 @@ def check_integration(ultimate_pdf, integration_input):
     """
     nx = int(1e4)
     xgrid, weights_array = gen_integration_input(nx)
-    xgrid_input = operations.numpy_to_input(xgrid)
+    xgrid_input = op.numpy_to_input(xgrid)
 
     multiplier = xDivide(output_dim=14, div_list=range(3, 9))
 
     def pdf_integrand(x):
-        res = operations.op_multiply([multiplier(x), ultimate_pdf(x)])
+        res = op.op_multiply([multiplier(x), ultimate_pdf(x)])
         return res
 
     modelito = MetaModel([xgrid_input, integration_input], pdf_integrand(xgrid_input))
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index b12e6359ea..dd912d997e 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -75,6 +75,7 @@ def initialize_seeds(replica: list, trvlseed: int, nnseed: int, mcseed: int, gen
 
 # Action to be called by valid phys
 # All information defining the NN should come here in the "parameters" dict
+@n3fit.checks.can_run_multiple_replicas
 @n3fit.checks.check_consistent_basis
 @n3fit.checks.wrapper_check_NN
 @n3fit.checks.wrapper_hyperopt
@@ -92,6 +93,7 @@ def performfit(
     hyperopt=None,
     debug=False,
     maxcores=None,
+    parallel_models=1,
 ):
     """
         This action will (upon having read a validcard) process a full PDF fit for a given replica.
@@ -141,6 +143,8 @@ def performfit(
                 activate some debug options
             maxcores: int
                 maximum number of (logical) cores that the backend should be aware of
+            parallel_models: int
+                number of models to be run in parallel
     """
     from n3fit.backends import set_initial_state
 
@@ -239,11 +243,11 @@ def performfit(
             fitting["fitbasis"],
             nnseed,
             debug=debug,
-            save_weights_each=fitting.get("save_weights_each"),
             kfold_parameters=kfold_parameters,
             max_cores=maxcores,
             model_file=fitting.get("load"),
-            sum_rules=fitting.get("sum_rules", True)
+            sum_rules=fitting.get("sum_rules", True),
+            parallel_models=parallel_models
         )
 
         # This is just to give a descriptive name to the fit function
@@ -297,63 +301,59 @@ def performfit(
 
         # After the fit is run we get a 'result' dictionary with the following items:
         stopping_object = result["stopping_object"]
-        pdf_model = result["pdf_model"]
-        true_chi2 = result["loss"]
-        training = result["training"]
-        log.info("Total exp chi2: %s", true_chi2)
+        pdf_models = result["pdf_models"]
 
         # Where has the stopping point happened (this is only for debugging purposes)
         print(
             """
         > > The stopping point has been at: {0} with a loss of {1}
                 which it got at {2}. Stopping degree {3}
-                Positivity state: {4}
+                Positivity status: {4}
                 """.format(
                 stopping_object.stop_epoch,
                 stopping_object.vl_chi2,
                 stopping_object.e_best_chi2,
                 stopping_object.stopping_degree,
-                stopping_object.positivity_status(),
+                stopping_object.positivity_status
             )
         )
 
-        # Create a pdf instance
-        pdf_instance = N3PDF(pdf_model, fit_basis=fitting.get("basis"))
+        final_time = stopwatch.stop()
+        all_training_chi2, all_val_chi2, all_exp_chi2 = the_model_trainer.evaluate(stopping_object)
 
-        # Generate the writer wrapper
-        writer_wrapper = WriterWrapper(
-            replica_number,
-            pdf_instance,
-            stopping_object,
-            theoryid.get_description().get("Q0") ** 2,
-            stopwatch.stop(),
-        )
+        for i, pdf_model in enumerate(pdf_models):
+            # Each model goes into its own replica folder
+            replica_path_set = replica_path / f"replica_{replica_number + i}"
 
-        # Now write the data down
-        training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
-        writer_wrapper.write_data(
-            replica_path_set, output_path.name, training_chi2, val_chi2, true_chi2
-        )
+            # Create a pdf instance
+            pdf_instance = N3PDF(pdf_model, fit_basis=fitting.get("basis"))
+
+            # Generate the writer wrapper
+            writer_wrapper = WriterWrapper(
+                replica_number,
+                pdf_instance,
+                stopping_object, # TODO
+                theoryid.get_description().get("Q0") ** 2,
+                final_time,
+            )
+
+            # Get the right chi2s
+            training_chi2 = np.take(all_training_chi2, i)
+            val_chi2 = np.take(all_val_chi2, i)
+            exp_chi2 = np.take(all_exp_chi2, i)
+
+            # And write the data down
+            writer_wrapper.write_data(
+                replica_path_set, output_path.name, training_chi2, val_chi2, exp_chi2
+            )
 
-        # Save the weights to some file for the given replica
-        model_file = fitting.get("save")
-        if model_file:
-            model_file_path = replica_path_set / model_file
-            log.info(" > Saving the weights for future in %s", model_file_path)
-            # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3 though)
-            pdf_model.save_weights(str(model_file_path), save_format="h5")
-
-        # If the history of weights is active then loop over it
-        # rewind the state back to every step and write down the results
-        for step in range(len(stopping_object.history.reloadable_history)):
-            stopping_object.history.rewind(step)
-            new_path = output_path / f"history_step_{step}/replica_{replica_number}"
-            # We need to recompute the experimental chi2 for this point
-            training_chi2, val_chi2, exp_chi2 = the_model_trainer.evaluate(stopping_object)
-            writer_wrapper.write_data(new_path, output_path.name, training_chi2, val_chi2, exp_chi2)
-
-        # So every time we want to capture output_path.name and addd a history_step_X
-        # parallel to the nnfit folder
+            # Save the weights to some file for the given replica
+            model_file = fitting.get("save")
+            if model_file:
+                model_file_path = replica_path_set / model_file
+                log.info(" > Saving the weights for future in %s", model_file_path)
+                # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3 though)
+                pdf_model.save_weights(str(model_file_path), save_format="h5")
 
         if tboard is not None:
             log.info("Tensorboard logging information is stored at %s", log_path)
diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py
index 6773cd6c1c..94fb7e8dee 100644
--- a/n3fit/src/n3fit/stopping.py
+++ b/n3fit/src/n3fit/stopping.py
@@ -41,6 +41,7 @@
 # Pass/veto keys
 POS_OK = "POS_PASS"
 POS_BAD = "POS_VETO"
+THRESHOLD_POS = 1e-6
 
 
 def parse_ndata(all_data):
@@ -117,7 +118,7 @@ def parse_losses(history_object, data, suffix="loss"):
     total_points = 0
     total_loss = 0
     for exp_name, npoints in data.items():
-        loss = np.take(hobj[exp_name + f"_{suffix}"], -1)
+        loss = np.array(hobj[exp_name + f"_{suffix}"])
         dict_chi2[exp_name] = loss / npoints
         total_points += npoints
         total_loss += loss
@@ -133,259 +134,357 @@ def parse_losses(history_object, data, suffix="loss"):
 
 class FitState:
     """
-        Holds the state of the chi2 during the fit.
+    Holds the state of the chi2 during the fit for all replicas
 
-        It holds the necessary information to reload the fit
-        to a specific point in time if we are interested on reloading
-        (otherwise the relevant variables stay empty to save memory)
+    It holds the necessary information to reload the fit
+    to a specific point in time if we are interested on reloading
+    (otherwise the relevant variables stay empty to save memory)
 
-        Note: the training chi2 is computed before the update of the weights
-        so it is the chi2 that informed the updated corresponding to this state.
-        The validation chi2 instead is computed after the update of the weights.
+    Note: the training chi2 is computed before the update of the weights
+    so it is the chi2 that informed the updated corresponding to this state.
+    The validation chi2 instead is computed after the update of the weights.
 
-        Parameters
-        ----------
-            all_tr_chi2: dict
-                Chi2 for all training datasets computed before the update of the weights
-            all_vl_chi2: dict
-                Chi2 for all validation datasets computed after the update of the weights
-            info: dict
-                Full return state of the Validation model
+    Parameters
+    ----------
+        training_info: dict
+            all losses for the training model
+        validation_info: dict
+            all losses for the validation model
     """
 
-    def __init__(self, all_tr_chi2, all_vl_chi2, info):
-        self.all_tr_chi2 = all_tr_chi2
-        self.all_vl_chi2 = all_vl_chi2
-        self.info = info
-        # These two variables are only filled for specific points
-        # in order to save precious memory, and only when we are
-        # saving the fit history each X number of epoch
-        self.weights = None
-        self.best_epoch = 0
-
-    def register_weigths(self, weights, best_epoch):
-        """ Save the current best weights and best_epoch of the fit """
-        self.weights = weights
-        self.best_epoch = best_epoch
+    vl_ndata = None
+    tr_ndata = None
+    vl_suffix = None
+
+    def __init__(self, training_info, validation_info):
+        if self.vl_ndata is None or self.tr_ndata is None or self.vl_suffix is None:
+            raise ValueError(
+                "FitState cannot be instantiated until vl_ndata, tr_ndata and vl_suffix are filled"
+            )
+        self.training = training_info
+        self.validation = validation_info
+        self._parsed = False
+        self._vl_chi2 = None
+        self._tr_chi2 = None
+        self._vl_dict = None
+        self._tr_dict = None
 
     @property
-    def vl_chi2(self):
-        """ Returns the total validation chi2 """
-        return self.all_vl_chi2["total"]
+    def vl_loss(self):
+        """Return the total validation loss as it comes from the info dictionaries"""
+        return self.validation.get("loss")
+
+    @property
+    def tr_loss(self):
+        """Return the total validation loss as it comes from the info dictionaries"""
+        return self.training.get("loss")
+
+    def _parse_chi2(self):
+        """
+        Parses the chi2 from the losses according to the `tr_ndata` and
+        `vl_ndata` dictionaries of {dataset: n_points}
+        """
+        if self._parsed:
+            return
+        if self.training is not None:
+            self._tr_chi2, self._tr_dict = parse_losses(self.training, self.tr_ndata)
+        if self.validation is not None:
+            self._vl_chi2, self._vl_dict = parse_losses(
+                self.validation, self.vl_ndata, suffix=self.vl_suffix
+            )
 
     @property
     def tr_chi2(self):
-        """ Returns the total training chi2 """
-        return self.all_tr_chi2["total"]
+        self._parse_chi2()
+        return self._tr_chi2
 
-    def __str__(self):
-        return f"chi2: tr={self.tr_chi2} vl={self.vl_chi2}"
+    @property
+    def vl_chi2(self):
+        self._parse_chi2()
+        return self._vl_chi2
 
+    @property
+    def all_tr_chi2(self):
+        self._parse_chi2()
+        return self._tr_dict
 
-class FitHistory:
-    """
-        Keeps a list of FitState items holding the full history of the fit.
+    @property
+    def all_vl_chi2(self):
+        self._parse_chi2()
+        return self._vl_dict
 
-        It also keeps track of the best epoch and the associated weights.
+    def all_tr_chi2_for_replica(self, r):
+        """" Return the tr chi2 per dataset for a given replica """
+        return {k: np.take(i, r) for k, i in self.all_tr_chi2.items()}
 
-        Can be iterated when there are snapshots of the fit being saved.
-        When iterated it will rewind the fit to each of the point in history
-        that have been saved.
+    def all_vl_chi2_for_replica(self, r):
+        """" Return the vl chi2 per dataset for a given replica """
+        return {k: np.take(i, r) for k, i in self.all_vl_chi2.items()}
+
+    def total_partial_tr_chi2(self):
+        """ Return the tr chi2 summed over replicas per experiment"""
+        return {k: np.sum(i) for k, i in self.all_tr_chi2.items()}
+
+    def total_partial_vl_chi2(self):
+        """ Return the vl chi2 summed over replicas per experiment"""
+        return {k: np.sum(i) for k, i in self.all_tr_chi2.items()}
+
+    def total_tr_chi2(self):
+        """ Return the total tr chi2 summed over replicas """
+        return np.sum(self.tr_chi2)
+
+    def total_vl_chi2(self):
+        """ Return the total vl chi2 summed over replicas """
+        return np.sum(self.vl_chi2)
+
+    def __str__(self):
+        return f"chi2: tr={self.tr_chi2} vl={self.vl_chi2}"
 
-        Parameters
-        ----------
-            pdf_model: n3fit.backends.MetaModel
-                PDF model being trained, used to saved the weights and compute
-                more
-            save_weights_each: int
-                if given, it will save a snapshot of the fit every  `save_weights_each` epochs
-    """
 
-    def __init__(self, pdf_model, save_weights_each=None):
+class ReplicaState:
+    """Extra complication which eventually will be merged with someone else
+    but it is here only for development."""
+
+    def __init__(self, pdf_model):
         self._pdf_model = pdf_model
-        self._save_weights_each = save_weights_each
-        # Initialize variables for the history
         self._weights = None
         self._best_epoch = None
-        self._history = []
-        self.final_epoch = None
-        # Initialize variables for the snapshots
-        self.reloadable_history = []
+        self._stop_epoch = None
+        self._best_vl_chi2 = INITIAL_CHI2
+
+    def positivity_pass(self):
+        """ By definition, if we have a ``best_epoch`` then positivity passed """
+        if self._best_epoch is None:
+            return False
+        else:
+            return True
 
     @property
     def best_epoch(self):
-        """ Epoch of the best fit """
+        if self._best_epoch is None:
+            return self.stop_epoch
         return self._best_epoch
 
-    @best_epoch.setter
-    def best_epoch(self, epoch):
-        """ Saves the current weight """
+    @property
+    def stop_epoch(self):
+        return self._stop_epoch
+
+    @property
+    def best_vl(self):
+        return float(self._best_vl_chi2)
+
+    @property
+    def positivity_status(self):
+        if self.positivity_pass():
+            return POS_OK
+        else:
+            return POS_BAD
+
+    def register_best(self, chi2, epoch):
+        """ Register a new best state and some metadata about it """
         self._weights = self._pdf_model.get_weights()
         self._best_epoch = epoch
+        self._best_vl_chi2 = chi2
+
+    def reload(self):
+        """ Reload the weights of the best state """
+        if self._weights:
+            self._pdf_model.set_weights(self._weights)
+
+    def stop_training(self, epoch = None):
+        """ Stop training this replica if not stopped before """
+        if self._pdf_model.trainable:
+            self._pdf_model.trainable = False
+            self._stop_epoch = epoch
+
+
+class FitHistory:
+    """
+    Keeps a list of FitState items holding the full history of the fit.
+
+    It also keeps track of the best epoch and the associated weights.
+
+    Can be iterated when there are snapshots of the fit being saved.
+    When iterated it will rewind the fit to each of the point in history
+    that have been saved.
+
+    Parameters
+    ----------
+        pdf_models: n3fit.backends.MetaModel
+            list of PDF models being trained, used to saved the weights
+    """
+
+    def __init__(self, pdf_models, tr_ndata, vl_ndata):
+        # Create a ReplicaState object for all models
+        # which will hold the best chi2 and weights per replica
+        self._replicas = []
+        for pdf_model in pdf_models:
+            self._replicas.append(ReplicaState(pdf_model))
+        self._iter_replicas = iter(self._replicas)
+
+        if vl_ndata is None:
+            vl_ndata = tr_ndata
+            vl_suffix = "loss"
+        else:
+            vl_suffix = "val_loss"
+        # All instances of FitState should use these
+        FitState.tr_ndata = tr_ndata
+        FitState.vl_ndata = vl_ndata
+        FitState.vl_suffix = vl_suffix
+
+        # Save a list of status for the entire fit
+        self._history = []
+        self.final_epoch = None
+
+    @property
+    def best_epoch(self):
+        """ Return the best epoch per replica """
+        return [i.best_epoch for i in self._replicas]
 
     def get_state(self, epoch):
         """ Get the FitState of the system for a given epoch """
-        return self._history[epoch]
+        try:
+            return self._history[epoch]
+        except IndexError as e:
+            raise ValueError(
+                f"Tried to get obtain the state for epoch {epoch} when only {len(self._history)} epochs have been saved"
+            ) from e
+
+    def save_best_replica(self, i, epoch=None):
+        """Save the state of replica ``i`` as a best fit so far.
+        If an epoch is given, save the best as the given epoch, otherwise
+        use the last one
+        """
+        if epoch is None:
+            epoch = self.final_epoch
+        loss = self.get_state(epoch).vl_loss[i]
+        self._replicas[i].register_best(loss, epoch)
 
-    def best_state(self):
-        """ Return the FitState object corresponding to the best fit """
-        if self.best_epoch is None:
-            return None
-        else:
-            index = self.best_epoch
-            best_state = self._history[index]
-            return best_state
+    def all_positivity_status(self):
+        """ Returns whether the positivity passed or not per replica """
+        return np.array([i.positivity_status for i in self._replicas])
 
-    def best_vl(self):
-        """ Returns the chi2 of the best fit
-        if there was no best fit returns `INITIAL_CHI2`
-        if there was a problem, returns `TERRIBLE_CHI2` """
-        if not self._weights:
-            return TERRIBLE_CHI2
-        best_state = self.best_state()
-        if best_state:
-            return best_state.vl_chi2
-        else:
-            return INITIAL_CHI2
-
-    def best_tr(self):
-        """ Returns the training chi2 of the best fit
-        if there are no best fit, returns the last one """
-        best_state = self.best_state()
-        if best_state:
-            return best_state.tr_chi2
-        else:
-            return self._history[self.final_epoch].tr_chi2
+    def all_best_vl_loss(self):
+        """ Returns the best validation loss for each replica """
+        return np.array([i.best_vl for i in self._replicas])
 
-    def register(self, fitstate, epoch):
-        """ Save a new fitstate and updates the current final epoch
-        Every `save_weights_each` (if set) saves a snapshot of the current best fit into
-        the fitstate
+    def register(self, epoch, training_info, validation_info):
+        """Save a new fitstate and updates the current final epoch
 
         Parameters
         ----------
-            `fitstate`
-                a fitstate object to save
-            `epoch`
+            fitstate: FitState
+                FitState object
+                the fitstate of the object to save
+            epoch: int
                 the current epoch of the fit
         """
+        # Save all the information in a fitstate object
+        fitstate = FitState(training_info, validation_info)
         self.final_epoch = epoch
         self._history.append(fitstate)
-        if self._save_weights_each:
-            save_here = (epoch + 1) % self._save_weights_each
-            if save_here == 0:
-                fitstate.register_weigths(self._weights, self.best_epoch)
-                self.reloadable_history.append(fitstate)
-
-    def reload(self, weights=None):
-        """ Reloads the best fit weights into the model
-        if there are models to be reloaded
-        A set of weights can be enforced as an optional argument
-        """
-        if weights is None:
-            weights = self._weights
-        if weights:
-            self._pdf_model.set_weights(weights)
+        return fitstate
+
+    def stop_training_replica(self, i, e):
+        """ Stop training replica i in epoch e"""
+        self._replicas[i].stop_training(e)
 
-    def rewind(self, step):
-        """ Rewind the FitHistory object to the step `step` in the fit
+    def reload(self):
+        """Reloads the best fit weights into the model if there are models to be reloaded
+        Ensure that all replicas have stopped at this point.
         """
-        fitstate = self.reloadable_history[step]
-        historic_weights = fitstate.weights
-        self.reload(weights=historic_weights)
-        self.best_epoch = fitstate.best_epoch
-        self.final_epoch = (step + 1) * self._save_weights_each - 1
-        # -1 because we are saving the epochs starting at 0
+        for replica in self._replicas:
+            replica.stop_training(self.final_epoch)
+            replica.reload()
+
+    def __next__(self):
+        return next(self._iter_replicas)
 
 
 class Stopping:
     """
-        Driver of the stopping algorithm
+    Driver of the stopping algorithm
 
-        Note, if the total number of points in the validation dictionary is None, it is assumed
-        the validation_model actually corresponds to the training model.
+    Note, if the total number of points in the validation dictionary is None, it is assumed
+    the validation_model actually corresponds to the training model.
 
-        Parameters
-        ----------
-            validation_model: n3fit.backends.MetaModel
-                the model with the validation mask applied
-                (and compiled with the validation data and covmat)
-            all_data_dict: dict
-                list containg all dictionaries containing all information about
-                the experiments/validation/regularizers/etc to be parsed by Stopping
-            pdf_model: n3fit.backends.MetaModel
-                pdf_model being trained
-            threshold_positivity: float
-               maximum value allowed for the sum of all positivity losses
-            total_epochs: int
-               total number of epochs
-            stopping_patience: int
-               how many epochs to wait for the validation loss to improve
-            dont_stop: bool
-               dont care about early stopping
-            save_weights_each: int
-                every how many epochs to save a snapshot of the fit
+    Parameters
+    ----------
+        validation_model: n3fit.backends.MetaModel
+           the model with the validation mask applied
+           (and compiled with the validation data and covmat)
+        all_data_dict: dict
+           list containg all dictionaries containing all information about
+           the experiments/validation/regularizers/etc to be parsed by Stopping
+        pdf_models: list(n3fit.backends.MetaModel)
+           list of pdf_models being trained
+        threshold_positivity: float
+           maximum value allowed for the sum of all positivity losses
+        total_epochs: int
+           total number of epochs
+        stopping_patience: int
+           how many epochs to wait for the validation loss to improve
+        dont_stop: bool
+           dont care about early stopping
     """
 
     def __init__(
         self,
         validation_model,
         all_data_dicts,
-        pdf_model,
-        threshold_positivity=1e-6,
+        pdf_models,
+        threshold_positivity=THRESHOLD_POS,
         total_epochs=0,
         stopping_patience=7000,
         threshold_chi2=10.0,
         dont_stop=False,
-        save_weights_each=None,
     ):
-        # Parse the training, validation and positivity sets from all the input dictionaries
-        self._tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts)
+        # Save the validation object
+        self._validation = validation_model
 
-        # Create the Validation, Positivity and History objects
-        if vl_ndata is None:
-            self.validation = Validation(validation_model, self._tr_ndata, no_validation=True)
-        else:
-            self.validation = Validation(validation_model, vl_ndata)
-        self.positivity = Positivity(threshold_positivity, pos_sets)
-        self.history = FitHistory(pdf_model, save_weights_each=save_weights_each)
+        # Create the History object
+        tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts)
+        self._history = FitHistory(pdf_models, tr_ndata, vl_ndata)
+
+        # And the positivity checker
+        self._positivity = Positivity(threshold_positivity, pos_sets)
 
         # Initialize internal variables for the stopping
+        self.n_replicas = len(pdf_models)
         self.threshold_chi2 = threshold_chi2
+        self.stopping_degree = np.zeros(self.n_replicas, dtype=np.int)
+        self.count = np.zeros(self.n_replicas, dtype=np.int)
+
         self.dont_stop = dont_stop
         self.stop_now = False
         self.stopping_patience = stopping_patience
-        self.stopping_degree = 0
-        self.count = 0
         self.total_epochs = total_epochs
 
     @property
     def vl_chi2(self):
-        """ Validation chi2 """
-        return self.history.best_vl()
-
-    @property
-    def tr_chi2(self):
-        """ Training chi2 """
-        return self.history.best_tr()
+        """ Current validation chi2 """
+        validation_info = self._validation.compute_losses()
+        fitstate = FitState(None, validation_info)
+        return fitstate.vl_chi2
 
     @property
     def e_best_chi2(self):
-        """ Epoch of the best chi2, if there is no best epoch
-        return the last epoch"""
-        be = self.history.best_epoch
-        if be is None:
-            return self.stop_epoch
-        return be
-
+        """ Epoch of the best chi2, if there is no best epoch, return last"""
+        return self._history.best_epoch
 
     @property
     def stop_epoch(self):
         """ Epoch in which the fit is stopped """
-        return self.history.final_epoch + 1
+        return self._history.final_epoch + 1
+
+    @property
+    def positivity_status(self):
+        """Returns POS_PASS if positivity passes or veto if it doesn't
+        for each replica"""
+        return self._history.all_positivity_status()
 
     def evaluate_training(self, training_model):
-        """ Given the training model, evaluates the
+        """Given the training model, evaluates the
         model and parses the chi2 of the training datasets
 
         Parameters
@@ -399,8 +498,8 @@ def evaluate_training(self, training_model):
                 chi2 of the given ``training_model``
         """
         training_info = training_model.compute_losses()
-        tr_chi2, _ = parse_losses(training_info, self._tr_ndata)
-        return tr_chi2
+        fitstate = FitState(training_info, None)
+        return fitstate.tr_chi2
 
     def monitor_chi2(self, training_info, epoch, print_stats=False):
         """
@@ -416,81 +515,86 @@ def monitor_chi2(self, training_info, epoch, print_stats=False):
 
         Parameters
         ----------
-            `training_info`
-                the output of a .fit() run
-            `epoch`
-                the index of the epoch
+            training_info: dict
+                output of a .fit() call, dictionary of the total loss (summed over replicas) for
+                each experiment
+            epoch: int
+                index of the epoch
 
         Returns
         -------
-            `pass_ok`
+            pass_ok: bool
                 true/false according to the status of the run
         """
-        # Step 1. Preprocess the event, count it towards the stopping degree
-        #         parse the training information and check whether it is a good point
-        tr_chi2, all_tr = parse_losses(training_info, self._tr_ndata)
-
-        if np.isnan(tr_chi2):
+        # Step 1. Check whether the fit has NaN'd and stop it if so
+        if np.isnan(training_info["loss"]):
             log.warning(" > NaN found, stopping activated")
-            self.stop_now = True
-            # If we had a good model at any point, reload
-            self.history.reload()
+            self.make_stop()
             return False
 
-        self.stopping_degree += self.count
-
-        # Step 2. Check the validation loss at this point
-        vl_chi2, all_vl = self.validation.loss()
+        # Step 2. Compute the validation metrics
+        validation_info = self._validation.compute_losses()
 
-        # Step 3. Store information about the run and print stats if asked
-        fitstate = FitState(all_tr, all_vl, self.validation.state)
-        self.history.register(fitstate, epoch)
+        # Step 3. Register the current point in (the) history
+        fitstate = self._history.register(epoch, training_info, validation_info)
         if print_stats:
             self.print_current_stats(epoch, fitstate)
 
         # Step 4. Check whether this is a better fit
         #         this means improving vl_chi2 and passing positivity
-        if self.positivity(fitstate) and vl_chi2 < self.threshold_chi2:
-            if vl_chi2 < self.history.best_vl():
-                # Set the new best
-                self.history.best_epoch = epoch
-                # Save stopping info
-                self.stopping_degree = 0
-                # Initialize the counter
-                self.count = 1
-
-        # If your patience has ended, prepare for stop
-        if self.stopping_degree > self.stopping_patience:
+        # Don't start counting until the chi2 of the validation goes below a certain threshold
+        # once we start counting, don't bother anymore
+        passes = self.count | (fitstate.vl_chi2 < self.threshold_chi2)
+        passes &= fitstate.vl_loss < self._history.all_best_vl_loss()
+        # And the ones that pass positivity
+        passes &= self._positivity(fitstate)
+
+        self.stopping_degree += self.count
+
+        # Step 5. loop over the valid indices to check whether the vl improved
+        for i in np.where(passes)[0]:
+            self._history.save_best_replica(i)
+            self.stopping_degree[i] = 0
+            self.count[i] = 1
+
+        stop_replicas = self.count & (self.stopping_degree > self.stopping_patience)
+        for i in np.where(stop_replicas)[0]:
+            self.count[i] = 0
+            self._history.stop_training_replica(i, epoch)
+
+        # By using the stopping degree we only stop when none of the replicas are improving anymore
+        if min(self.stopping_degree) > self.stopping_patience:
             self.make_stop()
         return True
 
     def make_stop(self):
-        """ Convenience method to set the stop_now flag
+        """Convenience method to set the stop_now flag
         and reload the history to the point of the best model if any
         """
         self.stop_now = True
-        self.history.reload()
+        self._history.reload()
 
     def print_current_stats(self, epoch, fitstate):
         """
-            Prints the last validation and training loss saved
+        Prints ``fitstate`` training and validation chi2s
         """
         epoch_index = epoch + 1
-        tr_loss = fitstate.tr_chi2
-        vl_loss = fitstate.vl_chi2
-        total_str = f"At epoch {epoch_index}/{self.total_epochs}, total loss: {tr_loss}\n"
-
-        partials = []
-        for experiment in self._tr_ndata:
-            chi2 = fitstate.all_tr_chi2[experiment]
-            partials.append(f"{experiment}: {chi2:.3f}")
-        total_str += ", ".join(partials)
-
-        total_str += f"\nValidation loss at this point: {vl_loss}"
+        tr_chi2 = fitstate.total_tr_chi2()
+        vl_chi2 = fitstate.total_vl_chi2()
+        total_str = f"At epoch {epoch_index}/{self.total_epochs}, total chi2: {tr_chi2}\n"
+
+        # The partial chi2 makes no sense for more than one replica at once:
+        if self.n_replicas == 1:
+            partial_tr_chi2 = fitstate.total_partial_tr_chi2()
+            partials = []
+            for experiment, chi2 in partial_tr_chi2.items():
+                partials.append(f"{experiment}: {chi2:.3f}")
+            total_str += ", ".join(partials) + "\n"
+        total_str += f"Validation chi2 at this point: {vl_chi2}"
         log.info(total_str)
 
     def stop_here(self):
-        """ Returns the stopping status
+        """Returns the stopping status
         If `dont_stop` is set returns always False (i.e., never stop)
         """
         if self.dont_stop:
@@ -498,133 +602,64 @@ def stop_here(self):
         else:
             return self.stop_now
 
-    def positivity_pass(self):
-        """ Checks whether the positivity loss is below the requested threshold
-        If there is no best state, the positivity (obv) cannot pass
-        """
-        best_state = self.history.best_state()
-        if best_state is not None and self.positivity(best_state):
-            return True
-        else:
-            return False
+    def get_next_replica(self):
+        """ Return the next ReplicaState object"""
+        return next(self._history)
 
-    def positivity_status(self):
-        """ Checks whether the positivity loss is below the requested threshold
-        If there is no best state, the positivity (obv) cannot pass
-        """
-        if self.positivity_pass():
-            return POS_OK
-        else:
-            return POS_BAD
-
-    def chi2exps_str(self, log_each=100):
+    def chi2exps_str(self, replica=0, log_each=100):
         """
         Returns a list of log-string with the status of the fit
         every `log_each` epochs
 
         Parameters
         ----------
-            `log_each`
+            replica: int
+                which replica are we writing the log for
+            log_each: int
                 every how many epochs to print the log
 
         Returns
         -------
-            `file_list`
-                a list of string to be printed as `chi2exps.log`
+            file_list: list(str)
+                a list of strings to be printed as `chi2exps.log`
         """
-        final_epoch = self.history.final_epoch
+        final_epoch = self._history.final_epoch
         file_list = []
         for i in range(log_each - 1, final_epoch + 1, log_each):
-            fitstate = self.history.get_state(i)
-            all_tr = fitstate.all_tr_chi2
-            all_vl = fitstate.all_vl_chi2
+            fitstate = self._history.get_state(i)
+            all_tr = fitstate.all_tr_chi2_for_replica(replica)
+            all_vl = fitstate.all_vl_chi2_for_replica(replica)
             # Here it is assumed the validation exp set is always a subset of the training exp set
             data_list = []
-            for exp in self._tr_ndata:
-                tr_loss = all_tr[exp]
+            for exp, tr_loss in all_tr.items():
                 vl_loss = all_vl.get(exp, 0.0)
                 data_str = f"{exp}: {tr_loss} {vl_loss}"
                 data_list.append(data_str)
             data = "\n".join(data_list)
             epoch_index = i + 1
-            total_tr_loss = fitstate.tr_chi2
-            total_vl_loss = fitstate.vl_chi2
             strout = f"""
 Epoch: {epoch_index}
 {data}
-Total: training = {total_tr_loss} validation = {total_vl_loss}
 """
             file_list.append(strout)
         return file_list
 
 
-class Validation:
-    """
-        Controls the NNPDF cross-validation algorithm
-
-        The cross-validation refers to the validation loss of the points of the dataset
-        not used in the fitting.
-        In general for any points considered here there will accompanying points from the
-        same dataset being included in the fitting.
-
-        Parameters
-        ----------
-            model: n3fit.backends.MetaModel
-                the model with the validation mask applied
-                (and compiled with the validation data and covmat)
-    """
-
-    def __init__(self, model, ndata_dict, verbose=False, no_validation=False):
-        self.model = model
-        self.state = None
-        self.verbose = verbose
-        self.ndata_dict = ndata_dict
-        self.n_val_exp = len(ndata_dict)
-        if no_validation:
-            self.suffix = "loss"
-        else:
-            self.suffix = "val_loss"
-
-    def _compute_validation_loss(self):
-        """
-        Evaluates the validation model and returns a tuple (`total_loss`, `vl_dict`)
-        with the information for the validation loss by experimenet normalized to the
-        number of points of each experiment
-
-        Returns
-        -------
-            `total_loss`
-                total vale for the validation loss
-            `vl_dict`
-                dictionary containing a map of experiment names and loss
-        """
-        loss_dict = self.model.compute_losses()
-        self.state = loss_dict
-        return parse_losses(loss_dict, self.ndata_dict, suffix=self.suffix)
-
-    def loss(self):
-        """
-        Returns a tuple with the validation loss and a
-        dictionary for the validation loss per experiment
-        """
-        return self._compute_validation_loss()
-
-
 class Positivity:
     """
-        Controls the positivity requirements.
+    Controls the positivity requirements.
 
-        In order to check the positivity passes will check the history of the fitting
-        as the fitting included positivity sets.
-        If the sum of all positivity sets losses is above a certain value the model is
-        not accepted and the training continues.
+    In order to check the positivity passes will check the history of the fitting
+    as the fitting included positivity sets.
+    If the sum of all positivity sets losses is above a certain value the model is
+    not accepted and the training continues.
 
-        Parameters
-        ----------
-            threshold_positivity: float
-                maximum value allowed for the sum of all positivity losses
-            positivity_sets: list
-                list of positivity datasets
+    Parameters
+    ----------
+        threshold_positivity: float
+            maximum value allowed for the sum of all positivity losses
+        positivity_sets: list
+            list of positivity datasets
     """
 
     def __init__(self, threshold, positivity_sets):
@@ -633,29 +668,27 @@ def __init__(self, threshold, positivity_sets):
 
     def check_positivity(self, history_object):
         """
-        This function receives a history objects and loops over the
-        positivity_sets to check the value of the positivity loss.
-
-        If the positivity loss is above the threshold, the positivity fails
-        otherwise, it passes.
-
-        Parameters
-        ----------
-            history_object: dict
-                dictionary of entries in the form  {'name': loss}, output of a MetaModel .fit()
+                This function receives a history objects and loops over the
+                positivity_sets to check the value of the positivity loss.
+
+                If the positivity loss is above the threshold, the positivity fails
+                otherwise, it passes.
+                It returns an array booleans which are True if positivity passed
+        story_object[key_loss] < self.threshold
+                Parameters
+                ----------
+                    history_object: dict
+                        dictionary of entries in the form  {'name': loss}, output of a MetaModel .fit()
         """
+        positivity_pass = True
         for key in self.positivity_sets:
             key_loss = f"{key}_loss"
-            # If we are taking the avg when checking the output, we should do so here as well?
-            positivity_loss = np.take(history_object[key_loss], -1)
-            if positivity_loss > self.threshold:
-                return False
-        # If none of the positivities failed, it passes
-        return True
+            positivity_pass &= history_object[key_loss] < self.threshold
+        return np.array(positivity_pass)
 
     def __call__(self, fitstate):
         """
-            Checks whether a given FitState object
-            passes the positivity requirement
+        Checks whether a given FitState object
+        passes the positivity requirement
         """
-        return self.check_positivity(fitstate.info)
+        return self.check_positivity(fitstate.validation)
diff --git a/n3fit/src/n3fit/tests/test_backend.py b/n3fit/src/n3fit/tests/test_backend.py
index 2693fd435a..64355a06f1 100644
--- a/n3fit/src/n3fit/tests/test_backend.py
+++ b/n3fit/src/n3fit/tests/test_backend.py
@@ -6,7 +6,6 @@
 import functools
 import numpy as np
 from n3fit.backends import operations as op
-from n3fit.backends import losses
 
 # General parameters
 DIM = 7
@@ -117,34 +116,3 @@ def test_tensor_product():
 
 def test_sum():
     numpy_check(op.sum, np.sum, mode='single')
-
-
-# Tests loss functions
-def test_l_invcovmat():
-    loss_f = losses.l_invcovmat(INVCOVMAT)
-    result = loss_f(T1, T2)
-    y = ARR1 - ARR2
-    tmp = np.dot(INVCOVMAT, y)
-    reference = np.dot(y, tmp)
-    are_equal(result, reference)
-
-
-def test_l_positivity():
-    alpha = 1e-7
-    loss_f = losses.l_positivity(alpha=alpha)
-    result = loss_f(0.0, T1)
-
-    def elu_sum(yarr_in):
-        """ Applies Exponential Linear Unit
-        to an array and sums it up """
-        yarr = -yarr_in
-        res = 0.0
-        for y in yarr:
-            if y > 0:
-                res += y
-            else:
-                res += alpha * (np.exp(y) - 1)
-        return res
-
-    reference = elu_sum(ARR1)
-    are_equal(result, reference)
diff --git a/n3fit/src/n3fit/tests/test_fit.py b/n3fit/src/n3fit/tests/test_fit.py
index adfb4809a8..2c3f509405 100644
--- a/n3fit/src/n3fit/tests/test_fit.py
+++ b/n3fit/src/n3fit/tests/test_fit.py
@@ -138,6 +138,7 @@ def test_performfit_and_timing(tmp_path):
     auxiliary_performfit(tmp_path, replica=2, timing=True)
 
 
+@pytest.mark.skip(reason="Still not implemented in parallel mode")
 def test_hyperopt(tmp_path):
     # Prepare the run
     quickcard = f"hyper-{QUICKNAME}.yml"
diff --git a/n3fit/src/n3fit/tests/test_layers.py b/n3fit/src/n3fit/tests/test_layers.py
index 8ea84457da..d27616b830 100644
--- a/n3fit/src/n3fit/tests/test_layers.py
+++ b/n3fit/src/n3fit/tests/test_layers.py
@@ -163,7 +163,8 @@ def test_DY():
         fks = [i['fktable'] for i in fkdicts]
         obs_layer = layers.DY(fkdicts, fks, ope, nfl=FLAVS)
         pdf = np.random.rand(XSIZE, FLAVS)
-        kp = op.numpy_to_tensor(np.expand_dims(pdf, 0))
+        # Add batch dimension (0) and replica dimension (-1)
+        kp = op.numpy_to_tensor(np.expand_dims(pdf, [0,-1]))
         # generate the n3fit results
         result_tensor = obs_layer(kp)
         result = op.evaluate(result_tensor)
diff --git a/n3fit/src/n3fit/tests/test_losses.py b/n3fit/src/n3fit/tests/test_losses.py
new file mode 100644
index 0000000000..784c9aeed7
--- /dev/null
+++ b/n3fit/src/n3fit/tests/test_losses.py
@@ -0,0 +1,43 @@
+"""
+    Test the losses layers
+"""
+import numpy as np
+from n3fit.layers import losses
+from n3fit.backends import operations as op
+from .test_backend import are_equal, DIM
+
+ARR1 = np.random.rand(DIM)
+ARR2 = np.random.rand(DIM)
+C = np.random.rand(DIM, DIM)
+INVCOVMAT = np.linalg.inv(C @ C.T)
+
+# Tests loss functions
+def test_l_invcovmat():
+    loss_f = losses.LossInvcovmat(INVCOVMAT, ARR1)
+    # Add a replica and batch dimension to T2
+    result = loss_f(np.expand_dims(ARR2, [0, 1]))
+    y = ARR1 - ARR2
+    tmp = np.dot(INVCOVMAT, y)
+    reference = np.dot(y, tmp)
+    are_equal(result, reference)
+
+
+def test_l_positivity():
+    alpha = 1e-7
+    loss_f = losses.LossPositivity(alpha=alpha)
+    result = loss_f(np.expand_dims(ARR2, [0, 1]))
+
+    def elu_sum(yarr_in):
+        """Applies Exponential Linear Unit
+        to an array and sums it up"""
+        yarr = -yarr_in
+        res = 0.0
+        for y in yarr:
+            if y > 0:
+                res += y
+            else:
+                res += alpha * (np.exp(y) - 1)
+        return res
+
+    reference = elu_sum(ARR1)
+    are_equal(result, reference)
diff --git a/n3fit/src/n3fit/vpinterface.py b/n3fit/src/n3fit/vpinterface.py
index dd5077e8c5..b4d4a975b6 100644
--- a/n3fit/src/n3fit/vpinterface.py
+++ b/n3fit/src/n3fit/vpinterface.py
@@ -18,12 +18,14 @@
 
 
 """
+import logging
 import numpy as np
 import numpy.linalg as la
 from validphys.core import PDF, MCStats
 from validphys.pdfbases import ALL_FLAVOURS, check_basis
 from validphys.arclength import integrability_number, arc_lengths
 
+log = logging.getLogger(__name__)
 # Order of the evolution basis output from n3fit
 EVOL_LIST = [
     "photon",
@@ -83,13 +85,23 @@ def get_nn_weights(self):
         """Outputs all weights of the NN as numpy.ndarrays """
         return self.model.get_weights()
 
-    def get_preprocessing_factors(self):
+    def get_preprocessing_factors(self, replica=None):
         """Loads the preprocessing alpha and beta arrays from the PDF trained model.
         If a ``fit_basis`` given in the format of ``n3fit`` runcards is given it will be used
         to generate a new dictionary with the names, the exponent and whether they are trainable
         otherwise outputs a Nx2 array where [:,0] are alphas and [:,1] betas
         """
-        preprocessing_layer = self.model.get_layer("pdf_prepro")
+        # If the replica is given, get the requested preprocessing layer
+        # otherwise, search for any pdf_prepro_X layer within the model
+        if replica is not None:
+            preprocessing_layer = self.model.get_layer(f"pdf_prepro_{replica}")
+        else:
+            preprocessing_layers = self.model.get_layer_re("pdf_prepro_\d")
+            if len(preprocessing_layers) != 1:
+                # We really don't want to fail at this point, but print a warning at least...
+                log.warning("More than one preprocessing layer found within the model!")
+            preprocessing_layer = preprocessing_layers[0]
+
         if self.fit_basis is not None:
             output_dictionaries = []
             for d in self.fit_basis: