From 2517a0948b9ad187c53d7f5f6b8944aba2ffa831 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 1 Jun 2021 09:53:37 +0200
Subject: [PATCH 1/6] parallel replicas now accept genrep=true

---
 n3fit/runcards/Basic_runcard_parallel.yml |  5 +-
 n3fit/src/n3fit/checks.py                 | 22 ++-----
 n3fit/src/n3fit/model_gen.py              | 73 +++++++++++++++--------
 n3fit/src/n3fit/model_trainer.py          |  2 +-
 n3fit/src/n3fit/performfit.py             | 62 ++++++++++++++-----
 5 files changed, 104 insertions(+), 60 deletions(-)

diff --git a/n3fit/runcards/Basic_runcard_parallel.yml b/n3fit/runcards/Basic_runcard_parallel.yml
index ff64063412..5d35e50bd0 100644
--- a/n3fit/runcards/Basic_runcard_parallel.yml
+++ b/n3fit/runcards/Basic_runcard_parallel.yml
@@ -38,7 +38,7 @@ theory:
 trvlseed: 1
 nnseed: 2
 mcseed: 3
-genrep: False     # true = generate MC replicas, false = use real data
+genrep: True     # true = generate MC replicas, false = use real data
 
 parameters: # This defines the parameter dictionary that is passed to the Model Trainer
   nodes_per_layer: [15, 10, 8]
@@ -86,4 +86,5 @@ positivity:
 ############################################################
 debug: False
 maxcores: 8
-parallel_models: 4
+parallel_models: true
+same_trvl_per_replica: true
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index 4774510af2..4d36d5c858 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -354,27 +354,15 @@ def can_run_multiple_replicas(replicas, genrep, parallel_models):
     """Checks whether a runcard which is trying to run several replicas at once
     (parallel_models =/= 1) is valid
     """
-    rp = len(replicas)
-    if rp > 1 and not genrep:
-        raise CheckError(
-            "Can't run more than one replica at once if no replicas are to be generated"
-        )
-    if rp > 1 and parallel_models != 1:
-        raise CheckError("Parallel mode cannot be used together with multireplica runs")
-
-@make_argcheck
-def can_run_parallel_replicas(genrep, parameters, hyperopt, parallel_models):
-    """Checks whether a runcard which is trying to run several replicas at once
-    (parallel_models =/= 1) is valid
-    """
-    if parallel_models == 1:
+    if not parallel_models:
         return
-    if hyperopt:
+    rp = len(replicas)
+    if hyperopt and rp > 1:
         raise CheckError("Running replicas in parallel with hyperopt is still not supported")
-    if genrep:
-        raise CheckError("Replica generation is not supported yet for parallel models")
     if parameters.get("layer_type") != "dense":
         raise CheckError("Parallelization has only been tested with layer_type=='dense'")
+    if rp == 1:
+        log.warning("parallel_models is set to true for only one replica")
 
 
 @make_argcheck
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index b85a5835a2..11ec7a2c04 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -6,6 +6,8 @@
             Generates the output layers as functions
         # pdfNN_layer_generator:
             Generates the PDF NN layer to be fitted
+
+
 """
 from dataclasses import dataclass
 import numpy as np
@@ -18,6 +20,7 @@
 from n3fit.backends import MetaLayer, Lambda
 from n3fit.backends import base_layer_selector, regularizer_selector
 
+
 @dataclass
 class ObservableWrapper:
     """Wrapper to generate the observable layer once the PDF model is prepared
@@ -53,7 +56,7 @@ def _generate_loss(self, mask=None):
         return loss
 
     def _generate_experimental_layer(self, pdf):
-        """ Generates the experimental layer from the PDF """
+        """Generates the experimental layer from the PDF"""
         # First split the layer into the different datasets (if needed!)
         if len(self.dataset_xsizes) > 1:
             splitting_layer = op.as_layer(
@@ -79,7 +82,9 @@ def __call__(self, pdf_layer, mask=None):
         return loss_f(experiment_prediction)
 
 
-def observable_generator(spec_dict, positivity_initial=1.0, integrability=False):  # pylint: disable=too-many-locals
+def observable_generator(
+    spec_dict, positivity_initial=1.0, integrability=False
+):  # pylint: disable=too-many-locals
     """
     This function generates the observable model for each experiment.
     These are models which takes as input a PDF tensor (1 x size_of_xgrid x flavours) and outputs
@@ -199,7 +204,6 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         model_obs_vl.append(obs_layer_vl)
         model_obs_ex.append(obs_layer_ex)
 
-
     full_nx = sum(dataset_xsizes)
     if spec_dict["positivity"]:
         out_positivity = ObservableWrapper(
@@ -212,10 +216,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         )
 
         layer_info = {
-                "inputs": model_inputs,
-                "output_tr": out_positivity,
-                "experiment_xsize" : full_nx
-                }
+            "inputs": model_inputs,
+            "output_tr": out_positivity,
+            "experiment_xsize": full_nx,
+        }
         # For positivity we end here
         return layer_info
 
@@ -264,13 +268,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         "output_tr": out_tr,
         "output_vl": out_vl,
         "experiment_xsize": full_nx,
-        }
+    }
     return layer_info
 
 
-
-
-
 # Network generation functions
 def generate_dense_network(
     nodes_in,
@@ -436,6 +437,17 @@ def pdfNN_layer_generator(
     Finally we output the final answer as well as the list of all generating functions
     in the model for easy usage within `n3fit`.
 
+    Example
+    -------
+
+    >>> import numpy as np
+    >>> from n3fit.vpinterface import N3PDF
+    >>> from n3fit.model_gen import pdfNN_layer_generator
+    >>> from validphys.pdfgrids import xplotting_grid
+    >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'cbar', 's', 'sbar']]
+    >>> fake_x = np.linspace(1e-3,0.8,3)
+    >>> pdf_model = pdfNN_layer_generator(nodes=[8], activations=['linear'], seed=[2,3], flav_info=fake_fl, parallel_models=2)
+
     Parameters
     ----------
         inp: int
@@ -460,10 +472,10 @@ def pdfNN_layer_generator(
         dropout: float
             rate of dropout layer by layer
         impose_sumrule: str
-            whether to impose sumrule on the output pdf model and which one to impose (All, MSR, VSR)
+            whether to impose sumrules on the output pdf and which one to impose (All, MSR, VSR)
         scaler: scaler
             Function to apply to the input. If given the input to the model
-            will be a (1, None, 2) tensor where dim [:,:,0] is scaled 
+            will be a (1, None, 2) tensor where dim [:,:,0] is scaled
         parallel_models: int
             How many models should be trained in parallel
 
@@ -473,6 +485,11 @@ def pdfNN_layer_generator(
             a model f(x) = y where x is a tensor (1, xgrid, 1) and y a tensor (1, xgrid, out)
     """
     # Parse the input configuration
+    if seed is None:
+        seed = parallel_models * [None]
+    elif isinstance(seed, int):
+        seed = parallel_models * [seed]
+
     if nodes is None:
         nodes = [15, 8]
     ln = len(nodes)
@@ -492,7 +509,7 @@ def pdfNN_layer_generator(
 
     number_of_layers = len(nodes)
     # The number of nodes in the last layer is equal to the number of fitted flavours
-    last_layer_nodes = nodes[-1] # (== len(flav_info))
+    last_layer_nodes = nodes[-1]  # (== len(flav_info))
 
     # Generate the generic layers that will not depend on extra considerations
 
@@ -506,18 +523,18 @@ def pdfNN_layer_generator(
     # TODO: make it its own option (i.e., one could want to use this without using scaler)
     if scaler:
         # change the input domain [0,1] -> [-1,1]
-        process_input = Lambda(lambda  x: 2*x-1)
+        process_input = Lambda(lambda x: 2 * x - 1)
         subtract_one = True
         input_x_eq_1 = scaler([1.0])[0]
         placeholder_input = Input(shape=(None, 2), batch_size=1)
-    elif inp==2:
+    elif inp == 2:
         # If the input is of type (x, logx)
         # create a x --> (x, logx) layer to preppend to everything
         process_input = Lambda(lambda x: op.concatenate([x, op.op_log(x)], axis=-1))
 
     model_input = [placeholder_input]
     if subtract_one:
-        layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1,1))
+        layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1, 1))
         model_input.append(layer_x_eq_1)
 
     # Evolution layer
@@ -533,12 +550,9 @@ def pdfNN_layer_generator(
     else:
         sumrule_layer = lambda x: x
 
-
     # Now we need a trainable network per model to be trained in parallel
     pdf_models = []
-    for i in range(parallel_models):
-        # Move the seed
-        layer_seed = seed + i * number_of_layers
+    for i, layer_seed in enumerate(seed):
         if layer_type == "dense":
             reg = regularizer_selector(regularizer, **regularizer_args)
             list_of_pdf_layers = generate_dense_network(
@@ -555,7 +569,12 @@ def pdfNN_layer_generator(
             # TODO: this information should come from the basis information
             #       once the basis information is passed to this class
             list_of_pdf_layers = generate_dense_per_flavour_network(
-                inp, nodes, activations, initializer_name, seed=layer_seed, basis_size=last_layer_nodes,
+                inp,
+                nodes,
+                activations,
+                initializer_name,
+                seed=layer_seed,
+                basis_size=last_layer_nodes,
             )
 
         def dense_me(x):
@@ -568,18 +587,18 @@ def dense_me(x):
                 curr_fun = dense_layer(curr_fun)
             return curr_fun
 
-        preproseed = layer_seed + number_of_layers * (i + 1)
+        preproseed = layer_seed + number_of_layers
         layer_preproc = Preprocessing(
             flav_info=flav_info,
             input_shape=(1,),
             name=f"pdf_prepro_{i}",
             seed=preproseed,
-            large_x = not subtract_one
+            large_x=not subtract_one,
         )
 
         # Apply preprocessing and basis
         def layer_fitbasis(x):
-            """ The tensor x has a expected shape of (1, None, {1,2})
+            """The tensor x has a expected shape of (1, None, {1,2})
             where x[...,0] corresponds to the feature_scaled input and x[...,-1] the original input
             """
             x_scaled = op.op_gather_keep_dims(x, 0, axis=-1)
@@ -604,6 +623,8 @@ def layer_pdf(x):
         final_pdf = sumrule_layer(layer_pdf)
 
         # Create the model
-        pdf_model = MetaModel(model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler)
+        pdf_model = MetaModel(
+            model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler
+        )
         pdf_models.append(pdf_model)
     return pdf_models
diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py
index 853983cad1..30ba439199 100644
--- a/n3fit/src/n3fit/model_trainer.py
+++ b/n3fit/src/n3fit/model_trainer.py
@@ -111,6 +111,7 @@ def __init__(
                 the name of the basis being fitted
             nnseed: int
                 the seed used to initialise the Neural Network, will be passed to model_gen
+                can be a list of lists of parallel_models > 1
             pass_status: str
                 flag to signal a good run
             failed_status: str
@@ -128,7 +129,6 @@ def __init__(
             parallel_models: int
                 number of models to fit in parallel
         """
-
         # Save all input information
         self.exp_info = exp_info
         self.pos_info = pos_info
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 4cec1c751a..c5838d65ed 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -3,6 +3,8 @@
 """
 
 # Backend-independent imports
+from collections import namedtuple
+import copy
 import logging
 import numpy as np
 import n3fit.checks
@@ -36,7 +38,7 @@ def performfit(
     tensorboard=None,
     debug=False,
     maxcores=None,
-    parallel_models=1
+    parallel_models=False
 ):
     """
         This action will (upon having read a validcard) process a full PDF fit
@@ -115,8 +117,8 @@ def performfit(
                 activate some debug options
             maxcores: int
                 maximum number of (logical) cores that the backend should be aware of
-            parallel_models: int
-                number of models to be run in parallel
+            parallel_models: bool
+                whether to run models in parallel
     """
     from n3fit.backends import set_initial_state
 
@@ -132,18 +134,45 @@ def performfit(
     from n3fit.model_trainer import ModelTrainer
     from n3fit.io.writer import WriterWrapper
 
+    # If models are to be run in parallel, we just need to enter the replica loop once
+    # but we need all data
+    n_models = len(replicas_nnseed_fitting_data_dict)
+    if parallel_models or n_models == 1:
+        replicas, replica_experiments, nnseeds = zip(*replicas_nnseed_fitting_data_dict)
+        # Parse the experiments so that the output data contain information for all replicas
+        # as the only different from replica to replica is the experimental training/validation data
+        all_experiments = copy.deepcopy(replica_experiments[0])
+        for i_exp in range(len(all_experiments)):
+            training_data = []
+            validation_data = []
+            for i_rep in range(n_models):
+                training_data.append(replica_experiments[i_rep][i_exp]['expdata'])
+                validation_data.append(replica_experiments[i_rep][i_exp]['expdata_vl'])
+            all_experiments[i_exp]['expdata'] = np.concatenate(training_data, axis=0)
+            all_experiments[i_exp]['expdata_vl'] = np.concatenate(validation_data, axis=0)
+        log.info(
+            "Starting replica fits %d to %d",
+            replicas[0],
+            replicas[0] + n_models - 1,
+        )
+        replicas_info = [(replicas, all_experiments, nnseeds)]
+    else:
+        replicas_info = replicas_nnseed_fitting_data_dict
+
     # Note: In the basic scenario we are only running for one replica and thus this loop is only
     # run once as replicas_nnseed_fitting_data_dict is a list of just one element
-    for replica_number, exp_info, nnseed in replicas_nnseed_fitting_data_dict:
-        replica_path_set = replica_path / f"replica_{replica_number}"
-        if parallel_models == 1:
+
+    # Note: there are three possible scenarios for this loop:
+    #   1.- Only one replica is being run, in this case the loop is only evaluated once
+    #   2.- Many replicas being run, in this case each will have a replica_number, seed, etc
+    #       and they will be fitted sequentially
+    #   3.- Many replicas being run in parallel. In this case the loop will be evaluated just once
+    #       but a model per replica will be generated
+    for replica_numbers, exp_info, nnseed in replicas_info:
+        # TODO: replica_number is too restrictive as a name!
+        if not parallel_models:
+            replica_number = replica_numbers
             log.info("Starting replica fit %s", replica_number)
-        else:
-            log.info(
-                "Starting replica fits %s to %s",
-                replica_number,
-                replica_number + parallel_models - 1,
-            )
 
         # Generate a ModelTrainer object
         # this object holds all necessary information to train a PDF (up to the NN definition)
@@ -159,7 +188,7 @@ def performfit(
             max_cores=maxcores,
             model_file=load,
             sum_rules=sum_rules,
-            parallel_models=parallel_models
+            parallel_models=n_models
         )
 
         # This is just to give a descriptive name to the fit function
@@ -178,6 +207,7 @@ def performfit(
         if hyperopt:
             from n3fit.hyper_optimization.hyper_scan import hyper_scan_wrapper
 
+            replica_path_set = replica_path / f"replica_{replica_number}"
             true_best = hyper_scan_wrapper(
                 replica_path_set, the_model_trainer, parameters, hyperscan, max_evals=hyperopt,
             )
@@ -198,6 +228,10 @@ def performfit(
         if tensorboard is not None:
             profiling = tensorboard.get("profiling", False)
             weight_freq = tensorboard.get("weight_freq", 0)
+            if parallel_models:
+                replica_path_set = replica_path
+            else:
+                replica_path_set = replica_path / f"replica_{replica_number}"
             log_path = replica_path_set / "tboard"
             the_model_trainer.enable_tensorboard(log_path, weight_freq, profiling)
 
@@ -216,7 +250,7 @@ def performfit(
         all_training_chi2, all_val_chi2, all_exp_chi2 = the_model_trainer.evaluate(stopping_object)
 
         pdf_models = result["pdf_models"]
-        for i, pdf_model in enumerate(pdf_models):
+        for i, (replica_number, pdf_model) in enumerate(zip(replica_numbers, pdf_models)):
             # Each model goes into its own replica folder
             replica_path_set = replica_path / f"replica_{replica_number + i}"
 

From 59eb42d1f3d5481d8bc538b3a80f2c90ad567a59 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Tue, 1 Jun 2021 12:57:20 +0200
Subject: [PATCH 2/6] change docs, beautify

---
 doc/sphinx/source/n3fit/runcard_detailed.rst | 24 ++++++++-
 n3fit/src/n3fit/checks.py                    | 27 ++++++----
 n3fit/src/n3fit/model_gen.py                 |  2 +-
 n3fit/src/n3fit/model_trainer.py             | 17 +++---
 n3fit/src/n3fit/performfit.py                | 54 ++++++++++++--------
 5 files changed, 79 insertions(+), 45 deletions(-)

diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst
index 091819644a..7a9ad9fce0 100644
--- a/doc/sphinx/source/n3fit/runcard_detailed.rst
+++ b/doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -291,14 +291,34 @@ Running fits in parallel
 ------------------------
 
 It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models``
-flag in the runcard (by default the number of ``parallel_models`` is set to 1).
+flag in the runcard when running a range of replicas.
 Running in parallel can be quite hard on memory and it is only advantageous when
 fitting on a GPU, where one can find a speed up equal to the number of models run
 in parallel (each model being a different replica).
 
-At present it cannot be used together with the ``hyperopt`` module.
+Running in parallel leverages the fact that the only difference between two replicas
+is the output data the prediction is compared to.
+In order to ensure this is indeed the case it is necessary to also
+use the `same_trvl_per_replica` flag in the runcard.
+
+In order to run several replicas in parallel, add the following options
+to the runcard:
+
+.. code-block:: yaml
+
+  parallel_models: true
+  same_trvl_per_replica: true
 
 
+And then run ``n3fit`` with the replica range to be parallelized
+(in this case from replica 1 to replica 4).
+
+.. code-block:: bash
+  
+   n3fit runcard.yml 1 -r 4
+
+At present it cannot be used together with the ``hyperopt`` module.
+
 .. _otheroptions-label:
 
 Other options
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index 4d36d5c858..40ef62edef 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -265,20 +265,18 @@ def check_hyperopt_stopping(stopping_dict):
         if min_ep is None or max_ep is None:
             raise CheckError("Need to set both the max_epochs and the min_epochs")
         if min_ep < 1:
-            raise CheckError(f"Can't run for less than 1 epoch: " "selected min_ep = {min_ep}")
+            raise CheckError(f"Can't run for less than 1 epoch: selected min_ep = {min_ep}")
         if max_ep <= min_ep:
             raise CheckError(f"min_epochs cannot be greater than max_epochs: ({min_ep} > {max_ep})")
     min_pat = stopping_dict.get("min_patience")
     max_pat = stopping_dict.get("max_patience")
     if min_pat is not None or max_pat is not None:
         if min_pat is not None and min_pat < 0.0:
-            raise CheckError(
-                f"min_patience cannot be less than 0.0: " "selected min_pat = {min_pat}"
-            )
+            raise CheckError(f"min_patience cannot be less than 0.0: selected min_pat = {min_pat}")
         if max_pat is not None:
             if max_pat > 1.0:
                 raise CheckError(
-                    f"max_patience cannot be greater than 1.0: " "selected max_pat = {max_pat}"
+                    f"max_patience cannot be greater than 1.0: selected max_pat = {max_pat}"
                 )
             if min_pat is not None and max_pat < min_pat:
                 raise CheckError(
@@ -292,7 +290,7 @@ def wrapper_hyperopt(hyperopt, hyperscan, genrep, data):
     No check is performed if hyperopt is not active
     """
     if not hyperopt:
-        return None
+        return
     if genrep:
         raise CheckError("Generation of replicas is not accepted during hyperoptimization")
     if hyperscan is None:
@@ -350,19 +348,26 @@ def check_consistent_basis(sum_rules, fitbasis, basis, theoryid):
 
 
 @make_argcheck
-def can_run_multiple_replicas(replicas, genrep, parallel_models):
+def can_run_multiple_replicas(
+    replicas, parameters, hyperopt, parallel_models, same_trvl_per_replica
+):
     """Checks whether a runcard which is trying to run several replicas at once
     (parallel_models =/= 1) is valid
     """
     if not parallel_models:
         return
-    rp = len(replicas)
-    if hyperopt and rp > 1:
+    if len(replicas) == 1:
+        log.warning("parallel_models is set to true for only one replica")
+        return
+    if not same_trvl_per_replica:
+        raise CheckError(
+            "Replicas cannot be run in parallel with different training/validation "
+            " masks, please set `same_trvl_per_replica` to True in the runcard"
+        )
+    if hyperopt:
         raise CheckError("Running replicas in parallel with hyperopt is still not supported")
     if parameters.get("layer_type") != "dense":
         raise CheckError("Parallelization has only been tested with layer_type=='dense'")
-    if rp == 1:
-        log.warning("parallel_models is set to true for only one replica")
 
 
 @make_argcheck
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 11ec7a2c04..a28ad857f1 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -467,7 +467,7 @@ def pdfNN_layer_generator(
             to be used by Preprocessing
         out: int
             number of output flavours of the model (default 14)
-        seed: int
+        seed: list(int)
             seed to initialize the NN
         dropout: float
             rate of dropout layer by layer
diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py
index 30ba439199..332be94fc2 100644
--- a/n3fit/src/n3fit/model_trainer.py
+++ b/n3fit/src/n3fit/model_trainer.py
@@ -86,7 +86,7 @@ def __init__(
         integ_info,
         flavinfo,
         fitbasis,
-        nnseed,
+        nnseeds,
         pass_status="ok",
         failed_status="fail",
         debug=False,
@@ -109,9 +109,8 @@ def __init__(
                 the object returned by fitting['basis']
             fitbasis: str
                 the name of the basis being fitted
-            nnseed: int
-                the seed used to initialise the Neural Network, will be passed to model_gen
-                can be a list of lists of parallel_models > 1
+            nnseeds: list(int)
+                the seeds used to initialise the NN of each model, to be passed to model_gen
             pass_status: str
                 flag to signal a good run
             failed_status: str
@@ -139,7 +138,7 @@ def __init__(
             self.all_info = exp_info + pos_info
         self.flavinfo = flavinfo
         self.fitbasis = fitbasis
-        self.NNseed = nnseed
+        self._nn_seeds = nnseeds
         self.pass_status = pass_status
         self.failed_status = failed_status
         self.debug = debug
@@ -795,10 +794,10 @@ def hyperparametrizable(self, params):
         ### Training loop
         for k, partition in enumerate(self.kpartitions):
             # Each partition of the kfolding needs to have its own separate model
-            seed = self.NNseed
+            # and the seed needs to be updated accordingly
+            seeds = self._nn_seeds
             if k > 0:
-                # Update the seed
-                seed = np.random.randint(0, pow(2, 31))
+                seeds = [np.random.randint(0, pow(2, 31))]
 
             # Generate the pdf model
             pdf_models = self._generate_pdf(
@@ -809,7 +808,7 @@ def hyperparametrizable(self, params):
                 params["dropout"],
                 params.get("regularizer", None),  # regularizer optional
                 params.get("regularizer_args", None),
-                seed,
+                seeds,
             )
 
             # Model generation joins all the different observable layers
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index c5838d65ed..3baae81470 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -134,10 +134,24 @@ def performfit(
     from n3fit.model_trainer import ModelTrainer
     from n3fit.io.writer import WriterWrapper
 
-    # If models are to be run in parallel, we just need to enter the replica loop once
-    # but we need all data
+    # Note: there are three possible scenarios for the loop of replicas:
+    #   1.- Only one replica is being run, in this case the loop is only evaluated once
+    #   2.- Many replicas being run, in this case each will have a replica_number, seed, etc
+    #       and they will be fitted sequentially
+    #   3.- Many replicas being run in parallel. In this case the loop will be evaluated just once
+    #       but a model per replica will be generated
+    #
+    # In the main scenario (1) replicas_nnseed_fitting_data_dict is a list of just one element
+    # case (3) is similar but the one element of replicas_nnseed_fitting_data_dict will be modified
+    # to be (
+    #       [list of all replica idx],
+    #       one experiment with data=(replicas, ndata),
+    #       [list of all NN seeds]
+    #       )
+    #
+
     n_models = len(replicas_nnseed_fitting_data_dict)
-    if parallel_models or n_models == 1:
+    if parallel_models and n_models != 1:
         replicas, replica_experiments, nnseeds = zip(*replicas_nnseed_fitting_data_dict)
         # Parse the experiments so that the output data contain information for all replicas
         # as the only different from replica to replica is the experimental training/validation data
@@ -159,20 +173,13 @@ def performfit(
     else:
         replicas_info = replicas_nnseed_fitting_data_dict
 
-    # Note: In the basic scenario we are only running for one replica and thus this loop is only
-    # run once as replicas_nnseed_fitting_data_dict is a list of just one element
-
-    # Note: there are three possible scenarios for this loop:
-    #   1.- Only one replica is being run, in this case the loop is only evaluated once
-    #   2.- Many replicas being run, in this case each will have a replica_number, seed, etc
-    #       and they will be fitted sequentially
-    #   3.- Many replicas being run in parallel. In this case the loop will be evaluated just once
-    #       but a model per replica will be generated
-    for replica_numbers, exp_info, nnseed in replicas_info:
-        # TODO: replica_number is too restrictive as a name!
-        if not parallel_models:
-            replica_number = replica_numbers
-            log.info("Starting replica fit %s", replica_number)
+    for replica_idxs, exp_info, nnseeds in replicas_info:
+        if not parallel_models or n_models == 1:
+            # Cases 1 and 2 above are a special case of 3 where the replica idx and the seed should
+            # be a list of just one element
+            replica_idxs = [replica_idxs]
+            nnseeds = [nnseeds]
+            log.info("Starting replica fit %d", replica_idxs[0])
 
         # Generate a ModelTrainer object
         # this object holds all necessary information to train a PDF (up to the NN definition)
@@ -182,7 +189,7 @@ def performfit(
             integdatasets_fitting_integ_dict,
             basis,
             fitbasis,
-            nnseed,
+            nnseeds,
             debug=debug,
             kfold_parameters=kfold_parameters,
             max_cores=maxcores,
@@ -207,7 +214,8 @@ def performfit(
         if hyperopt:
             from n3fit.hyper_optimization.hyper_scan import hyper_scan_wrapper
 
-            replica_path_set = replica_path / f"replica_{replica_number}"
+            # Note that hyperopt will not run in parallel or with more than one model _for now_
+            replica_path_set = replica_path / f"replica_{replica_idxs[0]}"
             true_best = hyper_scan_wrapper(
                 replica_path_set, the_model_trainer, parameters, hyperscan, max_evals=hyperopt,
             )
@@ -228,10 +236,12 @@ def performfit(
         if tensorboard is not None:
             profiling = tensorboard.get("profiling", False)
             weight_freq = tensorboard.get("weight_freq", 0)
-            if parallel_models:
+            if parallel_models and n_models != 1:
+                # If using tensorboard when running in parallel
+                # dump the debugging data to the nnfit folder
                 replica_path_set = replica_path
             else:
-                replica_path_set = replica_path / f"replica_{replica_number}"
+                replica_path_set = replica_path / f"replica_{replica_idxs[0]}"
             log_path = replica_path_set / "tboard"
             the_model_trainer.enable_tensorboard(log_path, weight_freq, profiling)
 
@@ -250,7 +260,7 @@ def performfit(
         all_training_chi2, all_val_chi2, all_exp_chi2 = the_model_trainer.evaluate(stopping_object)
 
         pdf_models = result["pdf_models"]
-        for i, (replica_number, pdf_model) in enumerate(zip(replica_numbers, pdf_models)):
+        for i, (replica_number, pdf_model) in enumerate(zip(replica_idxs, pdf_models)):
             # Each model goes into its own replica folder
             replica_path_set = replica_path / f"replica_{replica_number + i}"
 

From cc182b6eec7c2172f8de4c08943d4b08a9d12a40 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Wed, 9 Jun 2021 15:18:52 +0200
Subject: [PATCH 3/6] remove offset

---
 n3fit/src/n3fit/performfit.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 3baae81470..12e4411342 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -262,7 +262,7 @@ def performfit(
         pdf_models = result["pdf_models"]
         for i, (replica_number, pdf_model) in enumerate(zip(replica_idxs, pdf_models)):
             # Each model goes into its own replica folder
-            replica_path_set = replica_path / f"replica_{replica_number + i}"
+            replica_path_set = replica_path / f"replica_{replica_number}"
 
             # Create a pdf instance
             pdf_instance = N3PDF(pdf_model, fit_basis=basis)
@@ -287,7 +287,7 @@ def performfit(
             )
             log.info(
                     "Best fit for replica #%d, chi2=%.3f (tr=%.3f, vl=%.3f)",
-                    replica_number+i,
+                    replica_number,
                     exp_chi2,
                     training_chi2,
                     val_chi2

From 5020187650bb5a100ce9ff8c71d8c0a522824e04 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Wed, 23 Jun 2021 12:36:16 +0200
Subject: [PATCH 4/6] apply comments

---
 doc/sphinx/source/n3fit/runcard_detailed.rst | 21 ++++++++++++++------
 n3fit/src/n3fit/performfit.py                |  2 +-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst
index 7a9ad9fce0..8a6e5a2828 100644
--- a/doc/sphinx/source/n3fit/runcard_detailed.rst
+++ b/doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -290,8 +290,8 @@ as well as a detailed analysis of the amount of time that TensorFlow spent on ea
 Running fits in parallel
 ------------------------
 
-It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models``
-flag in the runcard when running a range of replicas.
+It is possible to run fits in parallel with ``n3fit`` by setting the ``parallel_models``
+flag in the runcard to ``true`` when running a range of replicas.
 Running in parallel can be quite hard on memory and it is only advantageous when
 fitting on a GPU, where one can find a speed up equal to the number of models run
 in parallel (each model being a different replica).
@@ -301,8 +301,10 @@ is the output data the prediction is compared to.
 In order to ensure this is indeed the case it is necessary to also
 use the `same_trvl_per_replica` flag in the runcard.
 
-In order to run several replicas in parallel, add the following options
-to the runcard:
+In other words, in order to run several replicas in parallel on a single machine
+(be it a large multi-core CPU or, most likely, a GPU),
+it is necessary to modify the ``n3fit`` runcard by adding the following two
+top-level options:
 
 .. code-block:: yaml
 
@@ -310,14 +312,21 @@ to the runcard:
   same_trvl_per_replica: true
 
 
-And then run ``n3fit`` with the replica range to be parallelized
+And then run ``n3fit`` with a replica range to be parallelized
 (in this case from replica 1 to replica 4).
 
 .. code-block:: bash
   
    n3fit runcard.yml 1 -r 4
 
-At present it cannot be used together with the ``hyperopt`` module.
+
+On machines with more than one GPU you can select the GPU on which the code
+should run by setting the environment variable ``CUDA_VISIBLE_DEVICES``
+to the index of the desired device (usually ``0``, ``1``, ``2``, ...), or set it
+explicitly to an empty value to avoid running on a GPU: ``export CUDA_VISIBLE_DEVICES=""``
+
+
+Note that at present parallel fits cannot be used together with the ``hyperopt`` module.
 
 .. _otheroptions-label:
 
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 12e4411342..635426367c 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -165,7 +165,7 @@ def performfit(
             all_experiments[i_exp]['expdata'] = np.concatenate(training_data, axis=0)
             all_experiments[i_exp]['expdata_vl'] = np.concatenate(validation_data, axis=0)
         log.info(
-            "Starting replica fits %d to %d",
+            "Starting parallel fits from replica %d to %d",
             replicas[0],
             replicas[0] + n_models - 1,
         )

From b6426c9d93c3bc8f524f07900db229f1fdbdee63 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Wed, 23 Jun 2021 14:56:08 +0200
Subject: [PATCH 5/6] fix rebase

---
 n3fit/src/n3fit/n3fit_checks_provider.py | 5 +++--
 n3fit/src/n3fit/performfit.py            | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/n3fit/src/n3fit/n3fit_checks_provider.py b/n3fit/src/n3fit/n3fit_checks_provider.py
index af6b6972d4..b5933675cb 100644
--- a/n3fit/src/n3fit/n3fit_checks_provider.py
+++ b/n3fit/src/n3fit/n3fit_checks_provider.py
@@ -6,7 +6,7 @@
 
 import n3fit.checks
 
-@n3fit.checks.can_run_parallel_replicas
+@n3fit.checks.can_run_multiple_replicas
 @n3fit.checks.check_consistent_basis
 @n3fit.checks.wrapper_check_NN
 @n3fit.checks.wrapper_hyperopt
@@ -25,6 +25,7 @@ def n3fit_checks_action(
     hyperscan=None,
     hyperopt=None,
     tensorboard=None,
-    parallel_models=1,
+    parallel_models=False,
+    same_trvl_per_replica=False,
 ):
     return
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 635426367c..762373b1f6 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -15,7 +15,6 @@
 
 # Action to be called by validphys
 # All information defining the NN should come here in the "parameters" dict
-@n3fit.checks.can_run_multiple_replicas
 def performfit(
     *,
     n3fit_checks_action, # used for checks

From 66e0368830106988f5e3c429fc3002d9483e4b29 Mon Sep 17 00:00:00 2001
From: juacrumar <juacrumar@gmail.com>
Date: Wed, 23 Jun 2021 15:17:50 +0200
Subject: [PATCH 6/6] isolate the checks that need replicas

---
 n3fit/src/n3fit/checks.py                | 21 +++++++++++++--------
 n3fit/src/n3fit/n3fit_checks_provider.py |  4 ++--
 n3fit/src/n3fit/performfit.py            |  5 +++--
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index 40ef62edef..9957f17b66 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -348,17 +348,12 @@ def check_consistent_basis(sum_rules, fitbasis, basis, theoryid):
 
 
 @make_argcheck
-def can_run_multiple_replicas(
-    replicas, parameters, hyperopt, parallel_models, same_trvl_per_replica
-):
-    """Checks whether a runcard which is trying to run several replicas at once
-    (parallel_models =/= 1) is valid
+def check_consistent_parallel(hyperopt, parameters, parallel_models, same_trvl_per_replica):
+    """Checks whether the multiple-replica fit options are consistent among them
+    i.e., that the trvl seed is fixed, hyperopt is not on and the layer type is correct
     """
     if not parallel_models:
         return
-    if len(replicas) == 1:
-        log.warning("parallel_models is set to true for only one replica")
-        return
     if not same_trvl_per_replica:
         raise CheckError(
             "Replicas cannot be run in parallel with different training/validation "
@@ -370,6 +365,16 @@ def can_run_multiple_replicas(
         raise CheckError("Parallelization has only been tested with layer_type=='dense'")
 
 
+@make_argcheck
+def can_run_multiple_replicas(replicas, parallel_models):
+    """Warns the user if trying to run just one replica in parallel"""
+    if not parallel_models:
+        return
+    if len(replicas) == 1:
+        log.warning("parallel_models is set to true for only one replica")
+        return
+
+
 @make_argcheck
 def check_deprecated_options(fitting):
     """Checks whether the runcard is using deprecated options"""
diff --git a/n3fit/src/n3fit/n3fit_checks_provider.py b/n3fit/src/n3fit/n3fit_checks_provider.py
index b5933675cb..e8cd01c162 100644
--- a/n3fit/src/n3fit/n3fit_checks_provider.py
+++ b/n3fit/src/n3fit/n3fit_checks_provider.py
@@ -6,11 +6,11 @@
 
 import n3fit.checks
 
-@n3fit.checks.can_run_multiple_replicas
 @n3fit.checks.check_consistent_basis
 @n3fit.checks.wrapper_check_NN
 @n3fit.checks.wrapper_hyperopt
 @n3fit.checks.check_deprecated_options
+@n3fit.checks.check_consistent_parallel
 def n3fit_checks_action(
     *,
     genrep,
@@ -26,6 +26,6 @@ def n3fit_checks_action(
     hyperopt=None,
     tensorboard=None,
     parallel_models=False,
-    same_trvl_per_replica=False,
+    same_trvl_per_replica=False
 ):
     return
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 762373b1f6..da85196be6 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -15,10 +15,11 @@
 
 # Action to be called by validphys
 # All information defining the NN should come here in the "parameters" dict
+@n3fit.checks.can_run_multiple_replicas
 def performfit(
     *,
-    n3fit_checks_action, # used for checks
-    replicas, # used for checks specific to performfit
+    n3fit_checks_action, # wrapper for all checks
+    replicas, # checks specific to performfit
     replicas_nnseed_fitting_data_dict,
     posdatasets_fitting_pos_dict,
     integdatasets_fitting_integ_dict,