NNPDF · scarrazza · Jun 23, 2021 · Jun 1, 2021 · Jun 1, 2021 · Jun 9, 2021
diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -290,14 +290,43 @@ as well as a detailed analysis of the amount of time that TensorFlow spent on ea
 Running fits in parallel
 ------------------------
 
-It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models``
-flag in the runcard (by default the number of ``parallel_models`` is set to 1).
+It is possible to run fits in parallel with ``n3fit`` by setting the ``parallel_models``
+flag in the runcard to ``true`` when running a range of replicas.
 Running in parallel can be quite hard on memory and it is only advantageous when
 fitting on a GPU, where one can find a speed up equal to the number of models run
 in parallel (each model being a different replica).
 
-At present it cannot be used together with the ``hyperopt`` module.
+Running in parallel leverages the fact that the only difference between two replicas
+is the output data the prediction is compared to.
+To ensure that this is the case (i.e., all replicas share the same training/validation
+masks), the ``same_trvl_per_replica`` flag must also be set to ``true`` in the runcard.
 
+In other words, in order to run several replicas in parallel on a single machine
+(be it a big CPU or, most likely, a GPU),
+it is necessary to modify the ``n3fit`` runcard by adding the following two
+top-level options:
+
+.. code-block:: yaml
+
+  parallel_models: true
+  same_trvl_per_replica: true
+
+
+Then run ``n3fit`` with the range of replicas to be parallelized
+(in this case from replica 1 to replica 4):
+
+.. code-block:: bash
+
+   n3fit runcard.yml 1 -r 4
+
+
+On machines with more than one GPU you can select the GPU on which the code
+should run by setting the environment variable ``CUDA_VISIBLE_DEVICES``
+to the corresponding device index (e.g. ``0``, ``1`` or ``2``), or set it to an
+empty string to avoid running on a GPU at all: ``export CUDA_VISIBLE_DEVICES=""``
+
+
+Note that, at present, running fits in parallel cannot be used together with the ``hyperopt`` module.
 
 .. _otheroptions-label:
 

diff --git a/n3fit/runcards/Basic_runcard_parallel.yml b/n3fit/runcards/Basic_runcard_parallel.yml
@@ -38,7 +38,7 @@ theory:
 trvlseed: 1
 nnseed: 2
 mcseed: 3
-genrep: False     # true = generate MC replicas, false = use real data
+genrep: True     # true = generate MC replicas, false = use real data
 
 parameters: # This defines the parameter dictionary that is passed to the Model Trainer
   nodes_per_layer: [15, 10, 8]
@@ -86,4 +86,5 @@ positivity:
 ############################################################
 debug: False
 maxcores: 8
-parallel_models: 4
+parallel_models: true
+same_trvl_per_replica: true
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
@@ -265,20 +265,18 @@ def check_hyperopt_stopping(stopping_dict):
         if min_ep is None or max_ep is None:
             raise CheckError("Need to set both the max_epochs and the min_epochs")
         if min_ep < 1:
-            raise CheckError(f"Can't run for less than 1 epoch: " "selected min_ep = {min_ep}")
+            raise CheckError(f"Can't run for less than 1 epoch: selected min_ep = {min_ep}")
         if max_ep <= min_ep:
             raise CheckError(f"min_epochs cannot be greater than max_epochs: ({min_ep} > {max_ep})")
     min_pat = stopping_dict.get("min_patience")
     max_pat = stopping_dict.get("max_patience")
     if min_pat is not None or max_pat is not None:
         if min_pat is not None and min_pat < 0.0:
-            raise CheckError(
-                f"min_patience cannot be less than 0.0: " "selected min_pat = {min_pat}"
-            )
+            raise CheckError(f"min_patience cannot be less than 0.0: selected min_pat = {min_pat}")
         if max_pat is not None:
             if max_pat > 1.0:
                 raise CheckError(
-                    f"max_patience cannot be greater than 1.0: " "selected max_pat = {max_pat}"
+                    f"max_patience cannot be greater than 1.0: selected max_pat = {max_pat}"
                 )
             if min_pat is not None and max_pat < min_pat:
                 raise CheckError(
@@ -292,7 +290,7 @@ def wrapper_hyperopt(hyperopt, hyperscan, genrep, data):
     No check is performed if hyperopt is not active
     """
     if not hyperopt:
-        return None
+        return
     if genrep:
         raise CheckError("Generation of replicas is not accepted during hyperoptimization")
     if hyperscan is None:
@@ -350,33 +348,33 @@ def check_consistent_basis(sum_rules, fitbasis, basis, theoryid):
 
 
 @make_argcheck
-def can_run_multiple_replicas(replicas, genrep, parallel_models):
-    """Checks whether a runcard which is trying to run several replicas at once
-    (parallel_models =/= 1) is valid
+def check_consistent_parallel(hyperopt, parameters, parallel_models, same_trvl_per_replica):
+    """Checks whether the multiple-replica fit options are consistent among them
+    i.e., that the trvl seed is fixed, hyperopt is not on and the layer type is correct
     """
-    rp = len(replicas)
-    if rp > 1 and not genrep:
+    if not parallel_models:
+        return
+    if not same_trvl_per_replica:
         raise CheckError(
-            "Can't run more than one replica at once if no replicas are to be generated"
+            "Replicas cannot be run in parallel with different training/validation "
+            " masks, please set `same_trvl_per_replica` to True in the runcard"
         )
-    if rp > 1 and parallel_models != 1:
-        raise CheckError("Parallel mode cannot be used together with multireplica runs")
-
-@make_argcheck
-def can_run_parallel_replicas(genrep, parameters, hyperopt, parallel_models):
-    """Checks whether a runcard which is trying to run several replicas at once
-    (parallel_models =/= 1) is valid
-    """
-    if parallel_models == 1:
-        return
     if hyperopt:
         raise CheckError("Running replicas in parallel with hyperopt is still not supported")
-    if genrep:
-        raise CheckError("Replica generation is not supported yet for parallel models")
     if parameters.get("layer_type") != "dense":
         raise CheckError("Parallelization has only been tested with layer_type=='dense'")
 
 
+@make_argcheck
+def can_run_multiple_replicas(replicas, parallel_models):
+    """Warns the user if trying to run just one replica in parallel"""
+    if not parallel_models:
+        return
+    if len(replicas) == 1:
+        log.warning("parallel_models is set to true for only one replica")
+        return
+
+
 @make_argcheck
 def check_deprecated_options(fitting):
     """Checks whether the runcard is using deprecated options"""

diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
@@ -6,6 +6,8 @@
             Generates the output layers as functions
         # pdfNN_layer_generator:
             Generates the PDF NN layer to be fitted
+
+
 """
 from dataclasses import dataclass
 import numpy as np
@@ -18,6 +20,7 @@
 from n3fit.backends import MetaLayer, Lambda
 from n3fit.backends import base_layer_selector, regularizer_selector
 
+
 @dataclass
 class ObservableWrapper:
     """Wrapper to generate the observable layer once the PDF model is prepared
@@ -53,7 +56,7 @@ def _generate_loss(self, mask=None):
         return loss
 
     def _generate_experimental_layer(self, pdf):
-        """ Generates the experimental layer from the PDF """
+        """Generates the experimental layer from the PDF"""
         # First split the layer into the different datasets (if needed!)
         if len(self.dataset_xsizes) > 1:
             splitting_layer = op.as_layer(
@@ -79,7 +82,9 @@ def __call__(self, pdf_layer, mask=None):
         return loss_f(experiment_prediction)
 
 
-def observable_generator(spec_dict, positivity_initial=1.0, integrability=False):  # pylint: disable=too-many-locals
+def observable_generator(
+    spec_dict, positivity_initial=1.0, integrability=False
+):  # pylint: disable=too-many-locals
     """
     This function generates the observable model for each experiment.
     These are models which takes as input a PDF tensor (1 x size_of_xgrid x flavours) and outputs
@@ -199,7 +204,6 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         model_obs_vl.append(obs_layer_vl)
         model_obs_ex.append(obs_layer_ex)
 
-
     full_nx = sum(dataset_xsizes)
     if spec_dict["positivity"]:
         out_positivity = ObservableWrapper(
@@ -212,10 +216,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         )
 
         layer_info = {
-                "inputs": model_inputs,
-                "output_tr": out_positivity,
-                "experiment_xsize" : full_nx
-                }
+            "inputs": model_inputs,
+            "output_tr": out_positivity,
+            "experiment_xsize": full_nx,
+        }
         # For positivity we end here
         return layer_info
 
@@ -264,13 +268,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
         "output_tr": out_tr,
         "output_vl": out_vl,
         "experiment_xsize": full_nx,
-        }
+    }
     return layer_info
 
 
-
-
-
 # Network generation functions
 def generate_dense_network(
     nodes_in,
@@ -436,6 +437,17 @@ def pdfNN_layer_generator(
     Finally we output the final answer as well as the list of all generating functions
     in the model for easy usage within `n3fit`.
 
+    Example
+    -------
+
+    >>> import numpy as np
+    >>> from n3fit.vpinterface import N3PDF
+    >>> from n3fit.model_gen import pdfNN_layer_generator
+    >>> from validphys.pdfgrids import xplotting_grid
+    >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'cbar', 's', 'sbar']]
+    >>> fake_x = np.linspace(1e-3,0.8,3)
+    >>> pdf_model = pdfNN_layer_generator(nodes=[8], activations=['linear'], seed=[2,3], flav_info=fake_fl, parallel_models=2)
+
     Parameters
     ----------
         inp: int
@@ -455,15 +467,15 @@ def pdfNN_layer_generator(
             to be used by Preprocessing
         out: int
             number of output flavours of the model (default 14)
-        seed: int
+        seed: list(int)
             seed to initialize the NN
         dropout: float
             rate of dropout layer by layer
         impose_sumrule: str
-            whether to impose sumrule on the output pdf model and which one to impose (All, MSR, VSR)
+            whether to impose sumrules on the output pdf and which one to impose (All, MSR, VSR)
         scaler: scaler
             Function to apply to the input. If given the input to the model
-            will be a (1, None, 2) tensor where dim [:,:,0] is scaled 
+            will be a (1, None, 2) tensor where dim [:,:,0] is scaled
         parallel_models: int
             How many models should be trained in parallel
 
@@ -473,6 +485,11 @@ def pdfNN_layer_generator(
             a model f(x) = y where x is a tensor (1, xgrid, 1) and y a tensor (1, xgrid, out)
     """
     # Parse the input configuration
+    if seed is None:
+        seed = parallel_models * [None]
+    elif isinstance(seed, int):
+        seed = parallel_models * [seed]
+
     if nodes is None:
         nodes = [15, 8]
     ln = len(nodes)
@@ -492,7 +509,7 @@ def pdfNN_layer_generator(
 
     number_of_layers = len(nodes)
     # The number of nodes in the last layer is equal to the number of fitted flavours
-    last_layer_nodes = nodes[-1] # (== len(flav_info))
+    last_layer_nodes = nodes[-1]  # (== len(flav_info))
 
     # Generate the generic layers that will not depend on extra considerations
 
@@ -506,18 +523,18 @@ def pdfNN_layer_generator(
     # TODO: make it its own option (i.e., one could want to use this without using scaler)
     if scaler:
         # change the input domain [0,1] -> [-1,1]
-        process_input = Lambda(lambda  x: 2*x-1)
+        process_input = Lambda(lambda x: 2 * x - 1)
         subtract_one = True
         input_x_eq_1 = scaler([1.0])[0]
         placeholder_input = Input(shape=(None, 2), batch_size=1)
-    elif inp==2:
+    elif inp == 2:
         # If the input is of type (x, logx)
         # create a x --> (x, logx) layer to preppend to everything
         process_input = Lambda(lambda x: op.concatenate([x, op.op_log(x)], axis=-1))
 
     model_input = [placeholder_input]
     if subtract_one:
-        layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1,1))
+        layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1, 1))
         model_input.append(layer_x_eq_1)
 
     # Evolution layer
@@ -533,12 +550,9 @@ def pdfNN_layer_generator(
     else:
         sumrule_layer = lambda x: x
 
-
     # Now we need a trainable network per model to be trained in parallel
     pdf_models = []
-    for i in range(parallel_models):
-        # Move the seed
-        layer_seed = seed + i * number_of_layers
+    for i, layer_seed in enumerate(seed):
         if layer_type == "dense":
             reg = regularizer_selector(regularizer, **regularizer_args)
             list_of_pdf_layers = generate_dense_network(
@@ -555,7 +569,12 @@ def pdfNN_layer_generator(
             # TODO: this information should come from the basis information
             #       once the basis information is passed to this class
             list_of_pdf_layers = generate_dense_per_flavour_network(
-                inp, nodes, activations, initializer_name, seed=layer_seed, basis_size=last_layer_nodes,
+                inp,
+                nodes,
+                activations,
+                initializer_name,
+                seed=layer_seed,
+                basis_size=last_layer_nodes,
             )
 
         def dense_me(x):
@@ -568,18 +587,18 @@ def dense_me(x):
                 curr_fun = dense_layer(curr_fun)
             return curr_fun
 
-        preproseed = layer_seed + number_of_layers * (i + 1)
+        preproseed = layer_seed + number_of_layers
         layer_preproc = Preprocessing(
             flav_info=flav_info,
             input_shape=(1,),
             name=f"pdf_prepro_{i}",
             seed=preproseed,
-            large_x = not subtract_one
+            large_x=not subtract_one,
         )
 
         # Apply preprocessing and basis
         def layer_fitbasis(x):
-            """ The tensor x has a expected shape of (1, None, {1,2})
+            """The tensor x has a expected shape of (1, None, {1,2})
             where x[...,0] corresponds to the feature_scaled input and x[...,-1] the original input
             """
             x_scaled = op.op_gather_keep_dims(x, 0, axis=-1)
@@ -604,6 +623,8 @@ def layer_pdf(x):
         final_pdf = sumrule_layer(layer_pdf)
 
         # Create the model
-        pdf_model = MetaModel(model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler)
+        pdf_model = MetaModel(
+            model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler
+        )
         pdf_models.append(pdf_model)
     return pdf_models