Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions doc/sphinx/source/n3fit/runcard_detailed.rst
Original file line number Diff line number Diff line change
Expand Up @@ -290,14 +290,43 @@ as well as a detailed analysis of the amount of time that TensorFlow spent on ea
Running fits in parallel
------------------------

It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models``
flag in the runcard (by default the number of ``parallel_models`` is set to 1).
It is possible to run fits in parallel with ``n3fit`` by setting the ``parallel_models``
flag in the runcard to ``true`` when running a range of replicas.
Running in parallel can be quite hard on memory and it is only advantageous when
fitting on a GPU, where one can find a speed up equal to the number of models run
in parallel (each model being a different replica).

At present it cannot be used together with the ``hyperopt`` module.
Running in parallel leverages the fact that the only difference between two replicas
is the output data the prediction is compared to.
In order to ensure that this is indeed the case, it is also necessary to
set the ``same_trvl_per_replica`` flag in the runcard.
Comment thread
scarlehoff marked this conversation as resolved.

In other words, in order to run several replicas in parallel on a single machine
(be it a big CPU or, most likely, a GPU)
it is necessary to modify the ``n3fit`` runcard by adding the following two
top-level options:

.. code-block:: yaml

    parallel_models: true
    same_trvl_per_replica: true


And then run ``n3fit`` with a replica range to be parallelized
(in this case from replica 1 to replica 4).

.. code-block:: bash

    n3fit runcard.yml 1 -r 4


On machines with more than one GPU you can select the GPU on which the code
should run by setting the environment variable ``CUDA_VISIBLE_DEVICES``
to the right index (usually ``0, 1, 2``) or leaving it explicitly empty
to avoid running on a GPU: ``export CUDA_VISIBLE_DEVICES=""``.


Note that at present running fits in parallel cannot be used together with the ``hyperopt`` module.

.. _otheroptions-label:

Expand Down
5 changes: 3 additions & 2 deletions n3fit/runcards/Basic_runcard_parallel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ theory:
trvlseed: 1
nnseed: 2
mcseed: 3
genrep: False # true = generate MC replicas, false = use real data
genrep: True # true = generate MC replicas, false = use real data

parameters: # This defines the parameter dictionary that is passed to the Model Trainer
nodes_per_layer: [15, 10, 8]
Expand Down Expand Up @@ -86,4 +86,5 @@ positivity:
############################################################
debug: False
maxcores: 8
parallel_models: 4
parallel_models: true
same_trvl_per_replica: true
46 changes: 22 additions & 24 deletions n3fit/src/n3fit/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,20 +265,18 @@ def check_hyperopt_stopping(stopping_dict):
if min_ep is None or max_ep is None:
raise CheckError("Need to set both the max_epochs and the min_epochs")
if min_ep < 1:
raise CheckError(f"Can't run for less than 1 epoch: " "selected min_ep = {min_ep}")
raise CheckError(f"Can't run for less than 1 epoch: selected min_ep = {min_ep}")
if max_ep <= min_ep:
raise CheckError(f"min_epochs cannot be greater than max_epochs: ({min_ep} > {max_ep})")
min_pat = stopping_dict.get("min_patience")
max_pat = stopping_dict.get("max_patience")
if min_pat is not None or max_pat is not None:
if min_pat is not None and min_pat < 0.0:
raise CheckError(
f"min_patience cannot be less than 0.0: " "selected min_pat = {min_pat}"
)
raise CheckError(f"min_patience cannot be less than 0.0: selected min_pat = {min_pat}")
if max_pat is not None:
if max_pat > 1.0:
raise CheckError(
f"max_patience cannot be greater than 1.0: " "selected max_pat = {max_pat}"
f"max_patience cannot be greater than 1.0: selected max_pat = {max_pat}"
)
if min_pat is not None and max_pat < min_pat:
raise CheckError(
Expand All @@ -292,7 +290,7 @@ def wrapper_hyperopt(hyperopt, hyperscan, genrep, data):
No check is performed if hyperopt is not active
"""
if not hyperopt:
return None
return
if genrep:
raise CheckError("Generation of replicas is not accepted during hyperoptimization")
if hyperscan is None:
Expand Down Expand Up @@ -350,33 +348,33 @@ def check_consistent_basis(sum_rules, fitbasis, basis, theoryid):


@make_argcheck
def can_run_multiple_replicas(replicas, genrep, parallel_models):
"""Checks whether a runcard which is trying to run several replicas at once
(parallel_models =/= 1) is valid
def check_consistent_parallel(hyperopt, parameters, parallel_models, same_trvl_per_replica):
"""Checks whether the multiple-replica fit options are consistent among them
i.e., that the trvl seed is fixed, hyperopt is not on and the layer type is correct
"""
rp = len(replicas)
if rp > 1 and not genrep:
if not parallel_models:
return
if not same_trvl_per_replica:
raise CheckError(
"Can't run more than one replica at once if no replicas are to be generated"
"Replicas cannot be run in parallel with different training/validation "
" masks, please set `same_trvl_per_replica` to True in the runcard"
)
if rp > 1 and parallel_models != 1:
raise CheckError("Parallel mode cannot be used together with multireplica runs")

@make_argcheck
def can_run_parallel_replicas(genrep, parameters, hyperopt, parallel_models):
"""Checks whether a runcard which is trying to run several replicas at once
(parallel_models =/= 1) is valid
"""
if parallel_models == 1:
return
if hyperopt:
raise CheckError("Running replicas in parallel with hyperopt is still not supported")
if genrep:
raise CheckError("Replica generation is not supported yet for parallel models")
if parameters.get("layer_type") != "dense":
raise CheckError("Parallelization has only been tested with layer_type=='dense'")


@make_argcheck
def can_run_multiple_replicas(replicas, parallel_models):
    """Warn when parallel mode is requested but only one replica is being run.

    This check never raises: if ``parallel_models`` is falsy nothing is
    inspected, otherwise a warning is logged when ``replicas`` holds exactly
    one entry.
    """
    # ``replicas`` is only inspected once parallel mode is known to be active
    if parallel_models and len(replicas) == 1:
        log.warning("parallel_models is set to true for only one replica")


@make_argcheck
def check_deprecated_options(fitting):
"""Checks whether the runcard is using deprecated options"""
Expand Down
75 changes: 48 additions & 27 deletions n3fit/src/n3fit/model_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
Generates the output layers as functions
# pdfNN_layer_generator:
Generates the PDF NN layer to be fitted


"""
from dataclasses import dataclass
import numpy as np
Expand All @@ -18,6 +20,7 @@
from n3fit.backends import MetaLayer, Lambda
from n3fit.backends import base_layer_selector, regularizer_selector


@dataclass
class ObservableWrapper:
"""Wrapper to generate the observable layer once the PDF model is prepared
Expand Down Expand Up @@ -53,7 +56,7 @@ def _generate_loss(self, mask=None):
return loss

def _generate_experimental_layer(self, pdf):
""" Generates the experimental layer from the PDF """
"""Generates the experimental layer from the PDF"""
# First split the layer into the different datasets (if needed!)
if len(self.dataset_xsizes) > 1:
splitting_layer = op.as_layer(
Expand All @@ -79,7 +82,9 @@ def __call__(self, pdf_layer, mask=None):
return loss_f(experiment_prediction)


def observable_generator(spec_dict, positivity_initial=1.0, integrability=False): # pylint: disable=too-many-locals
def observable_generator(
spec_dict, positivity_initial=1.0, integrability=False
): # pylint: disable=too-many-locals
"""
This function generates the observable model for each experiment.
These are models which takes as input a PDF tensor (1 x size_of_xgrid x flavours) and outputs
Expand Down Expand Up @@ -199,7 +204,6 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
model_obs_vl.append(obs_layer_vl)
model_obs_ex.append(obs_layer_ex)


full_nx = sum(dataset_xsizes)
if spec_dict["positivity"]:
out_positivity = ObservableWrapper(
Expand All @@ -212,10 +216,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
)

layer_info = {
"inputs": model_inputs,
"output_tr": out_positivity,
"experiment_xsize" : full_nx
}
"inputs": model_inputs,
"output_tr": out_positivity,
"experiment_xsize": full_nx,
}
# For positivity we end here
return layer_info

Expand Down Expand Up @@ -264,13 +268,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False)
"output_tr": out_tr,
"output_vl": out_vl,
"experiment_xsize": full_nx,
}
}
return layer_info





# Network generation functions
def generate_dense_network(
nodes_in,
Expand Down Expand Up @@ -436,6 +437,17 @@ def pdfNN_layer_generator(
Finally we output the final answer as well as the list of all generating functions
in the model for easy usage within `n3fit`.

Example
-------

>>> import numpy as np
>>> from n3fit.vpinterface import N3PDF
>>> from n3fit.model_gen import pdfNN_layer_generator
>>> from validphys.pdfgrids import xplotting_grid
>>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'cbar', 's', 'sbar']]
>>> fake_x = np.linspace(1e-3,0.8,3)
>>> pdf_model = pdfNN_layer_generator(nodes=[8], activations=['linear'], seed=[2,3], flav_info=fake_fl, parallel_models=2)

Parameters
----------
inp: int
Expand All @@ -455,15 +467,15 @@ def pdfNN_layer_generator(
to be used by Preprocessing
out: int
number of output flavours of the model (default 14)
seed: int
seed: list(int)
seed to initialize the NN
dropout: float
rate of dropout layer by layer
impose_sumrule: str
whether to impose sumrule on the output pdf model and which one to impose (All, MSR, VSR)
whether to impose sumrules on the output pdf and which one to impose (All, MSR, VSR)
scaler: scaler
Function to apply to the input. If given the input to the model
will be a (1, None, 2) tensor where dim [:,:,0] is scaled
will be a (1, None, 2) tensor where dim [:,:,0] is scaled
parallel_models: int
How many models should be trained in parallel

Expand All @@ -473,6 +485,11 @@ def pdfNN_layer_generator(
a model f(x) = y where x is a tensor (1, xgrid, 1) and y a tensor (1, xgrid, out)
"""
# Parse the input configuration
if seed is None:
seed = parallel_models * [None]
elif isinstance(seed, int):
seed = parallel_models * [seed]

if nodes is None:
nodes = [15, 8]
ln = len(nodes)
Expand All @@ -492,7 +509,7 @@ def pdfNN_layer_generator(

number_of_layers = len(nodes)
# The number of nodes in the last layer is equal to the number of fitted flavours
last_layer_nodes = nodes[-1] # (== len(flav_info))
last_layer_nodes = nodes[-1] # (== len(flav_info))

# Generate the generic layers that will not depend on extra considerations

Expand All @@ -506,18 +523,18 @@ def pdfNN_layer_generator(
# TODO: make it its own option (i.e., one could want to use this without using scaler)
if scaler:
# change the input domain [0,1] -> [-1,1]
process_input = Lambda(lambda x: 2*x-1)
process_input = Lambda(lambda x: 2 * x - 1)
subtract_one = True
input_x_eq_1 = scaler([1.0])[0]
placeholder_input = Input(shape=(None, 2), batch_size=1)
elif inp==2:
elif inp == 2:
# If the input is of type (x, logx)
# create an x --> (x, logx) layer to prepend to everything
process_input = Lambda(lambda x: op.concatenate([x, op.op_log(x)], axis=-1))

model_input = [placeholder_input]
if subtract_one:
layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1,1))
layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1, 1))
model_input.append(layer_x_eq_1)

# Evolution layer
Expand All @@ -533,12 +550,9 @@ def pdfNN_layer_generator(
else:
sumrule_layer = lambda x: x


# Now we need a trainable network per model to be trained in parallel
pdf_models = []
for i in range(parallel_models):
# Move the seed
layer_seed = seed + i * number_of_layers
for i, layer_seed in enumerate(seed):
if layer_type == "dense":
reg = regularizer_selector(regularizer, **regularizer_args)
list_of_pdf_layers = generate_dense_network(
Expand All @@ -555,7 +569,12 @@ def pdfNN_layer_generator(
# TODO: this information should come from the basis information
# once the basis information is passed to this class
list_of_pdf_layers = generate_dense_per_flavour_network(
inp, nodes, activations, initializer_name, seed=layer_seed, basis_size=last_layer_nodes,
inp,
nodes,
activations,
initializer_name,
seed=layer_seed,
basis_size=last_layer_nodes,
)

def dense_me(x):
Expand All @@ -568,18 +587,18 @@ def dense_me(x):
curr_fun = dense_layer(curr_fun)
return curr_fun

preproseed = layer_seed + number_of_layers * (i + 1)
preproseed = layer_seed + number_of_layers
layer_preproc = Preprocessing(
flav_info=flav_info,
input_shape=(1,),
name=f"pdf_prepro_{i}",
seed=preproseed,
large_x = not subtract_one
large_x=not subtract_one,
)

# Apply preprocessing and basis
def layer_fitbasis(x):
""" The tensor x has a expected shape of (1, None, {1,2})
"""The tensor x has a expected shape of (1, None, {1,2})
where x[...,0] corresponds to the feature_scaled input and x[...,-1] the original input
"""
x_scaled = op.op_gather_keep_dims(x, 0, axis=-1)
Expand All @@ -604,6 +623,8 @@ def layer_pdf(x):
final_pdf = sumrule_layer(layer_pdf)

# Create the model
pdf_model = MetaModel(model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler)
pdf_model = MetaModel(
model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler
)
pdf_models.append(pdf_model)
return pdf_models
Loading