From 2517a0948b9ad187c53d7f5f6b8944aba2ffa831 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Tue, 1 Jun 2021 09:53:37 +0200 Subject: [PATCH 1/6] parallel replicas now accept genrep=true --- n3fit/runcards/Basic_runcard_parallel.yml | 5 +- n3fit/src/n3fit/checks.py | 22 ++----- n3fit/src/n3fit/model_gen.py | 73 +++++++++++++++-------- n3fit/src/n3fit/model_trainer.py | 2 +- n3fit/src/n3fit/performfit.py | 62 ++++++++++++++----- 5 files changed, 104 insertions(+), 60 deletions(-) diff --git a/n3fit/runcards/Basic_runcard_parallel.yml b/n3fit/runcards/Basic_runcard_parallel.yml index ff64063412..5d35e50bd0 100644 --- a/n3fit/runcards/Basic_runcard_parallel.yml +++ b/n3fit/runcards/Basic_runcard_parallel.yml @@ -38,7 +38,7 @@ theory: trvlseed: 1 nnseed: 2 mcseed: 3 -genrep: False # true = generate MC replicas, false = use real data +genrep: True # true = generate MC replicas, false = use real data parameters: # This defines the parameter dictionary that is passed to the Model Trainer nodes_per_layer: [15, 10, 8] @@ -86,4 +86,5 @@ positivity: ############################################################ debug: False maxcores: 8 -parallel_models: 4 +parallel_models: true +same_trvl_per_replica: true diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py index 4774510af2..4d36d5c858 100644 --- a/n3fit/src/n3fit/checks.py +++ b/n3fit/src/n3fit/checks.py @@ -354,27 +354,15 @@ def can_run_multiple_replicas(replicas, genrep, parallel_models): """Checks whether a runcard which is trying to run several replicas at once (parallel_models =/= 1) is valid """ - rp = len(replicas) - if rp > 1 and not genrep: - raise CheckError( - "Can't run more than one replica at once if no replicas are to be generated" - ) - if rp > 1 and parallel_models != 1: - raise CheckError("Parallel mode cannot be used together with multireplica runs") - -@make_argcheck -def can_run_parallel_replicas(genrep, parameters, hyperopt, parallel_models): - """Checks whether a runcard which is trying to run several replicas at once - (parallel_models =/= 1) is valid - """ - if parallel_models == 1: + if not parallel_models: return - if hyperopt: + rp = len(replicas) + if hyperopt and rp > 1: raise CheckError("Running replicas in parallel with hyperopt is still not supported") - if genrep: - raise CheckError("Replica generation is not supported yet for parallel models") if parameters.get("layer_type") != "dense": raise CheckError("Parallelization has only been tested with layer_type=='dense'") + if rp == 1: + log.warning("parallel_models is set to true for only one replica") @make_argcheck diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py index b85a5835a2..11ec7a2c04 100644 --- a/n3fit/src/n3fit/model_gen.py +++ b/n3fit/src/n3fit/model_gen.py @@ -6,6 +6,8 @@ Generates the output layers as functions # pdfNN_layer_generator: Generates the PDF NN layer to be fitted + + """ from dataclasses import dataclass import numpy as np @@ -18,6 +20,7 @@ from n3fit.backends import MetaLayer, Lambda from n3fit.backends import base_layer_selector, regularizer_selector + @dataclass class ObservableWrapper: """Wrapper to generate the observable layer once the PDF model is prepared @@ -53,7 +56,7 @@ def _generate_loss(self, mask=None): return loss def _generate_experimental_layer(self, pdf): - """ Generates the experimental layer from the PDF """ + """Generates the experimental layer from the PDF""" # First split the layer into the different datasets (if needed!) if len(self.dataset_xsizes) > 1: splitting_layer = op.as_layer( @@ -79,7 +82,9 @@ def __call__(self, pdf_layer, mask=None): return loss_f(experiment_prediction) -def observable_generator(spec_dict, positivity_initial=1.0, integrability=False): # pylint: disable=too-many-locals +def observable_generator( + spec_dict, positivity_initial=1.0, integrability=False +): # pylint: disable=too-many-locals """ This function generates the observable model for each experiment. These are models which takes as input a PDF tensor (1 x size_of_xgrid x flavours) and outputs @@ -199,7 +204,6 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False) model_obs_vl.append(obs_layer_vl) model_obs_ex.append(obs_layer_ex) - full_nx = sum(dataset_xsizes) if spec_dict["positivity"]: out_positivity = ObservableWrapper( @@ -212,10 +216,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False) ) layer_info = { - "inputs": model_inputs, - "output_tr": out_positivity, - "experiment_xsize" : full_nx - } + "inputs": model_inputs, + "output_tr": out_positivity, + "experiment_xsize": full_nx, + } # For positivity we end here return layer_info @@ -264,13 +268,10 @@ def observable_generator(spec_dict, positivity_initial=1.0, integrability=False) "output_tr": out_tr, "output_vl": out_vl, "experiment_xsize": full_nx, - } + } return layer_info - - - # Network generation functions def generate_dense_network( nodes_in, @@ -436,6 +437,17 @@ def pdfNN_layer_generator( Finally we output the final answer as well as the list of all generating functions in the model for easy usage within `n3fit`. + Example + ------- + + >>> import numpy as np + >>> from n3fit.vpinterface import N3PDF + >>> from n3fit.model_gen import pdfNN_layer_generator + >>> from validphys.pdfgrids import xplotting_grid + >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'cbar', 's', 'sbar']] + >>> fake_x = np.linspace(1e-3,0.8,3) + >>> pdf_model = pdfNN_layer_generator(nodes=[8], activations=['linear'], seed=[2,3], flav_info=fake_fl, parallel_models=2) + Parameters ---------- inp: int @@ -460,10 +472,10 @@ def pdfNN_layer_generator( dropout: float rate of dropout layer by layer impose_sumrule: str - whether to impose sumrule on the output pdf model and which one to impose (All, MSR, VSR) + whether to impose sumrules on the output pdf and which one to impose (All, MSR, VSR) scaler: scaler Function to apply to the input. If given the input to the model - will be a (1, None, 2) tensor where dim [:,:,0] is scaled + will be a (1, None, 2) tensor where dim [:,:,0] is scaled parallel_models: int How many models should be trained in parallel @@ -473,6 +485,11 @@ def pdfNN_layer_generator( a model f(x) = y where x is a tensor (1, xgrid, 1) and y a tensor (1, xgrid, out) """ # Parse the input configuration + if seed is None: + seed = parallel_models * [None] + elif isinstance(seed, int): + seed = parallel_models * [seed] + if nodes is None: nodes = [15, 8] ln = len(nodes) @@ -492,7 +509,7 @@ def pdfNN_layer_generator( number_of_layers = len(nodes) # The number of nodes in the last layer is equal to the number of fitted flavours - last_layer_nodes = nodes[-1] # (== len(flav_info)) + last_layer_nodes = nodes[-1] # (== len(flav_info)) # Generate the generic layers that will not depend on extra considerations @@ -506,18 +523,18 @@ def pdfNN_layer_generator( # TODO: make it its own option (i.e., one could want to use this without using scaler) if scaler: # change the input domain [0,1] -> [-1,1] - process_input = Lambda(lambda x: 2*x-1) + process_input = Lambda(lambda x: 2 * x - 1) subtract_one = True input_x_eq_1 = scaler([1.0])[0] placeholder_input = Input(shape=(None, 2), batch_size=1) - elif inp==2: + elif inp == 2: # If the input is of type (x, logx) # create a x --> (x, logx) layer to preppend to everything process_input = Lambda(lambda x: op.concatenate([x, op.op_log(x)], axis=-1)) model_input = [placeholder_input] if subtract_one: - layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1,1)) + layer_x_eq_1 = op.numpy_to_input(np.array(input_x_eq_1).reshape(1, 1)) model_input.append(layer_x_eq_1) # Evolution layer @@ -533,12 +550,9 @@ def pdfNN_layer_generator( else: sumrule_layer = lambda x: x - # Now we need a trainable network per model to be trained in parallel pdf_models = [] - for i in range(parallel_models): - # Move the seed - layer_seed = seed + i * number_of_layers + for i, layer_seed in enumerate(seed): if layer_type == "dense": reg = regularizer_selector(regularizer, **regularizer_args) list_of_pdf_layers = generate_dense_network( @@ -555,7 +569,12 @@ def pdfNN_layer_generator( # TODO: this information should come from the basis information # once the basis information is passed to this class list_of_pdf_layers = generate_dense_per_flavour_network( - inp, nodes, activations, initializer_name, seed=layer_seed, basis_size=last_layer_nodes, + inp, + nodes, + activations, + initializer_name, + seed=layer_seed, + basis_size=last_layer_nodes, ) def dense_me(x): @@ -568,18 +587,18 @@ def dense_me(x): curr_fun = dense_layer(curr_fun) return curr_fun - preproseed = layer_seed + number_of_layers * (i + 1) + preproseed = layer_seed + number_of_layers layer_preproc = Preprocessing( flav_info=flav_info, input_shape=(1,), name=f"pdf_prepro_{i}", seed=preproseed, - large_x = not subtract_one + large_x=not subtract_one, ) # Apply preprocessing and basis def layer_fitbasis(x): - """ The tensor x has a expected shape of (1, None, {1,2}) + """The tensor x has a expected shape of (1, None, {1,2}) where x[...,0] corresponds to the feature_scaled input and x[...,-1] the original input """ x_scaled = op.op_gather_keep_dims(x, 0, axis=-1) @@ -604,6 +623,8 @@ def layer_pdf(x): final_pdf = sumrule_layer(layer_pdf) # Create the model - pdf_model = MetaModel(model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler) + pdf_model = MetaModel( + model_input, final_pdf(placeholder_input), name=f"PDF_{i}", scaler=scaler + ) pdf_models.append(pdf_model) return pdf_models diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 853983cad1..30ba439199 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -111,6 +111,7 @@ def __init__( the name of the basis being fitted nnseed: int the seed used to initialise the Neural Network, will be passed to model_gen + can be a list of lists of parallel_models > 1 pass_status: str flag to signal a good run failed_status: str @@ -128,7 +129,6 @@ def __init__( parallel_models: int number of models to fit in parallel """ - # Save all input information self.exp_info = exp_info self.pos_info = pos_info diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 4cec1c751a..c5838d65ed 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -3,6 +3,8 @@ """ # Backend-independent imports +from collections import namedtuple +import copy import logging import numpy as np import n3fit.checks @@ -36,7 +38,7 @@ def performfit( tensorboard=None, debug=False, maxcores=None, - parallel_models=1 + parallel_models=False ): """ This action will (upon having read a validcard) process a full PDF fit @@ -115,8 +117,8 @@ def performfit( activate some debug options maxcores: int maximum number of (logical) cores that the backend should be aware of - parallel_models: int - number of models to be run in parallel + parallel_models: bool + whether to run models in parallel """ from n3fit.backends import set_initial_state @@ -132,18 +134,45 @@ def performfit( from n3fit.model_trainer import ModelTrainer from n3fit.io.writer import WriterWrapper + # If models are to be run in parallel, we just need to enter the replica loop once + # but we need all data + n_models = len(replicas_nnseed_fitting_data_dict) + if parallel_models or n_models == 1: + replicas, replica_experiments, nnseeds = zip(*replicas_nnseed_fitting_data_dict) + # Parse the experiments so that the output data contain information for all replicas + # as the only different from replica to replica is the experimental training/validation data + all_experiments = copy.deepcopy(replica_experiments[0]) + for i_exp in range(len(all_experiments)): + training_data = [] + validation_data = [] + for i_rep in range(n_models): + training_data.append(replica_experiments[i_rep][i_exp]['expdata']) + validation_data.append(replica_experiments[i_rep][i_exp]['expdata_vl']) + all_experiments[i_exp]['expdata'] = np.concatenate(training_data, axis=0) + all_experiments[i_exp]['expdata_vl'] = np.concatenate(validation_data, axis=0) + log.info( + "Starting replica fits %d to %d", + replicas[0], + replicas[0] + n_models - 1, + ) + replicas_info = [(replicas, all_experiments, nnseeds)] + else: + replicas_info = replicas_nnseed_fitting_data_dict + # Note: In the basic scenario we are only running for one replica and thus this loop is only # run once as replicas_nnseed_fitting_data_dict is a list of just one element - for replica_number, exp_info, nnseed in replicas_nnseed_fitting_data_dict: - replica_path_set = replica_path / f"replica_{replica_number}" - if parallel_models == 1: + + # Note: there are three possible scenarios for this loop: + # 1.- Only one replica is being run, in this case the loop is only evaluated once + # 2.- Many replicas being run, in this case each will have a replica_number, seed, etc + # and they will be fitted sequentially + # 3.- Many replicas being run in parallel. In this case the loop will be evaluated just once + # but a model per replica will be generated + for replica_numbers, exp_info, nnseed in replicas_info: + # TODO: replica_number is too restrictive as a name! + if not parallel_models: + replica_number = replica_numbers log.info("Starting replica fit %s", replica_number) - else: - log.info( - "Starting replica fits %s to %s", - replica_number, - replica_number + parallel_models - 1, - ) # Generate a ModelTrainer object # this object holds all necessary information to train a PDF (up to the NN definition) @@ -159,7 +188,7 @@ def performfit( max_cores=maxcores, model_file=load, sum_rules=sum_rules, - parallel_models=parallel_models + parallel_models=n_models ) # This is just to give a descriptive name to the fit function @@ -178,6 +207,7 @@ def performfit( if hyperopt: from n3fit.hyper_optimization.hyper_scan import hyper_scan_wrapper + replica_path_set = replica_path / f"replica_{replica_number}" true_best = hyper_scan_wrapper( replica_path_set, the_model_trainer, parameters, hyperscan, max_evals=hyperopt, ) @@ -198,6 +228,10 @@ def performfit( if tensorboard is not None: profiling = tensorboard.get("profiling", False) weight_freq = tensorboard.get("weight_freq", 0) + if parallel_models: + replica_path_set = replica_path + else: + replica_path_set = replica_path / f"replica_{replica_number}" log_path = replica_path_set / "tboard" the_model_trainer.enable_tensorboard(log_path, weight_freq, profiling) @@ -216,7 +250,7 @@ def performfit( all_training_chi2, all_val_chi2, all_exp_chi2 = the_model_trainer.evaluate(stopping_object) pdf_models = result["pdf_models"] - for i, pdf_model in enumerate(pdf_models): + for i, (replica_number, pdf_model) in enumerate(zip(replica_numbers, pdf_models)): # Each model goes into its own replica folder replica_path_set = replica_path / f"replica_{replica_number + i}" From 59eb42d1f3d5481d8bc538b3a80f2c90ad567a59 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Tue, 1 Jun 2021 12:57:20 +0200 Subject: [PATCH 2/6] change docs, beautify --- doc/sphinx/source/n3fit/runcard_detailed.rst | 24 ++++++++- n3fit/src/n3fit/checks.py | 27 ++++++---- n3fit/src/n3fit/model_gen.py | 2 +- n3fit/src/n3fit/model_trainer.py | 17 +++--- n3fit/src/n3fit/performfit.py | 54 ++++++++++++-------- 5 files changed, 79 insertions(+), 45 deletions(-) diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst index 091819644a..7a9ad9fce0 100644 --- a/doc/sphinx/source/n3fit/runcard_detailed.rst +++ b/doc/sphinx/source/n3fit/runcard_detailed.rst @@ -291,14 +291,34 @@ Running fits in parallel ------------------------ It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models`` -flag in the runcard (by default the number of ``parallel_models`` is set to 1). +flag in the runcard when running a range of replicas. Running in parallel can be quite hard on memory and it is only advantageous when fitting on a GPU, where one can find a speed up equal to the number of models run in parallel (each model being a different replica). -At present it cannot be used together with the ``hyperopt`` module. +Running in parallel leverages the fact that the only difference between two replicas +is the output data the prediction is compared to. +In order to ensure this is indeed the case it is necessary to also +use the `same_trvl_per_replica` flag in the runcard. + +In order to run several replicas in parallel, add the following options +to the runcard: + +.. code-block:: yaml + + parallel_models: true + same_trvl_per_replica: true +And then run ``n3fit`` with the replica range to be parallelized +(in this case from replica 1 to replica 4). + +.. code-block:: bash + + n3fit runcard.yml 1 -r 4 + +At present it cannot be used together with the ``hyperopt`` module. + .. _otheroptions-label: Other options diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py index 4d36d5c858..40ef62edef 100644 --- a/n3fit/src/n3fit/checks.py +++ b/n3fit/src/n3fit/checks.py @@ -265,20 +265,18 @@ def check_hyperopt_stopping(stopping_dict): if min_ep is None or max_ep is None: raise CheckError("Need to set both the max_epochs and the min_epochs") if min_ep < 1: - raise CheckError(f"Can't run for less than 1 epoch: " "selected min_ep = {min_ep}") + raise CheckError(f"Can't run for less than 1 epoch: selected min_ep = {min_ep}") if max_ep <= min_ep: raise CheckError(f"min_epochs cannot be greater than max_epochs: ({min_ep} > {max_ep})") min_pat = stopping_dict.get("min_patience") max_pat = stopping_dict.get("max_patience") if min_pat is not None or max_pat is not None: if min_pat is not None and min_pat < 0.0: - raise CheckError( - f"min_patience cannot be less than 0.0: " "selected min_pat = {min_pat}" - ) + raise CheckError(f"min_patience cannot be less than 0.0: selected min_pat = {min_pat}") if max_pat is not None: if max_pat > 1.0: raise CheckError( - f"max_patience cannot be greater than 1.0: " "selected max_pat = {max_pat}" + f"max_patience cannot be greater than 1.0: selected max_pat = {max_pat}" ) if min_pat is not None and max_pat < min_pat: raise CheckError( @@ -292,7 +290,7 @@ def wrapper_hyperopt(hyperopt, hyperscan, genrep, data): No check is performed if hyperopt is not active """ if not hyperopt: - return None + return if genrep: raise CheckError("Generation of replicas is not accepted during hyperoptimization") if hyperscan is None: @@ -350,19 +348,26 @@ def check_consistent_basis(sum_rules, fitbasis, basis, theoryid): @make_argcheck -def can_run_multiple_replicas(replicas, genrep, parallel_models): +def can_run_multiple_replicas( + replicas, parameters, hyperopt, parallel_models, same_trvl_per_replica +): """Checks whether a runcard which is trying to run several replicas at once (parallel_models =/= 1) is valid """ if not parallel_models: return - rp = len(replicas) - if hyperopt and rp > 1: + if len(replicas) == 1: + log.warning("parallel_models is set to true for only one replica") + return + if not same_trvl_per_replica: + raise CheckError( + "Replicas cannot be run in parallel with different training/validation " + " masks, please set `same_trvl_per_replica` to True in the runcard" + ) + if hyperopt: raise CheckError("Running replicas in parallel with hyperopt is still not supported") if parameters.get("layer_type") != "dense": raise CheckError("Parallelization has only been tested with layer_type=='dense'") - if rp == 1: - log.warning("parallel_models is set to true for only one replica") @make_argcheck diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py index 11ec7a2c04..a28ad857f1 100644 --- a/n3fit/src/n3fit/model_gen.py +++ b/n3fit/src/n3fit/model_gen.py @@ -467,7 +467,7 @@ def pdfNN_layer_generator( to be used by Preprocessing out: int number of output flavours of the model (default 14) - seed: int + seed: list(int) seed to initialize the NN dropout: float rate of dropout layer by layer diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 30ba439199..332be94fc2 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -86,7 +86,7 @@ def __init__( integ_info, flavinfo, fitbasis, - nnseed, + nnseeds, pass_status="ok", failed_status="fail", debug=False, @@ -109,9 +109,8 @@ def __init__( the object returned by fitting['basis'] fitbasis: str the name of the basis being fitted - nnseed: int - the seed used to initialise the Neural Network, will be passed to model_gen - can be a list of lists of parallel_models > 1 + nnseeds: list(int) + the seed used to initialise the NN for each model to be passed to model_gen pass_status: str flag to signal a good run failed_status: str @@ -139,7 +138,7 @@ def __init__( self.all_info = exp_info + pos_info self.flavinfo = flavinfo self.fitbasis = fitbasis - self.NNseed = nnseed + self._nn_seeds = nnseeds self.pass_status = pass_status self.failed_status = failed_status self.debug = debug @@ -795,10 +794,10 @@ def hyperparametrizable(self, params): ### Training loop for k, partition in enumerate(self.kpartitions): # Each partition of the kfolding needs to have its own separate model - seed = self.NNseed + # and the seed needs to be updated accordingly + seeds = self._nn_seeds if k > 0: - # Update the seed - seed = np.random.randint(0, pow(2, 31)) + seeds = [np.random.randint(0, pow(2, 31))] # Generate the pdf model pdf_models = self._generate_pdf( @@ -809,7 +808,7 @@ def hyperparametrizable(self, params): params["dropout"], params.get("regularizer", None), # regularizer optional params.get("regularizer_args", None), - seed, + seeds, ) # Model generation joins all the different observable layers diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index c5838d65ed..3baae81470 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -134,10 +134,24 @@ def performfit( from n3fit.model_trainer import ModelTrainer from n3fit.io.writer import WriterWrapper - # If models are to be run in parallel, we just need to enter the replica loop once - # but we need all data + # Note: there are three possible scenarios for the loop of replicas: + # 1.- Only one replica is being run, in this case the loop is only evaluated once + # 2.- Many replicas being run, in this case each will have a replica_number, seed, etc + # and they will be fitted sequentially + # 3.- Many replicas being run in parallel. In this case the loop will be evaluated just once + # but a model per replica will be generated + # + # In the main scenario (1) replicas_nnseed_fitting_data_dict is a list of just one element + # case (3) is similar but the one element of replicas_nnseed_fitting_data_dict will be modified + # to be ( + # [list of all replica idx], + # one experiment with data=(replicas, ndata), + # [list of all NN seeds] + # ) + # + n_models = len(replicas_nnseed_fitting_data_dict) - if parallel_models or n_models == 1: + if parallel_models and n_models != 1: replicas, replica_experiments, nnseeds = zip(*replicas_nnseed_fitting_data_dict) # Parse the experiments so that the output data contain information for all replicas # as the only different from replica to replica is the experimental training/validation data @@ -159,20 +173,13 @@ def performfit( else: replicas_info = replicas_nnseed_fitting_data_dict - # Note: In the basic scenario we are only running for one replica and thus this loop is only - # run once as replicas_nnseed_fitting_data_dict is a list of just one element - - # Note: there are three possible scenarios for this loop: - # 1.- Only one replica is being run, in this case the loop is only evaluated once - # 2.- Many replicas being run, in this case each will have a replica_number, seed, etc - # and they will be fitted sequentially - # 3.- Many replicas being run in parallel. In this case the loop will be evaluated just once - # but a model per replica will be generated - for replica_numbers, exp_info, nnseed in replicas_info: - # TODO: replica_number is too restrictive as a name! - if not parallel_models: - replica_number = replica_numbers - log.info("Starting replica fit %s", replica_number) + for replica_idxs, exp_info, nnseeds in replicas_info: + if not parallel_models or n_models == 1: + # Cases 1 and 2 above are a special case of 3 where the replica idx and the seed should + # be a list of just one element + replica_idxs = [replica_idxs] + nnseeds = [nnseeds] + log.info("Starting replica fit %d", replica_idxs[0]) # Generate a ModelTrainer object # this object holds all necessary information to train a PDF (up to the NN definition) @@ -182,7 +189,7 @@ def performfit( integdatasets_fitting_integ_dict, basis, fitbasis, - nnseed, + nnseeds, debug=debug, kfold_parameters=kfold_parameters, max_cores=maxcores, @@ -207,7 +214,8 @@ def performfit( if hyperopt: from n3fit.hyper_optimization.hyper_scan import hyper_scan_wrapper - replica_path_set = replica_path / f"replica_{replica_number}" + # Note that hyperopt will not run in parallel or with more than one model _for now_ + replica_path_set = replica_path / f"replica_{replica_idxs[0]}" true_best = hyper_scan_wrapper( replica_path_set, the_model_trainer, parameters, hyperscan, max_evals=hyperopt, ) @@ -228,10 +236,12 @@ def performfit( if tensorboard is not None: profiling = tensorboard.get("profiling", False) weight_freq = tensorboard.get("weight_freq", 0) - if parallel_models: + if parallel_models and n_models != 1: + # If using tensorboard when running in parallel + # dump the debugging data to the nnfit folder replica_path_set = replica_path else: - replica_path_set = replica_path / f"replica_{replica_number}" + replica_path_set = replica_path / f"replica_{replica_idxs[0]}" log_path = replica_path_set / "tboard" the_model_trainer.enable_tensorboard(log_path, weight_freq, profiling) @@ -250,7 +260,7 @@ def performfit( all_training_chi2, all_val_chi2, all_exp_chi2 = the_model_trainer.evaluate(stopping_object) pdf_models = result["pdf_models"] - for i, (replica_number, pdf_model) in enumerate(zip(replica_numbers, pdf_models)): + for i, (replica_number, pdf_model) in enumerate(zip(replica_idxs, pdf_models)): # Each model goes into its own replica folder replica_path_set = replica_path / f"replica_{replica_number + i}" From cc182b6eec7c2172f8de4c08943d4b08a9d12a40 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Wed, 9 Jun 2021 15:18:52 +0200 Subject: [PATCH 3/6] remove offset --- n3fit/src/n3fit/performfit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 3baae81470..12e4411342 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -262,7 +262,7 @@ def performfit( pdf_models = result["pdf_models"] for i, (replica_number, pdf_model) in enumerate(zip(replica_idxs, pdf_models)): # Each model goes into its own replica folder - replica_path_set = replica_path / f"replica_{replica_number + i}" + replica_path_set = replica_path / f"replica_{replica_number}" # Create a pdf instance pdf_instance = N3PDF(pdf_model, fit_basis=basis) @@ -287,7 +287,7 @@ def performfit( ) log.info( "Best fit for replica #%d, chi2=%.3f (tr=%.3f, vl=%.3f)", - replica_number+i, + replica_number, exp_chi2, training_chi2, val_chi2 From 5020187650bb5a100ce9ff8c71d8c0a522824e04 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Wed, 23 Jun 2021 12:36:16 +0200 Subject: [PATCH 4/6] apply comments --- doc/sphinx/source/n3fit/runcard_detailed.rst | 21 ++++++++++++++------ n3fit/src/n3fit/performfit.py | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/doc/sphinx/source/n3fit/runcard_detailed.rst b/doc/sphinx/source/n3fit/runcard_detailed.rst index 7a9ad9fce0..8a6e5a2828 100644 --- a/doc/sphinx/source/n3fit/runcard_detailed.rst +++ b/doc/sphinx/source/n3fit/runcard_detailed.rst @@ -290,8 +290,8 @@ as well as a detailed analysis of the amount of time that TensorFlow spent on ea Running fits in parallel ------------------------ -It is possible to run fits in parallel with ``n3fit`` by using the ``parallel_models`` -flag in the runcard when running a range of replicas. +It is possible to run fits in parallel with ``n3fit`` by setting the ``parallel_models`` +flag in the runcard to ``true`` when running a range of replicas. Running in parallel can be quite hard on memory and it is only advantageous when fitting on a GPU, where one can find a speed up equal to the number of models run in parallel (each model being a different replica). @@ -301,8 +301,10 @@ is the output data the prediction is compared to. In order to ensure this is indeed the case it is necessary to also use the `same_trvl_per_replica` flag in the runcard. -In order to run several replicas in parallel, add the following options -to the runcard: +In other words, in order to run several replicas in parallel in a machine +(be it a big CPU or, most likely, a GPU) +it is necessary to modify the ``n3fit`` runcard by adding the following two +top-level options: .. code-block:: yaml @@ -310,14 +312,21 @@ to the runcard: same_trvl_per_replica: true -And then run ``n3fit`` with the replica range to be parallelized +And then run ``n3fit`` with a replica range to be parallelized (in this case from replica 1 to replica 4). .. code-block:: bash n3fit runcard.yml 1 -r 4 -At present it cannot be used together with the ``hyperopt`` module. + +In machines with more than one GPU you can select the GPU in which the code +should run by setting the environment variable ``CUDA_VISIBLE_DEVICES`` +to the right index (usually ``0, 1, 2``) or leaving it explicitly empty +to avoid running on GPU: ``export CUDA_VISIBLE_DEVICES=""`` + + +Note that at present it cannot be used together with the ``hyperopt`` module. .. _otheroptions-label: diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 12e4411342..635426367c 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -165,7 +165,7 @@ def performfit( all_experiments[i_exp]['expdata'] = np.concatenate(training_data, axis=0) all_experiments[i_exp]['expdata_vl'] = np.concatenate(validation_data, axis=0) log.info( - "Starting replica fits %d to %d", + "Starting parallel fits from replica %d to %d", replicas[0], replicas[0] + n_models - 1, ) From b6426c9d93c3bc8f524f07900db229f1fdbdee63 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Wed, 23 Jun 2021 14:56:08 +0200 Subject: [PATCH 5/6] fix rebase --- n3fit/src/n3fit/n3fit_checks_provider.py | 5 +++-- n3fit/src/n3fit/performfit.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/n3fit/src/n3fit/n3fit_checks_provider.py b/n3fit/src/n3fit/n3fit_checks_provider.py index af6b6972d4..b5933675cb 100644 --- a/n3fit/src/n3fit/n3fit_checks_provider.py +++ b/n3fit/src/n3fit/n3fit_checks_provider.py @@ -6,7 +6,7 @@ import n3fit.checks -@n3fit.checks.can_run_parallel_replicas +@n3fit.checks.can_run_multiple_replicas @n3fit.checks.check_consistent_basis @n3fit.checks.wrapper_check_NN @n3fit.checks.wrapper_hyperopt @@ -25,6 +25,7 @@ def n3fit_checks_action( hyperscan=None, hyperopt=None, tensorboard=None, - parallel_models=1, + parallel_models=False, + same_trvl_per_replica=False, ): return diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 635426367c..762373b1f6 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -15,7 +15,6 @@ # Action to be called by validphys # All information defining the NN should come here in the "parameters" dict -@n3fit.checks.can_run_multiple_replicas def performfit( *, n3fit_checks_action, # used for checks From 66e0368830106988f5e3c429fc3002d9483e4b29 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Wed, 23 Jun 2021 15:17:50 +0200 Subject: [PATCH 6/6] isolate the checks that need replicas --- n3fit/src/n3fit/checks.py | 21 +++++++++++++-------- n3fit/src/n3fit/n3fit_checks_provider.py | 4 ++-- n3fit/src/n3fit/performfit.py | 5 +++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py index 40ef62edef..9957f17b66 100644 --- a/n3fit/src/n3fit/checks.py +++ b/n3fit/src/n3fit/checks.py @@ -348,17 +348,12 @@ def check_consistent_basis(sum_rules, fitbasis, basis, theoryid): @make_argcheck -def can_run_multiple_replicas( - replicas, parameters, hyperopt, parallel_models, same_trvl_per_replica -): - """Checks whether a runcard which is trying to run several replicas at once - (parallel_models =/= 1) is valid +def check_consistent_parallel(hyperopt, parameters, parallel_models, same_trvl_per_replica): + """Checks whether the multiple-replica fit options are consistent among them + i.e., that the trvl seed is fixed, hyperopt is not on and the layer type is correct """ if not parallel_models: return - if len(replicas) == 1: - log.warning("parallel_models is set to true for only one replica") - return if not same_trvl_per_replica: raise CheckError( "Replicas cannot be run in parallel with different training/validation " @@ -370,6 +365,16 @@ def can_run_multiple_replicas( raise CheckError("Parallelization has only been tested with layer_type=='dense'") +@make_argcheck +def can_run_multiple_replicas(replicas, parallel_models): + """Warns the user if trying to run just one replica in parallel""" + if not parallel_models: + return + if len(replicas) == 1: + log.warning("parallel_models is set to true for only one replica") + return + + @make_argcheck def check_deprecated_options(fitting): """Checks whether the runcard is using deprecated options""" diff --git a/n3fit/src/n3fit/n3fit_checks_provider.py b/n3fit/src/n3fit/n3fit_checks_provider.py index b5933675cb..e8cd01c162 100644 --- a/n3fit/src/n3fit/n3fit_checks_provider.py +++ b/n3fit/src/n3fit/n3fit_checks_provider.py @@ -6,11 +6,11 @@ import n3fit.checks -@n3fit.checks.can_run_multiple_replicas @n3fit.checks.check_consistent_basis @n3fit.checks.wrapper_check_NN @n3fit.checks.wrapper_hyperopt @n3fit.checks.check_deprecated_options +@n3fit.checks.check_consistent_parallel def n3fit_checks_action( *, genrep, @@ -26,6 +26,6 @@ def n3fit_checks_action( hyperopt=None, tensorboard=None, parallel_models=False, - same_trvl_per_replica=False, + same_trvl_per_replica=False ): return diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 762373b1f6..da85196be6 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -15,10 +15,11 @@ # Action to be called by validphys # All information defining the NN should come here in the "parameters" dict +@n3fit.checks.can_run_multiple_replicas def performfit( *, - n3fit_checks_action, # used for checks - replicas, # used for checks specific to performfit + n3fit_checks_action, # wrapper for all checks + replicas, # checks specific to performfit replicas_nnseed_fitting_data_dict, posdatasets_fitting_pos_dict, integdatasets_fitting_integ_dict,