diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index f345e49604..7eb5ae2343 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -320,10 +320,10 @@ Changing the hyperoptimization target ----------------------------------- Beyond the usual :math:`\chi2`-based optimization figures above, it is possible to utilize other measures as the target for hyperoptimization. -One possibility is to use a :ref:`future test`-based metric for which the goal is not to get the minimum :math:`\chi2` but to get the same :math:`\chi2` (with PDF errors considered) for different datasets. The idea is that this way we select models of which the prediction is stable upon variations in the dataset. +One possibility is to use a :ref:`future test`-based metric for which the goal is not to get the minimum :math:`\chi2` but to get the same :math:`\chi2` (with PDF errors considered) for different datasets. The idea is that this way we select models of which the prediction is stable upon variations in the dataset. In order to obtain the PDF errors used in the figure of merit it is necessary to run multiple replicas, luckily ``n3fit`` provides such a possibility also during hyperoptimization. -Take the following modifications to a normal hyperopt runcard +Take the following modifications to a normal hyperopt runcard (note that for convenience we take the trials directly from a previous run, so we don't have to create a new hyperopt configuration dictionary). @@ -345,7 +345,7 @@ hyperopt configuration dictionary). kfold: target: fit_future_tests - partitions: + partitions: - datasets: - HERACOMBCCEP - HERACOMBCCEM @@ -370,3 +370,19 @@ The figure of merit will be the difference between the :math:`\chi2` of the seco .. 
math:: L_{\rm hyperopt} = \chi^{2}_{(1) \rm pdferr} - \chi^{2}_{(2)} + + +Restarting hyperoptimization runs +--------------------------------- + +In addition to the ``tries.json`` files, hyperparameter scans also produce ``tries.pkl`` `pickle `_ files, +which are located in the same directory as the corresponding ``tries.json`` file. +The generated ``tries.pkl`` file stores the complete history of a previous hyperoptimization run, making it possible to resume the process using the ``hyperopt`` framework. +To achieve this, you can use the ``--restart`` option within the ``n3fit`` command, e.g.,: + +.. code-block:: bash + + n3fit runcard.yml 1 -r 10 --hyperopt 20 --restart + +The above command example is effective when the number of saved trials in the ``test_run/nnfit/replica_1/tries.pkl`` is +less than ``20``. If there are ``20`` or more saved trials, ``n3fit`` will simply terminate, displaying the best results. diff --git a/n3fit/src/n3fit/hyper_optimization/filetrials.py b/n3fit/src/n3fit/hyper_optimization/filetrials.py index c3d2d8e68f..4c9ee07647 100644 --- a/n3fit/src/n3fit/hyper_optimization/filetrials.py +++ b/n3fit/src/n3fit/hyper_optimization/filetrials.py @@ -1,12 +1,15 @@ """ Custom hyperopt trial object for persistent file storage - in the form of a json file within the nnfit folder + in the form of json and pickle files within the nnfit folder """ import json import logging -from validphys.hyperoptplot import HyperoptTrial +import pickle + from hyperopt import Trials, space_eval +from validphys.hyperoptplot import HyperoptTrial + log = logging.getLogger(__name__) # Note: the plan would be to do a PR in hyperopt's main repository @@ -60,10 +63,42 @@ class FileTrials(Trials): def __init__(self, replica_path, parameters=None, **kwargs): self._store_trial = False - self._json_file = "{0}/tries.json".format(replica_path) + self._json_file = replica_path / "tries.json" + self.pkl_file = replica_path / "tries.pkl" self._parameters = parameters + self._rstate 
= None super().__init__(**kwargs) + @property + def rstate(self): + """ + Returns the rstate attribute. + + Notes: + :func:`rstate` stores a `numpy.random.Generator` which is important to make + hyperopt restarts reproducible in the hyperparameter space. It can + be passed later as the `rstate` parameters of `hyperopt.fmin`. + """ + return self._rstate + + @rstate.setter + def rstate(self, random_generator): + """ + Sets the rstate attribute. + + # Arguments: + - `random_generator`: `numpy.random.Generator` + + Example + -------- + >>> import numpy as np + >>> from n3fit.hyper_optimization.filetrials import FileTrials + >>> + >>> trials = FileTrials(replica_path_set, parameters=parameters) + >>> trials.rstate = np.random.default_rng(42) + """ + self._rstate = random_generator + def refresh(self): """ This is the "flushing" method which is called at the end of every trial to @@ -78,9 +113,7 @@ def refresh(self): local_trials = [] for idx, t in enumerate(self._dynamic_trials): local_trials.append(t) - local_trials[idx]["misc"]["space_vals"] = space_eval_trial( - self._parameters, t - ) + local_trials[idx]["misc"]["space_vals"] = space_eval_trial(self._parameters, t) all_to_str = json.dumps(local_trials, default=str) with open(self._json_file, "w") as f: @@ -95,3 +128,25 @@ def new_trial_ids(self, n): def new_trial_docs(self, tids, specs, results, miscs): self._store_trial = True return super().new_trial_docs(tids, specs, results, miscs) + + def to_pkl(self): + """Dump `FileTrials` object into a pickle file.""" + with open(self.pkl_file, "wb") as file: + pickle.dump(self, file) + + @classmethod + def from_pkl(cls, pickle_filepath): + """ + Load and return an instance of `FileTrials` from a pickle file. + + If a pickle file from previous run is present this method can be used + to instantiate an initial `FileTrials` object to restart. 
+ """ + try: + with open(pickle_filepath, "rb") as file: + return pickle.load(file) + except FileNotFoundError as err: + raise FileNotFoundError( + "Failed to open 'tries.pkl' pickle file for restarting. " + f"Please ensure it is located in: {pickle_filepath}" + ) from err diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 08814dc859..174e921677 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -10,17 +10,22 @@ - a function - a dictionary of spaces of parameters you can do so by simply modifying the wrappers to point somewhere else -(and, of course the function in the fitting action that calls the miniimization). +(and, of course the function in the fitting action that calls the minimization). """ import copy +import logging + import hyperopt import numpy as np -from n3fit.backends import MetaModel, MetaLayer -import n3fit.hyper_optimization.filetrials as filetrials -import logging + +from n3fit.backends import MetaLayer, MetaModel +from n3fit.hyper_optimization.filetrials import FileTrials log = logging.getLogger(__name__) +HYPEROPT_SEED = 42 + + # These are just wrapper around some hyperopt's sampling expresions defined in here # https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions # with a bit of extra documentation for the ones that are not obvious @@ -88,12 +93,13 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= and performs ``max_evals`` evaluations of the hyperparametrizable function of ``model_trainer``. A ``tries.json`` file will be saved in the ``replica_path_set`` folder with the information - of all trials. + of all trials. An additional ``tries.pkl`` file will also be generated in the same folder + that stores the previous states of `FileTrials`, this file can be used for restarting purposes. 
Parameters ----------- replica_path_set: path - folder where to create the json ``tries.json`` file + folder where to create the ``tries.json`` and ``tries.pkl`` files model_trainer: :py:class:`n3fit.ModelTrainer.ModelTrainer` a ``ModelTrainer`` object with the ``hyperparametrizable`` method hyperscanner: :py:class:`n3fit.hyper_optimization.hyper_scan.HyperScanner` @@ -109,7 +115,15 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= # Tell the trainer we are doing hpyeropt model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys, status_ok=hyperopt.STATUS_OK) # Generate the trials object - trials = filetrials.FileTrials(replica_path_set, parameters=hyperscanner.as_dict()) + trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict()) + # Initialize seed for hyperopt + trials.rstate = np.random.default_rng(HYPEROPT_SEED) + + # For restarts, reset the state of `FileTrials` saved in the pickle file + if hyperscanner.restart_hyperopt: + pickle_file_to_load = f"{replica_path_set}/tries.pkl" + log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) + trials = FileTrials.from_pkl(pickle_file_to_load) # Perform the scan best = hyperopt.fmin( @@ -119,6 +133,8 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= max_evals=max_evals, show_progressbar=False, trials=trials, + rstate=trials.rstate, + trials_save_file=trials.pkl_file, ) return hyperscanner.space_eval(best) @@ -174,6 +190,10 @@ def __init__(self, parameters, sampling_dict, steps=5): self.parameters = copy.deepcopy(parameters) self.steps = steps + # adding extra options for restarting + restart_config = sampling_dict.get("restart") + self.restart_hyperopt = True if restart_config else False + self.hyper_keys = set([]) if "parameters" in sampling_dict: @@ -256,8 +276,7 @@ def stopping(self, min_epochs=None, max_epochs=None, min_patience=None, max_pati stopping_key = "stopping_patience" if min_epochs is not None 
and max_epochs is not None: - epochs = hp_quniform(epochs_key, min_epochs, max_epochs, - step_size=1000) + epochs = hp_quniform(epochs_key, min_epochs, max_epochs, step_size=1) self._update_param(epochs_key, epochs) if min_patience is not None or max_patience is not None: @@ -333,11 +352,7 @@ def optimizer(self, optimizers): self._update_param(opt_key, opt_val) def positivity( - self, - min_multiplier=None, - max_multiplier=None, - min_initial=None, - max_initial=None, + self, min_multiplier=None, max_multiplier=None, min_initial=None, max_initial=None ): """ Modifies the following entries of the `parameters` dictionary: @@ -414,8 +429,7 @@ def architecture( units = [] for i in range(n): units_label = "nl{0}:-{1}/{0}".format(n, i) - units_sampler = hp_quniform(units_label, min_units, max_units, - step_size=1) + units_sampler = hp_quniform(units_label, min_units, max_units, step_size=1) units.append(units_sampler) # The number of nodes in the last layer are read from the runcard units.append(output_size) diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index b96737481c..ea8bdb0b7a 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -222,13 +222,7 @@ def __init__( "folds": [], "posdatasets": [], } - self.experimental = { - "output": [], - "expdata": [], - "ndata": 0, - "model": None, - "folds": [], - } + self.experimental = {"output": [], "expdata": [], "ndata": 0, "model": None, "folds": []} self._fill_the_dictionaries() @@ -483,11 +477,7 @@ def _model_generation(self, xinput, pdf_models, partition, partition_idx): except ValueError: pass - models = { - "training": training, - "validation": validation, - "experimental": experimental, - } + models = {"training": training, "validation": validation, "experimental": experimental} return models @@ -850,7 +840,7 @@ def hyperparametrizable(self, params): # Initialize all photon classes for the different replicas: if self.lux_params: photons = Photon( - 
theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas, + theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas ) else: photons = None @@ -860,7 +850,11 @@ def hyperparametrizable(self, params): # and the seed needs to be updated accordingly seeds = self._nn_seeds if k > 0: - seeds = [np.random.randint(0, pow(2, 31)) for _ in seeds] + # generate random integers for each k-fold from the input `nnseeds` + # we generate new seeds to avoid the integer overflow that may + # occur when doing k*nnseeds + rngs = [np.random.default_rng(seed=seed) for seed in seeds] + seeds = [generator.integers(1, pow(2, 30)) * k for generator in rngs] # Generate the pdf model pdf_models = self._generate_pdf( @@ -922,7 +916,7 @@ def hyperparametrizable(self, params): for model in models.values(): model.compile(**params["optimizer"]) - passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs,) + passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs) if self.mode_hyperopt: # If doing a hyperparameter scan we need to keep track of the loss function diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index cf6b276037..32ddf5260f 100755 --- a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -232,6 +232,8 @@ def produce_hyperscanner(self, parameters, hyperscan_config=None, hyperopt=None) if hyperscan_config is None or hyperopt is None: return None + if hyperopt and self.environment.restart: + hyperscan_config.update({'restart': 'true'}) return HyperScanner(parameters, hyperscan_config) @@ -258,6 +260,7 @@ def check_positive(value): return ivalue parser.add_argument("--hyperopt", help="Enable hyperopt scan", default=None, type=int) + parser.add_argument("--restart", help="Enable hyperopt restarts", action="store_true") parser.add_argument("replica", help="MC replica number", type=check_positive) parser.add_argument( "-r", @@ -283,6 
+286,7 @@ def run(self): replicas = [replica] self.environment.replicas = NSList(replicas, nskey="replica") self.environment.hyperopt = self.args["hyperopt"] + self.environment.restart = self.args["restart"] super().run() except N3FitError as e: log.error(f"Error in n3fit:\n{e}") diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index f06ec133f1..cecc747452 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -1,13 +1,81 @@ """ Test hyperoptimization features """ +import json +import pathlib +import shutil +import subprocess as sp from numpy.testing import assert_approx_equal + from n3fit.hyper_optimization import rewards + def test_rewards(): - """ Ensure that rewards continue doing what they are supposed to do """ + """Ensure that rewards continue doing what they are supposed to do""" losses = [0.0, 1.0, 2.0] assert_approx_equal(rewards.average(losses), 1.0) assert_approx_equal(rewards.best_worst(losses), 2.0) assert_approx_equal(rewards.std(losses), 0.816496580927726) + + +REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") +QUICKNAME = "quickcard" +EXE = "n3fit" +REPLICA = "1" + + +def load_data(info_file): + """Loads the information of the fit from the json files""" + with open(info_file, "r", encoding='utf-8') as file: + return json.load(file) + + +def test_restart_from_pickle(tmp_path): + """Ensure that our hyperopt restart works as expected""" + # Prepare the run + quickcard = f"hyper-{QUICKNAME}.yml" + quickpath = REGRESSION_FOLDER / quickcard + # Set up some options + n_trials_stop = 2 + n_trials_total = 4 + output_restart = tmp_path / f"run_{n_trials_stop}_trials_and_then_{n_trials_total}_trials" + output_direct = tmp_path / f"run_{n_trials_total}_trials" + + # cp runcard to tmp folder + shutil.copy(quickpath, tmp_path) + # run some trials for the first time + sp.run( + f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_stop} " f"-o 
{output_restart}".split(),
+        cwd=tmp_path,
+        check=True,
+    )
+    # restart and calculate more trials
+    sp.run(
+        f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} "
+        f"-o {output_restart} --restart".split(),
+        cwd=tmp_path,
+        check=True,
+    )
+    # start again and calculate all trials at once
+    sp.run(
+        f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} " f"-o {output_direct}".split(),
+        cwd=tmp_path,
+        check=True,
+    )
+
+    # read up generated json files
+    restart_json_path = f"{output_restart}/nnfit/replica_{REPLICA}/tries.json"
+    restart_json = load_data(restart_json_path)
+    direct_json_path = f"{output_direct}/nnfit/replica_{REPLICA}/tries.json"
+    direct_json = load_data(direct_json_path)
+
+    # minimum check: the generated lists of nested dictionaries have the same length
+    assert len(restart_json) == len(direct_json)
+
+    for i in range(n_trials_total):
+        # check that the files share exactly the same hyperopt history
+        assert restart_json[i]['misc'] == direct_json[i]['misc']
+        assert restart_json[i]['state'] == direct_json[i]['state']
+        assert restart_json[i]['tid'] == direct_json[i]['tid']
+        assert restart_json[i]['result'] == direct_json[i]['result']