From 3af8d5a4b051e6f9ebafe3d26f0d1a4879857fbb Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Fri, 26 Jan 2024 16:50:02 +0100 Subject: [PATCH 01/35] Added 'MongoFileTrials' class Added start and stop mongo workers methods to 'MongoFileTrials' Added better exception handling to 'start_mongo_workers' and 'stop_mongo_workers' Fix(MongoFileTrials): remove unused 'tries.pkl' Set GPU device to each mongo worker Fix in GPU device Test with only CUDA_VISIBLE_DEVICES Added proper CUDA_VISIBLE_DEVICES indexes according to the number of GPUs available Changed default poll interval from 5 to 0.1 in hyperopt-mongo-worker Added option to print more output info Added extra function to avoid writing twice to 'tries.json' Added gpu memory growth to 'start_mongo_workers' Fix in mongofiletrials.py --- .../hyper_optimization/mongofiletrials.py | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 n3fit/src/n3fit/hyper_optimization/mongofiletrials.py diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py new file mode 100644 index 0000000000..ba19eac890 --- /dev/null +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -0,0 +1,252 @@ +""" + Hyperopt trial object for parallel hyperoptimization with MongoDB. + Data are fetched from MongoDB databases and stored in the form of json files within the nnfit folder +""" +import json +import logging +import os +import subprocess + +from bson import SON, ObjectId +from hyperopt.mongoexp import MongoTrials +import tensorflow as tf + +from n3fit.hyper_optimization.filetrials import space_eval_trial + +log = logging.getLogger(__name__) + + +def get_physical_gpus(): + """ + Retrieve a list of all physical GPU devices available in the system. + + Returns + ------- + list: A list of TensorFlow physical devices of type 'GPU'. 
+ """ + return tf.config.list_physical_devices('GPU') + + +def set_tf_visible_device(gpu_id, devices_list): + """ + Set a specific GPU as the visible device for TensorFlow. + + Parameters + ---------- + gpu_id (int): The ID of the GPU to be used. + devices_list (list): List of physical devices detected. + + Returns + ------- + bool: True if the device is set successfully, False otherwise. + """ + try: + if gpu_id < 0 or gpu_id >= len(devices_list): + log.error( + "GPU ID is out of range. Available GPUs: 0 to {}".format(len(devices_list) - 1) + ) + return False + + tf.config.set_visible_devices(devices_list[gpu_id], 'GPU') + tf.config.experimental.set_memory_growth(devices_list[gpu_id], True) + log.info("GPU {} is set as visible device.".format(gpu_id)) + return True + + except Exception as e: + log.error("Failed to set visible device: {}".format(e)) + return False + + +def convert_bson_to_dict(obj): + """ + Recursively convert a BSON object to a standard Python dictionary. + + This function is particularly useful for converting MongoDB query results, + which may contain BSON types like ObjectId and SON, into a more manageable + dictionary format. + + Parameters + ---------- + obj : dict or bson.SON or list or any + The object to convert. Can be a BSON object (like SON), a dictionary + containing BSON types, a list of such objects, or any other type. + + Returns + ------- + dict or list or any + A Python dictionary with all BSON types converted to standard Python + types (e.g., ObjectId converted to string). If the input is a list, + returns a list of converted elements. For other types, returns the + object as is. 
+ + Examples + -------- + >>> from bson import ObjectId, SON + >>> sample_son = SON([('_id', ObjectId('507f1f77bcf86cd799439011')), ('name', 'John Doe')]) + >>> convert_bson_to_dict(sample_son) + {'_id': '507f1f77bcf86cd799439011', 'name': 'John Doe'} + + >>> sample_list = [SON([('_id', ObjectId('507f1f77bcf86cd799439011')), ('name', 'John Doe')]), {'age': 30}] + >>> convert_bson_to_dict(sample_list) + [{'_id': '507f1f77bcf86cd799439011', 'name': 'John Doe'}, {'age': 30}] + """ + if isinstance(obj, (SON, dict)): + return {k: convert_bson_to_dict(v) for k, v in obj.items()} + if isinstance(obj, ObjectId): + return str(obj) # or just return None if you don't need the ObjectId + if isinstance(obj, list): + return [convert_bson_to_dict(v) for v in obj] + return obj + + +class MongoFileTrials(MongoTrials): + """ + MongoDB implementation of :class:`n3fit.hyper_optimization.filetrials.FileTrials`. + + Parameters + ---------- + replica_path: path + Replica folder as generated by n3fit. + db_host: str + MongoDB database connection host. Defaults to "localhost". + db_port: int + MongoDB database connection port. Defaults to 27017. + db_name: str + MongoDB database name. Details to "hyperopt". + num_workers: int + Number of MongoDB workers to be initiated concurrently. Defaults to 1. + parameters: dict + Dictionary of parameters on which we are doing hyperoptimization. Default to None. + store_trial: bool + If True, store data into json file. Default to True. 
+ """ + + def __init__( + self, + replica_path, + db_host="localhost", + db_port=27017, + db_name="hyperopt", + num_workers=1, + parameters=None, + *args, + **kwargs, + ): + self.db_host = db_host + self.db_port = str(db_port) + self.db_name = db_name + self.num_workers = num_workers + self.mongotrials_arg = f"mongo://{self.db_host}:{self.db_port}/{self.db_name}/jobs" + self.workers = [] + + self._store_trial = False + self._json_file = replica_path / "tries.json" + self._parameters = parameters + self._rstate = None + self._dynamic_trials = [] + + super().__init__(self.mongotrials_arg, *args, **kwargs) + + @property + def rstate(self): + """Returns the rstate attribute; see :class:`n3fit.hyper_optimization.filetrials.FileTrials`.""" + return self._rstate + + @rstate.setter + def rstate(self, random_generator): + """Sets the rstate attribute; see :class:`n3fit.hyper_optimization.filetrials.FileTrials`.""" + self._rstate = random_generator + + def _set_dynamic_trials(self): + """Converts self._trials to a dictionary and stores it in self._dynamic_trials.""" + self._dynamic_trials = [convert_bson_to_dict(item) for item in self._trials] + + def refresh(self): + """Fetches data from mongo database and save to a json file.""" + super().refresh() + + # convert BSON object to a dictionary + self._set_dynamic_trials() + + # write json to disk + if self._store_trial: + log.info("Storing scan in %s", self._json_file) + local_trials = [] + for idx, t in enumerate(self._dynamic_trials): + local_trials.append(t) + local_trials[idx]["misc"]["space_vals"] = space_eval_trial(self._parameters, t) + + all_to_str = json.dumps(local_trials, default=str) + with open(self._json_file, "w") as f: + f.write(all_to_str) + + # like in `FileTrials` the two methods below are implemented to avoid writing to the database twice + def new_trial_ids(self, n): + self._store_trial = False + return super().new_trial_ids(n) + + def _insert_trial_docs(self, docs): + self._store_trial = True + return 
super()._insert_trial_docs(docs) + + def start_mongo_workers( + self, workdir=None, exp_key=None, poll_interval=0.1, use_subprocesses=False + ): + """Initiates all mongo workers simultaneously.""" + # get the number of gpu cards, if any + gpus_all_physical_list = get_physical_gpus() + num_gpus_available = len(gpus_all_physical_list) + if not num_gpus_available: + log.warning("No GPUs found in the system.") + + # launch mongo workers + for i in range(self.num_workers): + # construct the command to start a hyperopt-mongo-worker + args = [ + "hyperopt-mongo-worker", + "--mongo", + f"{self.db_host}:{self.db_port}/{self.db_name}", + ] + if workdir: + args.extend(["--workdir", workdir]) + if exp_key: + args.extend(["--exp-key", exp_key]) + args.extend(["--poll-interval", str(poll_interval)]) + if use_subprocesses: + args.append("--no-subprocesses") + + # start the worker as a subprocess + try: + my_env = os.environ.copy() + + if num_gpus_available: + # set CUDA_VISIBLE_DEVICES environment variable + # the GPU index assigned to each worker i is given by mod(i, num_gpus_available) + my_env["CUDA_VISIBLE_DEVICES"] = str(i % num_gpus_available) + # set tensorflow memory growth + my_env["TF_FORCE_GPU_ALLOW_GROWTH"] = "true" + # avoid memory fragmentation issues? + # my_env["TF_GPU_ALLOCATOR"] = "cuda_malloc_async" + + # run mongo workers + worker = subprocess.Popen(args, env=my_env) + # we could use `stderr=subprocess.DEVNULL` in Popen to suppress output info + self.workers.append(worker) + logging.info(f"Started mongo worker {i+1}/{self.num_workers}") + except OSError as err: + msg = f"Failed to execute {args}. Make sure you have MongoDB installed." 
+ raise EnvironmentError(msg) from err + + def stop_mongo_workers(self): + """Terminates all active mongo workers.""" + for worker in self.workers: + try: + worker.terminate() + worker.wait() + logging.info( + f"Stopped mongo worker {self.workers.index(worker)+1}/{self.num_workers}" + ) + except Exception as e: + logging.error( + f"Failed to stop mongo worker {self.workers.index(worker)+1}/{self.num_workers}: {e}" + ) From eef95cabbff2df25584bc3de04def59787a40e90 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Mon, 29 Jan 2024 14:49:23 +0100 Subject: [PATCH 02/35] Parsed mongodb option to 'n3fit' command and 'HyperScanner' Added extra parallel and mongodb config options to 'HyperScanner' class Added num_mongo_workers option to n3fit and HyperScanner --- .../n3fit/hyper_optimization/hyper_scan.py | 11 +++++++ n3fit/src/n3fit/scripts/n3fit_exec.py | 29 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index bccb67bd5f..511606e1d9 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -205,6 +205,17 @@ def __init__(self, parameters, sampling_dict, steps=5): restart_config = sampling_dict.get("restart") self.restart_hyperopt = True if restart_config else False + # adding extra options for parallel execution + parallel_config = sampling_dict.get("parallel") + self.parallel_hyperopt = True if parallel_config else False + + # setting up MondoDB options + if self.parallel_hyperopt: + self.db_host = sampling_dict.get("db_host") + self.db_port = sampling_dict.get("db_port") + self.db_name = sampling_dict.get("db_name") + self.num_mongo_workers = sampling_dict.get("num_mongo_workers") + self.hyper_keys = set([]) if "parameters" in sampling_dict: diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index 32ddf5260f..dd3e911beb 100755 --- 
a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -234,6 +234,16 @@ def produce_hyperscanner(self, parameters, hyperscan_config=None, hyperopt=None) return None if hyperopt and self.environment.restart: hyperscan_config.update({'restart': 'true'}) + if hyperopt and self.environment.parallel_hyperopt: + hyperscan_config.update({'parallel': 'true'}) + hyperscan_config.update( + { + 'db_host': self.environment.db_host, + 'db_port': self.environment.db_port, + 'db_name': self.environment.db_name, + 'num_mongo_workers': self.environment.num_mongo_workers, + } + ) return HyperScanner(parameters, hyperscan_config) @@ -261,6 +271,20 @@ def check_positive(value): parser.add_argument("--hyperopt", help="Enable hyperopt scan", default=None, type=int) parser.add_argument("--restart", help="Enable hyperopt restarts", action="store_true") + parser.add_argument( + "--parallel-hyperopt", + help="Enable hyperopt run in parallel with MongoDB", + action="store_true", + ) + parser.add_argument("--db-host", help="MongoDB host", default="localhost") + parser.add_argument("--db-port", help="MongoDB port", default=27017) + parser.add_argument("--db-name", help="MongoDB dataset name", default="hyperopt") + parser.add_argument( + "--num-mongo-workers", + help="Number of mongo workers to be launched simultaneously", + type=check_positive, + default=1, + ) parser.add_argument("replica", help="MC replica number", type=check_positive) parser.add_argument( "-r", @@ -287,6 +311,11 @@ def run(self): self.environment.replicas = NSList(replicas, nskey="replica") self.environment.hyperopt = self.args["hyperopt"] self.environment.restart = self.args["restart"] + self.environment.parallel_hyperopt = self.args["parallel_hyperopt"] + self.environment.db_host = self.args["db_host"] + self.environment.db_port = self.args["db_port"] + self.environment.db_name = self.args["db_name"] + self.environment.num_mongo_workers = self.args["num_mongo_workers"] super().run() except 
N3FitError as e: log.error(f"Error in n3fit:\n{e}") From fed400eaa9c8a6f9f2d4d9f9b3f045fd93f51dce Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 30 Jan 2024 11:25:48 +0100 Subject: [PATCH 03/35] Adapted 'hyper_scan_wrapper' to allow for parallel evaluation of fmin trials --- .../n3fit/hyper_optimization/hyper_scan.py | 64 +++++++++++++++---- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 511606e1d9..5c230c8e90 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -22,6 +22,7 @@ from n3fit.backends import MetaLayer, MetaModel from n3fit.hyper_optimization.filetrials import FileTrials +from n3fit.hyper_optimization.mongofiletrials import MongoFileTrials log = logging.getLogger(__name__) @@ -126,27 +127,62 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= # Tell the trainer we are doing hpyeropt model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys) # Generate the trials object - trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict()) + if hyperscanner.parallel_hyperopt: + # Instantiate `MongoFileTrials` + # Mongo database should have already been initiated at this point + trials = MongoFileTrials( + replica_path_set, + db_host=hyperscanner.db_host, + db_port=hyperscanner.db_port, + db_name=hyperscanner.db_name, + num_workers=hyperscanner.num_mongo_workers, + parameters=hyperscanner.as_dict(), + ) + else: + # Instantiate `FileTrials` + trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict()) + # Initialize seed for hyperopt trials.rstate = np.random.default_rng(HYPEROPT_SEED) - # For restarts, reset the state of `FileTrials` saved in the pickle file - if hyperscanner.restart_hyperopt: + # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file + if not 
hyperscanner.parallel_hyperopt and hyperscanner.restart_hyperopt: pickle_file_to_load = f"{replica_path_set}/tries.pkl" log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) trials = FileTrials.from_pkl(pickle_file_to_load) - # Perform the scan - best = hyperopt.fmin( - fn=model_trainer.hyperparametrizable, - space=hyperscanner.as_dict(), - algo=hyperopt.tpe.suggest, - max_evals=max_evals, - show_progressbar=False, - trials=trials, - rstate=trials.rstate, - trials_save_file=trials.pkl_file, - ) + # Call to hyperopt.fmin + if hyperscanner.parallel_hyperopt: + # Launch mongo workers + trials.start_mongo_workers() + + # Perform the scan in parallel + best = hyperopt.fmin( + fn=model_trainer.hyperparametrizable, + space=hyperscanner.as_dict(), + algo=hyperopt.tpe.suggest, + max_evals=max_evals, + show_progressbar=True, + trials=trials, + rstate=trials.rstate, + max_queue_len=trials.num_workers, + ) + + # Stop mongo workers + trials.stop_mongo_workers() + else: + # Perform the scan sequentially + best = hyperopt.fmin( + fn=model_trainer.hyperparametrizable, + space=hyperscanner.as_dict(), + algo=hyperopt.tpe.suggest, + max_evals=max_evals, + show_progressbar=False, + trials=trials, + rstate=trials.rstate, + trials_save_file=trials.pkl_file, + ) + return hyperscanner.space_eval(best) From 65f6ce833a5e86dbdf494def7af0ea690b388533 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 30 Jan 2024 15:35:16 +0100 Subject: [PATCH 04/35] Add mondodb and pymongo in conda recipe and pymongo in 'pyproject.toml' Added conda install mongodb to tests.yml workflow Added pymongo as dependency in 'conda-recipe/meta.yaml' Added mongodb to 'python_installation.yml' and 'meta.yml' Restructure conda mongodb install and add pymongo version Added checks of mongod version Remove 'conda activate test' from MongoDB installation Added anaconda as main channel for mongodb installation in 'tests.yml' Added conda-forge as main channel for mongodb installation in 
'tests.yml' Test: added mongodb to conda recipe in 'meta.yaml' --- .github/workflows/python_installation.yml | 5 +++++ conda-recipe/meta.yaml | 2 ++ pyproject.toml | 2 ++ 3 files changed, 9 insertions(+) diff --git a/.github/workflows/python_installation.yml b/.github/workflows/python_installation.yml index 9116de7943..8eddb84a83 100644 --- a/.github/workflows/python_installation.yml +++ b/.github/workflows/python_installation.yml @@ -34,6 +34,11 @@ jobs: conda config --append channels conda-forge conda config --set show_channel_urls true conda install lhapdf pandoc + - name: Install MongoDB for parallel hyperopts + shell: bash -l {0} + run: | + conda install mongodb + mongod --version - name: Install nnpdf with testing and qed extras shell: bash -l {0} run: | diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index fc09fc9d08..f8a5f1a573 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -26,6 +26,8 @@ requirements: - psutil # to ensure n3fit affinity is with the right processors - blas==1.0 *mkl* # [osx] # Host's blas is mkl, force also runtime blas to be - hyperopt + - mongodb + - pymongo <4 - seaborn - lhapdf - sqlite diff --git a/pyproject.toml b/pyproject.toml index 9d24b2d8c7..b9060119da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,8 @@ eko = "^0.14.1" # Hyperopt hyperopt = "*" seaborn = "*" +# Hyperopt parallel +pymongo = "<4" # LHAPDF installation for debugging purposes # a3b2bbc3ced97675ac3a71df45f55ba = "*" # Optional dependencies From 1440ceee656a5d31bcb4beb4e7f7efb4e4699258 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 31 Jan 2024 11:38:11 +0100 Subject: [PATCH 05/35] Added integration test Fix(test): fix in start_mongo_database Fix(test): create database directory priot to 'mongod' command --- n3fit/src/n3fit/tests/test_hyperopt.py | 96 ++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index 
6e79dfe7bf..5cf6021b48 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -5,6 +5,7 @@ import pathlib import shutil import subprocess as sp +import time import numpy as np from numpy.testing import assert_approx_equal @@ -170,3 +171,98 @@ def test_restart_from_pickle(tmp_path): assert restart_json[i]['tid'] == direct_json[i]['tid'] assert restart_json[i]['misc']['idxs'] == direct_json[i]['misc']['idxs'] # Note that it doesn't check the final loss of the second trial + + +def start_mongo_database(tmp_path): + """Creates MongoDB database and returns the Popen object.""" + db_command = ["mongod", "--dbpath", f"{tmp_path}/hyperopt"] + directory_path = f"{tmp_path}/hyperopt" + try: + # create database directory + sp.run(["mkdir", "-p", directory_path], check=True) + # launch database + process = sp.Popen(db_command, cwd=tmp_path) + return process + except (sp.CalledProcessError, OSError) as err: + msg = f"Error creating directory or executing {db_command}: {err}" + raise EnvironmentError(msg) from err + + +def stop_mongod_command(process): + """Stops the MongoDB database.""" + # directory_path = f"{tmp_path}/hyperopt" + try: + # stop mongod command + process.terminate() + process.wait() + # remove database files + # sp.run(f"rm -r {directory_path} && rm -r {tmp_path}/65*", check=True) + except (sp.CalledProcessError, OSError) as err: + msg = f"Error stopping the MongoDB process or removing database files: {err}" + raise EnvironmentError(msg) from err + + +def test_parallel_hyperopt(tmp_path): + """Ensure that the parallel implementation of hyperopt with MongoDB works as expected.""" + # Prepare the run + quickcard = f"hyper-{QUICKNAME}.yml" + quickpath = REGRESSION_FOLDER / quickcard + + # Define number of trials and number of mongo-workers to launch + n_trials = 6 + n_mongo_workers = 3 + + # Set up output directories + output_sequential = tmp_path / "run_hyperopt_sequential" + output_parallel = tmp_path / 
"run_hyperopt_parallel" + + # cp runcard to tmp folder + shutil.copy(quickpath, tmp_path) + + # Run hyperopt sequentially + start_time = time.time() + sp.run( + f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials} " f"-o {output_sequential}".split(), + cwd=tmp_path, + check=True, + ) + end_time = time.time() + sequential_run_time = end_time - start_time + + # Generate on-the-fly a real MongoDB database + process = start_mongo_database(tmp_path) + + # Run hyperopt in parallel + start_time = time.time() + sp.run( + f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials} " + f"--parallel-hyperopt --num-mongo-workers {n_mongo_workers} " + f"-o {output_parallel}".split(), + cwd=tmp_path, + check=True, + ) + end_time = time.time() + parallel_run_time = end_time - start_time + + # Stop mongod command + stop_mongod_command(process) + + # Read up generated json files + sequential_json_path = f"{output_sequential}/nnfit/replica_{REPLICA}/tries.json" + sequential_json = load_data(sequential_json_path) + parallel_json_path = f"{output_parallel}/nnfit/replica_{REPLICA}/tries.json" + parallel_json = load_data(parallel_json_path) + + # Check that the parallel run time is lower than the sequential one + assert parallel_run_time < sequential_run_time + + # Check that the final json files have the same number of trials + assert len(parallel_json) == len(sequential_json) + + for i in range(n_trials): + # Check that the files share the same content + assert len(parallel_json[i]['misc']) == len(sequential_json[i]['misc']) + assert len(parallel_json[i]['result']) == len(sequential_json[i]['result']) + # Note: cannot check that they share exactly the same history + # as the hyperopt algorithm depends on the results from previous runs + # which is obviously different between parallel and sequential runs From 0bb29c65b0785f778176397b20842ba9874d9bcc Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Mon, 19 Feb 2024 12:14:30 +0100 Subject: [PATCH 06/35] Add documentation Added more info to 
documentation --- doc/sphinx/source/n3fit/hyperopt.rst | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index 83c302216d..c012f0eea7 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -484,3 +484,44 @@ To achieve this, you can use the ``--restart`` option within the ``n3fit`` comma The above command example is effective when the number of saved trials in the ``test_run/nnfit/replica_1/tries.pkl`` is less than ``20``. If there are ``20`` or more saved trials, ``n3fit`` will simply terminate, displaying the best results. + + +Running hyperoptimizations in parallel with MongoDB +--------------------------------------------------- + +In NNPDF, you can effectively run hyperoptimization experiments in parallel using `MongoDB `_. +This functionality is provided by the :class:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials` class, +which extends the capabilities of `hyperopt `_'s `MongoTrials` and enables the +simultaneous evaluation of multiple trials. + +To set up and run a parallelized hyperopt search, follow these steps: + + 1. **Initiate the MongoDB database:** Start by setting up the database in your current directory. + This database is referred to as ``hyperopt`` in the following instructions. You can initiate it with the command: + + .. code-block:: bash + + mongod --dbpath ./hyperopt + + By default, ``mongod`` uses port ``27017``. This is also the default port for the ``n3fit --db-port`` option. + If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt --port YOUR_PORT_NUMBER``. + + 2. **Launch NNPDF with MongoDB integration:** Open a new command prompt and run ``n3fit`` with the desired configuration:: + + .. 
code-block:: bash + + n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N + + Here, ``N`` represents the number of MongoDB workers you wish to launch in parallel. + Each mongo worker handles one trial in Hyperopt. So, launching more workers allows for the simultaneous calculation of a greater number of trials. + Note that there is no need to manually launch mongo workers, as the ``hyperopt-mongo-worker`` command is automatically + executed by the :meth:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials.start_mongo_workers` method. + By default, the ``host`` argument is set to ``localhost``, and the database is named ``hyperopt``. + If necessary, you can modify these settings using the ``n3fit --db-host`` or ``n3fit --db-name`` options. + + +.. note:: + + Unlike in serial execution, parallel hyperoptimization runs do not generate ``tries.pkl`` files. + To resume an experiment, simply retain the MongoDB database created during your previous run. + Then, follow steps 1 and 2 as described above to restart the experiment. 
From 49ee8dfe6543d99cb5012e3d7b790b682c6efab2 Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Mon, 19 Feb 2024 13:03:18 +0100 Subject: [PATCH 07/35] Refactored fmin call Co-authored-by: Aron Jansen --- .../n3fit/hyper_optimization/hyper_scan.py | 39 ++++++------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 5c230c8e90..f37e5c5872 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -151,38 +151,21 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) trials = FileTrials.from_pkl(pickle_file_to_load) - # Call to hyperopt.fmin + # Call to hyperopt.fmin + fmin_args = dict( + fn=model_trainer.hyperparametrizable, + space=hyperscanner.as_dict(), + algo=hyperopt.tpe.suggest, + max_evals=max_evals, + trials=trials, + rstate=trials.rstate, + ) if hyperscanner.parallel_hyperopt: - # Launch mongo workers trials.start_mongo_workers() - - # Perform the scan in parallel - best = hyperopt.fmin( - fn=model_trainer.hyperparametrizable, - space=hyperscanner.as_dict(), - algo=hyperopt.tpe.suggest, - max_evals=max_evals, - show_progressbar=True, - trials=trials, - rstate=trials.rstate, - max_queue_len=trials.num_workers, - ) - - # Stop mongo workers + best = hyperopt.fmin(**fmin_args, show_progressbar=True, max_queue_len=trials.num_workers) trials.stop_mongo_workers() else: - # Perform the scan sequentially - best = hyperopt.fmin( - fn=model_trainer.hyperparametrizable, - space=hyperscanner.as_dict(), - algo=hyperopt.tpe.suggest, - max_evals=max_evals, - show_progressbar=False, - trials=trials, - rstate=trials.rstate, - trials_save_file=trials.pkl_file, - ) - + best = hyperopt.fmin(**fmin_args, 
show_progressbar=False, trials_save_file=trials.pkl_file) return hyperscanner.space_eval(best) From 3b569398233572a5efcb0c21fcc980084bb626d7 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Mon, 19 Feb 2024 13:31:09 +0100 Subject: [PATCH 08/35] Fix in fmin call comment format --- n3fit/src/n3fit/hyper_optimization/hyper_scan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index f37e5c5872..fd8a4d649e 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -151,7 +151,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) trials = FileTrials.from_pkl(pickle_file_to_load) - # Call to hyperopt.fmin + # Call to hyperopt.fmin fmin_args = dict( fn=model_trainer.hyperparametrizable, space=hyperscanner.as_dict(), From dd05559aa4d25e636cba3cd1f0eba974c07bd1eb Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Mon, 19 Feb 2024 14:49:32 +0100 Subject: [PATCH 09/35] Fix in MongoFileTrials logging --- n3fit/src/n3fit/hyper_optimization/mongofiletrials.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index ba19eac890..0eb8d3cc2a 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -232,7 +232,7 @@ def start_mongo_workers( worker = subprocess.Popen(args, env=my_env) # we could use `stderr=subprocess.DEVNULL` in Popen to suppress output info self.workers.append(worker) - logging.info(f"Started mongo worker {i+1}/{self.num_workers}") + log.info(f"Started mongo worker {i+1}/{self.num_workers}") except OSError as err: msg = f"Failed to execute {args}. 
Make sure you have MongoDB installed." raise EnvironmentError(msg) from err @@ -243,10 +243,8 @@ def stop_mongo_workers(self): try: worker.terminate() worker.wait() - logging.info( - f"Stopped mongo worker {self.workers.index(worker)+1}/{self.num_workers}" - ) + log.info(f"Stopped mongo worker {self.workers.index(worker)+1}/{self.num_workers}") except Exception as e: - logging.error( + log.error( f"Failed to stop mongo worker {self.workers.index(worker)+1}/{self.num_workers}: {e}" ) From c8e6461c5051f09f7135de8d36de46b7960bbae8 Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Wed, 21 Feb 2024 07:54:49 +0100 Subject: [PATCH 10/35] Updated docs: replace 'initiate' by 'instantiate' Co-authored-by: Tanjona Rabemananjara --- doc/sphinx/source/n3fit/hyperopt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index c012f0eea7..89529bb7e4 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -496,7 +496,7 @@ simultaneous evaluation of multiple trials. To set up and run a parallelized hyperopt search, follow these steps: - 1. **Initiate the MongoDB database:** Start by setting up the database in your current directory. + 1. **Instantiate the MongoDB database:** Start by setting up the database in your current directory. This database is referred to as ``hyperopt`` in the following instructions. You can initiate it with the command: .. 
code-block:: bash From 0db5de96694444588bf41746269c980487f9e97a Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Wed, 21 Feb 2024 07:55:07 +0100 Subject: [PATCH 11/35] Update doc/sphinx/source/n3fit/hyperopt.rst Co-authored-by: Tanjona Rabemananjara --- doc/sphinx/source/n3fit/hyperopt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index 89529bb7e4..20cbb91cec 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -506,7 +506,7 @@ To set up and run a parallelized hyperopt search, follow these steps: By default, ``mongod`` uses port ``27017``. This is also the default port for the ``n3fit --db-port`` option. If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt --port YOUR_PORT_NUMBER``. - 2. **Launch NNPDF with MongoDB integration:** Open a new command prompt and run ``n3fit`` with the desired configuration:: + 2. **Launch NNPDF with MongoDB integration:** Open a new command prompt and run ``n3fit`` with the desired configuration: .. code-block:: bash From 63c244dda0ddb12d80365ad74ead01e59551828c Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Wed, 21 Feb 2024 08:00:14 +0100 Subject: [PATCH 12/35] Updated docs: replace database name to 'hyperopt-db' Co-authored-by: Tanjona Rabemananjara --- doc/sphinx/source/n3fit/hyperopt.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index 20cbb91cec..c636145b4f 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -497,11 +497,11 @@ simultaneous evaluation of multiple trials. To set up and run a parallelized hyperopt search, follow these steps: 1. 
**Instantiate the MongoDB database:** Start by setting up the database in your current directory. - This database is referred to as ``hyperopt`` in the following instructions. You can initiate it with the command: + This database is referred to as ``hyperopt-db`` in the following instructions. You can initiate it with the command: .. code-block:: bash - mongod --dbpath ./hyperopt + mongod --dbpath ./hyperopt-db By default, ``mongod`` uses port ``27017``. This is also the default port for the ``n3fit --db-port`` option. If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt --port YOUR_PORT_NUMBER``. From 9576bdb021bc552f53dbfefbafb822be70787db4 Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Wed, 21 Feb 2024 08:01:32 +0100 Subject: [PATCH 13/35] Update doc/sphinx/source/n3fit/hyperopt.rst Co-authored-by: Tanjona Rabemananjara --- doc/sphinx/source/n3fit/hyperopt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index c636145b4f..930d4eaa02 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -504,7 +504,7 @@ To set up and run a parallelized hyperopt search, follow these steps: mongod --dbpath ./hyperopt-db By default, ``mongod`` uses port ``27017``. This is also the default port for the ``n3fit --db-port`` option. - If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt --port YOUR_PORT_NUMBER``. + If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt --db-port YOUR_PORT_NUMBER``. 2. 
**Launch NNPDF with MongoDB integration:** Open a new command prompt and run ``n3fit`` with the desired configuration: From 1d673ec95a47ef5b19b5ad2b122b742a1168e392 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 21 Feb 2024 08:23:38 +0100 Subject: [PATCH 14/35] Changed default database name to 'hyperopt-db' --- doc/sphinx/source/n3fit/hyperopt.rst | 2 +- n3fit/src/n3fit/hyper_optimization/mongofiletrials.py | 4 ++-- n3fit/src/n3fit/scripts/n3fit_exec.py | 2 +- n3fit/src/n3fit/tests/test_hyperopt.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index 930d4eaa02..d0a2cebdee 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -504,7 +504,7 @@ To set up and run a parallelized hyperopt search, follow these steps: mongod --dbpath ./hyperopt-db By default, ``mongod`` uses port ``27017``. This is also the default port for the ``n3fit --db-port`` option. - If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt --db-port YOUR_PORT_NUMBER``. + If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt-db --db-port YOUR_PORT_NUMBER``. 2. **Launch NNPDF with MongoDB integration:** Open a new command prompt and run ``n3fit`` with the desired configuration: diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index 0eb8d3cc2a..a723812df3 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -112,7 +112,7 @@ class MongoFileTrials(MongoTrials): db_port: int MongoDB database connection port. Defaults to 27017. db_name: str - MongoDB database name. Details to "hyperopt". + MongoDB database name. Defaults to "hyperopt-db". num_workers: int Number of MongoDB workers to be initiated concurrently. Defaults to 1. 
parameters: dict @@ -126,7 +126,7 @@ def __init__( replica_path, db_host="localhost", db_port=27017, - db_name="hyperopt", + db_name="hyperopt-db", num_workers=1, parameters=None, *args, diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index dd3e911beb..1dedcf1165 100755 --- a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -278,7 +278,7 @@ def check_positive(value): ) parser.add_argument("--db-host", help="MongoDB host", default="localhost") parser.add_argument("--db-port", help="MongoDB port", default=27017) - parser.add_argument("--db-name", help="MongoDB dataset name", default="hyperopt") + parser.add_argument("--db-name", help="MongoDB dataset name", default="hyperopt-db") parser.add_argument( "--num-mongo-workers", help="Number of mongo workers to be launched simultaneously", diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index 5cf6021b48..7aa740e71a 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -175,8 +175,8 @@ def test_restart_from_pickle(tmp_path): def start_mongo_database(tmp_path): """Creates MongoDB database and returns the Popen object.""" - db_command = ["mongod", "--dbpath", f"{tmp_path}/hyperopt"] - directory_path = f"{tmp_path}/hyperopt" + db_command = ["mongod", "--dbpath", f"{tmp_path}/hyperopt-db"] + directory_path = f"{tmp_path}/hyperopt-db" try: # create database directory sp.run(["mkdir", "-p", directory_path], check=True) From 6c50237fa1e7c06826da0eeb14ecfaafa864194c Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 21 Feb 2024 08:26:29 +0100 Subject: [PATCH 15/35] Removed unused 'set_tf_visible_device' function --- .../hyper_optimization/mongofiletrials.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index a723812df3..99451dd843 
100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -27,36 +27,6 @@ def get_physical_gpus(): return tf.config.list_physical_devices('GPU') -def set_tf_visible_device(gpu_id, devices_list): - """ - Set a specific GPU as the visible device for TensorFlow. - - Parameters - ---------- - gpu_id (int): The ID of the GPU to be used. - devices_list (list): List of physical devices detected. - - Returns - ------- - bool: True if the device is set successfully, False otherwise. - """ - try: - if gpu_id < 0 or gpu_id >= len(devices_list): - log.error( - "GPU ID is out of range. Available GPUs: 0 to {}".format(len(devices_list) - 1) - ) - return False - - tf.config.set_visible_devices(devices_list[gpu_id], 'GPU') - tf.config.experimental.set_memory_growth(devices_list[gpu_id], True) - log.info("GPU {} is set as visible device.".format(gpu_id)) - return True - - except Exception as e: - log.error("Failed to set visible device: {}".format(e)) - return False - - def convert_bson_to_dict(obj): """ Recursively convert a BSON object to a standard Python dictionary. 
From fdf34587f9274675b76c42bf629165726acf613e Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 21 Feb 2024 09:47:51 +0100 Subject: [PATCH 16/35] Added validation to hyperopt-related arguments --- n3fit/src/n3fit/scripts/n3fit_exec.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index 1dedcf1165..fb9de3d081 100755 --- a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -296,6 +296,18 @@ def check_positive(value): def get_commandline_arguments(self, cmdline=None): args = super().get_commandline_arguments(cmdline) + + # Validate dependencies related to the --hyperopt argument + if args["hyperopt"] is None: + if args["restart"]: + raise argparse.ArgumentError( + None, "The --restart option requires --hyperopt to be set." + ) + if args["parallel_hyperopt"]: + raise argparse.ArgumentError( + None, "The --parallel-hyperopt option requires --hyperopt to be set." 
+ ) + if args["output"] is None: args["output"] = pathlib.Path(args["config_yml"]).stem return args From e901ea729d7768e9d1a22077f826c274e0e9e2fb Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 21 Feb 2024 11:51:44 +0100 Subject: [PATCH 17/35] Moved 'get_physical_gpus' to keras_backend 'internal_state.py' --- n3fit/src/n3fit/backends/__init__.py | 1 + .../n3fit/backends/keras_backend/internal_state.py | 11 +++++++++++ .../src/n3fit/hyper_optimization/mongofiletrials.py | 13 +------------ 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/n3fit/src/n3fit/backends/__init__.py b/n3fit/src/n3fit/backends/__init__.py index 3676dd25d7..e48c4a856f 100644 --- a/n3fit/src/n3fit/backends/__init__.py +++ b/n3fit/src/n3fit/backends/__init__.py @@ -15,6 +15,7 @@ ) from n3fit.backends.keras_backend.internal_state import ( clear_backend_state, + get_physical_gpus, set_eager, set_initial_state, ) diff --git a/n3fit/src/n3fit/backends/keras_backend/internal_state.py b/n3fit/src/n3fit/backends/keras_backend/internal_state.py index 6cfc921c68..e818716940 100644 --- a/n3fit/src/n3fit/backends/keras_backend/internal_state.py +++ b/n3fit/src/n3fit/backends/keras_backend/internal_state.py @@ -143,3 +143,14 @@ def set_initial_state(debug=False, external_seed=None, max_cores=None, double_pr # Once again, if in debug mode or external_seed set, set also the TF seed if debug or external_seed: tf.random.set_seed(use_seed) + + +def get_physical_gpus(): + """ + Retrieve a list of all physical GPU devices available in the system. + + Returns + ------- + list: A list of TensorFlow physical devices of type 'GPU'. 
+ """ + return tf.config.list_physical_devices('GPU') diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index 99451dd843..4ed0872f89 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -9,24 +9,13 @@ from bson import SON, ObjectId from hyperopt.mongoexp import MongoTrials -import tensorflow as tf +from n3fit.backends import get_physical_gpus from n3fit.hyper_optimization.filetrials import space_eval_trial log = logging.getLogger(__name__) -def get_physical_gpus(): - """ - Retrieve a list of all physical GPU devices available in the system. - - Returns - ------- - list: A list of TensorFlow physical devices of type 'GPU'. - """ - return tf.config.list_physical_devices('GPU') - - def convert_bson_to_dict(obj): """ Recursively convert a BSON object to a standard Python dictionary. From 6256aff925936e2e907e74e9ce51e27309ef7785 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 21 Feb 2024 18:05:44 +0100 Subject: [PATCH 18/35] Added initial MongoFileTrials methods to allow for restarts --- .../n3fit/hyper_optimization/hyper_scan.py | 16 +++++--- .../hyper_optimization/mongofiletrials.py | 38 ++++++++++++++++++- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index fd8a4d649e..967de91e91 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -145,11 +145,16 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= # Initialize seed for hyperopt trials.rstate = np.random.default_rng(HYPEROPT_SEED) - # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file - if not hyperscanner.parallel_hyperopt and hyperscanner.restart_hyperopt: - pickle_file_to_load = f"{replica_path_set}/tries.pkl" 
- log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) - trials = FileTrials.from_pkl(pickle_file_to_load) + if hyperscanner.restart_hyperopt: + # For parallel hyperopt restarts, extract the database tar file + if hyperscanner.parallel_hyperopt: + log.info("Restarting hyperopt run using the MongoDB database %s", trials.db_name) + trials.extract_mongodb_database() + else: + # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file + pickle_file_to_load = f"{replica_path_set}/tries.pkl" + log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) + trials = FileTrials.from_pkl(pickle_file_to_load) # Call to hyperopt.fmin fmin_args = dict( @@ -164,6 +169,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= trials.start_mongo_workers() best = hyperopt.fmin(**fmin_args, show_progressbar=True, max_queue_len=trials.num_workers) trials.stop_mongo_workers() + trials.compress_mongodb_database() else: best = hyperopt.fmin(**fmin_args, show_progressbar=False, trials_save_file=trials.pkl_file) return hyperscanner.space_eval(best) diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index 4ed0872f89..64f2a877f5 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -2,6 +2,7 @@ Hyperopt trial object for parallel hyperoptimization with MongoDB. 
Data are fetched from MongoDB databases and stored in the form of json files within the nnfit folder """ +import glob import json import logging import os @@ -100,6 +101,7 @@ def __init__( self._store_trial = False self._json_file = replica_path / "tries.json" + self.database_tar_file = replica_path / f"{self.db_name}.tar.gz" self._parameters = parameters self._rstate = None self._dynamic_trials = [] @@ -203,7 +205,39 @@ def stop_mongo_workers(self): worker.terminate() worker.wait() log.info(f"Stopped mongo worker {self.workers.index(worker)+1}/{self.num_workers}") - except Exception as e: + except Exception as err: log.error( - f"Failed to stop mongo worker {self.workers.index(worker)+1}/{self.num_workers}: {e}" + f"Failed to stop mongo worker {self.workers.index(worker)+1}/{self.num_workers}: {err}" ) + + def compress_mongodb_database(self): + """Saves MongoDB database as tar file""" + # check if the database exist + if not os.path.exists(f"{self.db_name}" and not glob.glob('65*')): + raise FileNotFoundError( + f"The MongoDB database directory '{self.db_name}' does not exist. " + "Ensure it has been initiated correctly and it is in your path." + ) + # create the tar.gz file + try: + log.info(f"Compressing MongoDB database into {self.database_tar_file}") + subprocess.run( + ['tar', '-cvf', f'{self.database_tar_file}', f'{self.db_name}'] + glob.glob('65*'), + check=True, + ) + except subprocess.CalledProcessError as err: + raise RuntimeError(f"Error compressing the database: {err}") + + def extract_mongodb_database(self): + """Untar MongoDB database for use in restarts.""" + # check if the database tar file exist + if not os.path.exists(f"{self.database_tar_file}"): + raise FileNotFoundError( + f"The MongoDB database tar file '{self.database_tar_file}' does not exist." 
+ ) + # extract tar file + try: + log.info(f"Extracting MongoDB database from {database_tar_file}") + subprocess.run(['tar', '-xvf', f'{database_tar_file}'], check=True) + except subprocess.CalledProcessError as err: + raise RuntimeError(f"Error extracting the database: {err}") From c72cbad89b0e83a6689fd5671a9eb82e3d309d29 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Thu, 22 Feb 2024 09:11:05 +0100 Subject: [PATCH 19/35] Added 'MongodRunner' class to automate mongod launch and allow for restarts --- .../n3fit/hyper_optimization/hyper_scan.py | 34 +++++++---- .../hyper_optimization/mongofiletrials.py | 61 ++++++++++++++++--- 2 files changed, 76 insertions(+), 19 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 967de91e91..6ffd6396f8 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -22,7 +22,7 @@ from n3fit.backends import MetaLayer, MetaModel from n3fit.hyper_optimization.filetrials import FileTrials -from n3fit.hyper_optimization.mongofiletrials import MongoFileTrials +from n3fit.hyper_optimization.mongofiletrials import MongodRunner, MongoFileTrials log = logging.getLogger(__name__) @@ -126,6 +126,24 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= """ # Tell the trainer we are doing hpyeropt model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys) + + if hyperscanner.parallel_hyperopt: + # start MongoDB database by launching `mongod` + hyperscanner.mongod_runner.ensure_database_dir_exists() + mongod = hyperscanner.mongod_runner.start() + + if hyperscanner.restart_hyperopt: + # For parallel hyperopt restarts, extract the database tar file + if hyperscanner.parallel_hyperopt: + tar_file_to_extract = f"{replica_path_set}/{hyperscanner.db_name}.tar.gz" + log.info("Restarting hyperopt run using the MongoDB database %s", tar_file_to_extract) + 
MongoFileTrials.extract_mongodb_database(tar_file_to_extract) + else: + # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file + pickle_file_to_load = f"{replica_path_set}/tries.pkl" + log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) + trials = FileTrials.from_pkl(pickle_file_to_load) + # Generate the trials object if hyperscanner.parallel_hyperopt: # Instantiate `MongoFileTrials` @@ -145,17 +163,6 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= # Initialize seed for hyperopt trials.rstate = np.random.default_rng(HYPEROPT_SEED) - if hyperscanner.restart_hyperopt: - # For parallel hyperopt restarts, extract the database tar file - if hyperscanner.parallel_hyperopt: - log.info("Restarting hyperopt run using the MongoDB database %s", trials.db_name) - trials.extract_mongodb_database() - else: - # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file - pickle_file_to_load = f"{replica_path_set}/tries.pkl" - log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) - trials = FileTrials.from_pkl(pickle_file_to_load) - # Call to hyperopt.fmin fmin_args = dict( fn=model_trainer.hyperparametrizable, @@ -169,6 +176,8 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= trials.start_mongo_workers() best = hyperopt.fmin(**fmin_args, show_progressbar=True, max_queue_len=trials.num_workers) trials.stop_mongo_workers() + # stop mongod command and compress database + hyperscanner.mongod_runner.stop(mongod) trials.compress_mongodb_database() else: best = hyperopt.fmin(**fmin_args, show_progressbar=False, trials_save_file=trials.pkl_file) @@ -240,6 +249,7 @@ def __init__(self, parameters, sampling_dict, steps=5): self.db_port = sampling_dict.get("db_port") self.db_name = sampling_dict.get("db_name") self.num_mongo_workers = sampling_dict.get("num_mongo_workers") + self.mongod_runner = 
MongodRunner(self.db_name, self.db_port) self.hyper_keys = set([]) diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index 64f2a877f5..c3209cceb7 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -1,6 +1,6 @@ """ Hyperopt trial object for parallel hyperoptimization with MongoDB. - Data are fetched from MongoDB databases and stored in the form of json files within the nnfit folder + Data are fetched from MongoDB databases and stored in the form of json and tar.gz files within the nnfit folder. """ import glob import json @@ -59,6 +59,52 @@ def convert_bson_to_dict(obj): return obj +class MongodRunner: + """Class to manage a MongoDB instance. + + This class is responsible for automatically creating and managing a MongoDB database + using the `mongod` command. It allows for starting and stopping a MongoDB instance + programmatically. + + Parameters + ---------- + db_port: int + MongoDB database connection port. Defaults to 27017. + db_name: str + MongoDB database name. Defaults to "hyperopt-db". + """ + + def __init__(self, db_name="hyperopt-db", db_port=27017): + self.db_name = db_name + self.db_port = db_port + + def ensure_database_dir_exists(self): + """Check if MongoDB database directory exists.""" + if not os.path.exists(f"{self.db_name}"): + log.info(f"Creating MongoDB database dir {self.db_name}") + os.makedirs(self.db_name, exist_ok=True) + + def start(self): + """Starts the MongoDB instance via `mongod` command.""" + args = ["mongod", "-quiet", "--dbpath", self.db_name, "--port", str(self.db_port)] + try: + mongod = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + log.info(f"Started MongoDB database {self.db_name}") + return mongod + except OSError as err: + msg = f"Failed to execute {args}. Make sure you have MongoDB installed." 
+ raise EnvironmentError(msg) from err + + def stop(self, mongod): + """Stops `mongod` command.""" + try: + mongod.terminate() + mongod.wait() + log.info(f"Stopped mongod") + except Exception as err: + log.error(f"Failed to stop mongod: {err}") + + class MongoFileTrials(MongoTrials): """ MongoDB implementation of :class:`n3fit.hyper_optimization.filetrials.FileTrials`. @@ -190,8 +236,8 @@ def start_mongo_workers( # my_env["TF_GPU_ALLOCATOR"] = "cuda_malloc_async" # run mongo workers + # we could use stdout=subprocess.DEVNULL and stderr=subprocess.DEVNULL in Popen to suppress output info worker = subprocess.Popen(args, env=my_env) - # we could use `stderr=subprocess.DEVNULL` in Popen to suppress output info self.workers.append(worker) log.info(f"Started mongo worker {i+1}/{self.num_workers}") except OSError as err: @@ -228,16 +274,17 @@ def compress_mongodb_database(self): except subprocess.CalledProcessError as err: raise RuntimeError(f"Error compressing the database: {err}") - def extract_mongodb_database(self): + @staticmethod + def extract_mongodb_database(database_tar_file): """Untar MongoDB database for use in restarts.""" # check if the database tar file exist - if not os.path.exists(f"{self.database_tar_file}"): + if not os.path.exists(f"{database_tar_file}"): raise FileNotFoundError( - f"The MongoDB database tar file '{self.database_tar_file}' does not exist." + f"The MongoDB database tar file '{database_tar_file}' does not exist." 
) # extract tar file try: - log.info(f"Extracting MongoDB database from {self.database_tar_file}") - subprocess.run(['tar', '-xvf', f'{self.database_tar_file}'], check=True) + log.info(f"Extracting MongoDB database from {database_tar_file}") + subprocess.run(['tar', '-xvf', f'{database_tar_file}'], check=True) except subprocess.CalledProcessError as err: raise RuntimeError(f"Error extracting the database: {err}") From 1e5a0d4631f02ebf99f1d1c5a126bd7d8bd47934 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Thu, 22 Feb 2024 13:40:49 +0100 Subject: [PATCH 20/35] Added new test --- n3fit/src/n3fit/tests/test_hyperopt.py | 99 +++++++++++++++++--------- 1 file changed, 64 insertions(+), 35 deletions(-) diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index 7aa740e71a..2be12cf424 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -173,35 +173,6 @@ def test_restart_from_pickle(tmp_path): # Note that it doesn't check the final loss of the second trial -def start_mongo_database(tmp_path): - """Creates MongoDB database and returns the Popen object.""" - db_command = ["mongod", "--dbpath", f"{tmp_path}/hyperopt-db"] - directory_path = f"{tmp_path}/hyperopt-db" - try: - # create database directory - sp.run(["mkdir", "-p", directory_path], check=True) - # launch database - process = sp.Popen(db_command, cwd=tmp_path) - return process - except (sp.CalledProcessError, OSError) as err: - msg = f"Error creating directory or executing {db_command}: {err}" - raise EnvironmentError(msg) from err - - -def stop_mongod_command(process): - """Stops the MongoDB database.""" - # directory_path = f"{tmp_path}/hyperopt" - try: - # stop mongod command - process.terminate() - process.wait() - # remove database files - # sp.run(f"rm -r {directory_path} && rm -r {tmp_path}/65*", check=True) - except (sp.CalledProcessError, OSError) as err: - msg = f"Error stopping the MongoDB process or removing database 
files: {err}" - raise EnvironmentError(msg) from err - - def test_parallel_hyperopt(tmp_path): """Ensure that the parallel implementation of hyperopt with MongoDB works as expected.""" # Prepare the run @@ -229,9 +200,6 @@ def test_parallel_hyperopt(tmp_path): end_time = time.time() sequential_run_time = end_time - start_time - # Generate on-the-fly a real MongoDB database - process = start_mongo_database(tmp_path) - # Run hyperopt in parallel start_time = time.time() sp.run( @@ -244,9 +212,6 @@ def test_parallel_hyperopt(tmp_path): end_time = time.time() parallel_run_time = end_time - start_time - # Stop mongod command - stop_mongod_command(process) - # Read up generated json files sequential_json_path = f"{output_sequential}/nnfit/replica_{REPLICA}/tries.json" sequential_json = load_data(sequential_json_path) @@ -266,3 +231,67 @@ def test_parallel_hyperopt(tmp_path): # Note: cannot check that they share exactly the same history # as the hyperopt algorithm depends on the results from previous runs # which is obviously different between parallel and sequential runs + + +def clean_up_database(tmp_path): + """Stops the MongoDB database.""" + directory_path = f"{tmp_path}/hyperopt-db" + try: + sp.run(f"rm -r {directory_path} {tmp_path}/65*", shell=True, check=True) + except (sp.CalledProcessError, OSError) as err: + msg = f"Error cleaning up database: {err}" + raise EnvironmentError(msg) from err + + +def test_restart_from_tar(tmp_path): + """Ensure that our parallel hyperopt restart works as expected""" + # Prepare the run + quickcard = f"hyper-{QUICKNAME}.yml" + quickpath = REGRESSION_FOLDER / quickcard + + # Set up some options + n_mongo_workers = 3 + n_trials_stop = 3 + n_trials_total = 6 + output = tmp_path / "output" + + # cp runcard to tmp folder + shutil.copy(quickpath, tmp_path) + # run some trials for the first time + sp.run( + f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_stop} " + f"--parallel-hyperopt --num-mongo-workers {n_mongo_workers} " + f"-o 
{output}".split(), + cwd=tmp_path, + check=True, + ) + json_path = f"{output}/nnfit/replica_{REPLICA}/tries.json" + initial_json = load_data(json_path) + + # check if the calculation went well + assert len(initial_json) == n_trials_stop + + # just in case, remove old database files to ensure that the restart occurs via tar file + clean_up_database(tmp_path) + + # restart and calculate more trials + sp.run( + f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} " + f"--parallel-hyperopt --num-mongo-workers {n_mongo_workers} " + f"-o {output}".split(), + cwd=tmp_path, + check=True, + ) + final_json = load_data(json_path) + + # check if the run completed all trials + assert len(final_json) == n_trials_total + + print(initial_json) + + for i in range(n_trials_stop): + # check that the files share exactly the same hyperopt history until the restart + assert initial_json[i]['misc'] == final_json[i]['misc'] + assert initial_json[i]['state'] == final_json[i]['state'] + assert initial_json[i]['tid'] == final_json[i]['tid'] + # assert initial_json[i]['result'] == final_json[i]['result'] From 13a98eaa8f2063ccebddbd7f8e9c34aff16f7259 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Fri, 23 Feb 2024 09:37:06 +0100 Subject: [PATCH 21/35] Added directoryperdb option to mongod to eliminate the need for the '65*' dirs --- .../hyper_optimization/mongofiletrials.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index c3209cceb7..0084f40bec 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -2,7 +2,6 @@ Hyperopt trial object for parallel hyperoptimization with MongoDB. Data are fetched from MongoDB databases and stored in the form of json and tar.gz files within the nnfit folder. 
""" -import glob import json import logging import os @@ -86,7 +85,15 @@ def ensure_database_dir_exists(self): def start(self): """Starts the MongoDB instance via `mongod` command.""" - args = ["mongod", "-quiet", "--dbpath", self.db_name, "--port", str(self.db_port)] + args = [ + "mongod", + "-quiet", + "--dbpath", + self.db_name, + "--port", + str(self.db_port), + "--directoryperdb", + ] try: mongod = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) log.info(f"Started MongoDB database {self.db_name}") @@ -142,7 +149,9 @@ def __init__( self.db_port = str(db_port) self.db_name = db_name self.num_workers = num_workers - self.mongotrials_arg = f"mongo://{self.db_host}:{self.db_port}/{self.db_name}/jobs" + self.mongotrials_arg = ( + f"mongo://{self.db_host}:{self.db_port}/{self._process_db_name(self.db_name)}/jobs" + ) self.workers = [] self._store_trial = False @@ -154,6 +163,14 @@ def __init__( super().__init__(self.mongotrials_arg, *args, **kwargs) + def _process_db_name(self, db_name): + """Checks if db_name contains a slash, indicating a "directory/db" format.""" + if '/' in db_name: + # Split the string by '/' and take the last part as the db name + db_name_parts = db_name.split('/') + db_name = db_name_parts[-1] + return db_name + @property def rstate(self): """Returns the rstate attribute; see :class:`n3fit.hyper_optimization.filetrials.FileTrials`.""" @@ -259,7 +276,7 @@ def stop_mongo_workers(self): def compress_mongodb_database(self): """Saves MongoDB database as tar file""" # check if the database exist - if not os.path.exists(f"{self.db_name}" and not glob.glob('65*')): + if not os.path.exists(f"{self.db_name}"): raise FileNotFoundError( f"The MongoDB database directory '{self.db_name}' does not exist. " "Ensure it has been initiated correctly and it is in your path." 
@@ -268,8 +285,7 @@ def compress_mongodb_database(self): try: log.info(f"Compressing MongoDB database into {self.database_tar_file}") subprocess.run( - ['tar', '-cvf', f'{self.database_tar_file}', f'{self.db_name}'] + glob.glob('65*'), - check=True, + ['tar', '-cvf', f'{self.database_tar_file}', f'{self.db_name}'], check=True ) except subprocess.CalledProcessError as err: raise RuntimeError(f"Error compressing the database: {err}") From 1d215696400d197d7d968cb288ca5762f87fd93b Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Fri, 23 Feb 2024 11:38:54 +0100 Subject: [PATCH 22/35] Added tarfile package to handle compression and extraction of tar files --- .../hyper_optimization/mongofiletrials.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index 0084f40bec..d9e35fde59 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -6,6 +6,7 @@ import logging import os import subprocess +import tarfile from bson import SON, ObjectId from hyperopt.mongoexp import MongoTrials @@ -284,23 +285,28 @@ def compress_mongodb_database(self): # create the tar.gz file try: log.info(f"Compressing MongoDB database into {self.database_tar_file}") - subprocess.run( - ['tar', '-cvf', f'{self.database_tar_file}', f'{self.db_name}'], check=True - ) - except subprocess.CalledProcessError as err: + with tarfile.open(self.database_tar_file, "w:gz") as tar: + tar.add(self.db_name) + except tarfile.TarError as err: raise RuntimeError(f"Error compressing the database: {err}") @staticmethod - def extract_mongodb_database(database_tar_file): + def extract_mongodb_database(database_tar_file, path='.'): """Untar MongoDB database for use in restarts.""" # check if the database tar file exist if not os.path.exists(f"{database_tar_file}"): raise FileNotFoundError( f"The MongoDB 
database tar file '{database_tar_file}' does not exist." ) + # check of the provided file is a tar type + if not tarfile.is_tarfile(database_tar_file): + raise tarfile.ReadError( + f"The file '{database_tar_file}' provided is not a tar file type." + ) # extract tar file try: log.info(f"Extracting MongoDB database from {database_tar_file}") - subprocess.run(['tar', '-xvf', f'{database_tar_file}'], check=True) - except subprocess.CalledProcessError as err: + with tarfile.open(f"{database_tar_file}") as tar: + tar.extractall(path) + except tarfile.TarError as err: raise RuntimeError(f"Error extracting the database: {err}") From eb1134047a5816ea8b38a483e6fc3695a7cfbb91 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Sat, 24 Feb 2024 09:51:15 +0100 Subject: [PATCH 23/35] Update and fix test --- n3fit/src/n3fit/tests/test_hyperopt.py | 39 +++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index 2be12cf424..a3e8c35c31 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -5,6 +5,7 @@ import pathlib import shutil import subprocess as sp +import tarfile import time import numpy as np @@ -237,14 +238,25 @@ def clean_up_database(tmp_path): """Stops the MongoDB database.""" directory_path = f"{tmp_path}/hyperopt-db" try: - sp.run(f"rm -r {directory_path} {tmp_path}/65*", shell=True, check=True) + sp.run(f"rm -r {directory_path}", shell=True, check=True) except (sp.CalledProcessError, OSError) as err: msg = f"Error cleaning up database: {err}" raise EnvironmentError(msg) from err +def get_tar_size(filetar): + """Returns the size of a tar file.""" + + def tar_size(tar): + return sum(member.size for member in tar.getmembers()) + + with tarfile.open(filetar, 'r') as tar: + size = tar_size(tar) + return size + + def test_restart_from_tar(tmp_path): - """Ensure that our parallel hyperopt restart works as expected""" + 
"""Ensure that our parallel hyperopt restart works as expected.""" # Prepare the run quickcard = f"hyper-{QUICKNAME}.yml" quickpath = REGRESSION_FOLDER / quickcard @@ -267,9 +279,8 @@ def test_restart_from_tar(tmp_path): ) json_path = f"{output}/nnfit/replica_{REPLICA}/tries.json" initial_json = load_data(json_path) - - # check if the calculation went well - assert len(initial_json) == n_trials_stop + initial_tar = f"{output}/nnfit/replica_{REPLICA}/hyperopt-db.tar.gz" + initial_tar_size = get_tar_size(initial_tar) # just in case, remove old database files to ensure that the restart occurs via tar file clean_up_database(tmp_path) @@ -278,20 +289,28 @@ def test_restart_from_tar(tmp_path): sp.run( f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} " f"--parallel-hyperopt --num-mongo-workers {n_mongo_workers} " - f"-o {output}".split(), + f"-o {output} --restart".split(), cwd=tmp_path, check=True, ) final_json = load_data(json_path) + final_tar = f"{output}/nnfit/replica_{REPLICA}/hyperopt-db.tar.gz" + final_tar_size = get_tar_size(final_tar) - # check if the run completed all trials + # check if the calculations went well + assert len(initial_json) == n_trials_stop assert len(final_json) == n_trials_total - print(initial_json) + # check if the tar files were generated correctly + assert tarfile.is_tarfile(initial_tar) is True + assert tarfile.is_tarfile(final_tar) is True + + # check if the final tar file was updated after restart + assert final_tar_size > initial_tar_size for i in range(n_trials_stop): - # check that the files share exactly the same hyperopt history until the restart + # check that the json files share exactly the same hyperopt history until the restart assert initial_json[i]['misc'] == final_json[i]['misc'] assert initial_json[i]['state'] == final_json[i]['state'] assert initial_json[i]['tid'] == final_json[i]['tid'] - # assert initial_json[i]['result'] == final_json[i]['result'] + assert initial_json[i]['result'] == final_json[i]['result'] 
From 5e8e48c491ccdbcbd692c2dc19f2c5ca918e6cf9 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Sat, 24 Feb 2024 18:36:51 +0100 Subject: [PATCH 24/35] Set explicitly path to restart and additional keyword options to hyperopt-mongo-worker --- .../n3fit/hyper_optimization/hyper_scan.py | 3 ++- .../hyper_optimization/mongofiletrials.py | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 6ffd6396f8..87bfb8e5f6 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -137,7 +137,8 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= if hyperscanner.parallel_hyperopt: tar_file_to_extract = f"{replica_path_set}/{hyperscanner.db_name}.tar.gz" log.info("Restarting hyperopt run using the MongoDB database %s", tar_file_to_extract) - MongoFileTrials.extract_mongodb_database(tar_file_to_extract) + path = os.getcwd() + MongoFileTrials.extract_mongodb_database(tar_file_to_extract, path) else: # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file pickle_file_to_load = f"{replica_path_set}/tries.pkl" diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index d9e35fde59..6c32ee7185 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -215,7 +215,13 @@ def _insert_trial_docs(self, docs): return super()._insert_trial_docs(docs) def start_mongo_workers( - self, workdir=None, exp_key=None, poll_interval=0.1, use_subprocesses=False + self, + workdir=None, + exp_key=None, + poll_interval=0.1, + use_subprocesses=False, + max_consecutive_failures=4, + reserve_timeout=None, ): """Initiates all mongo workers simultaneously.""" # get the number of gpu cards, if any @@ -236,7 
+242,12 @@ def start_mongo_workers( args.extend(["--workdir", workdir]) if exp_key: args.extend(["--exp-key", exp_key]) - args.extend(["--poll-interval", str(poll_interval)]) + if poll_interval: + args.extend(["--poll-interval", str(poll_interval)]) + if max_consecutive_failures: + args.extend(["--max-consecutive-failures", str(max_consecutive_failures)]) + if reserve_timeout: + args.extend(["--reserve-timeout", str(reserve_timeout)]) if use_subprocesses: args.append("--no-subprocesses") @@ -291,7 +302,7 @@ def compress_mongodb_database(self): raise RuntimeError(f"Error compressing the database: {err}") @staticmethod - def extract_mongodb_database(database_tar_file, path='.'): + def extract_mongodb_database(database_tar_file, path=os.getcwd()): """Untar MongoDB database for use in restarts.""" # check if the database tar file exist if not os.path.exists(f"{database_tar_file}"): @@ -305,7 +316,7 @@ def extract_mongodb_database(database_tar_file, path='.'): ) # extract tar file try: - log.info(f"Extracting MongoDB database from {database_tar_file}") + log.info(f"Extracting MongoDB database {database_tar_file} to {path}") with tarfile.open(f"{database_tar_file}") as tar: tar.extractall(path) except tarfile.TarError as err: From f5c64f74f78081711e47c88e9a5347892c43ac01 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Mon, 26 Feb 2024 11:52:43 +0100 Subject: [PATCH 25/35] Fix in hyper_scan_wrapper --- n3fit/src/n3fit/hyper_optimization/hyper_scan.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 87bfb8e5f6..68e9215e09 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -127,11 +127,6 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= # Tell the trainer we are doing hpyeropt model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys) - 
if hyperscanner.parallel_hyperopt: - # start MongoDB database bu launching `mongod` - hyperscanner.mongod_runner.ensure_database_dir_exists() - mongod = hyperscanner.mongod_runner.start() - if hyperscanner.restart_hyperopt: # For parallel hyperopt restarts, extract the database tar file if hyperscanner.parallel_hyperopt: @@ -145,6 +140,11 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load) trials = FileTrials.from_pkl(pickle_file_to_load) + if hyperscanner.parallel_hyperopt: + # start MongoDB database by launching `mongod` + hyperscanner.mongod_runner.ensure_database_dir_exists() + mongod = hyperscanner.mongod_runner.start() + # Generate the trials object if hyperscanner.parallel_hyperopt: # Instantiate `MongoFileTrials` From 7861c3e38a6aa685ec7e81a7f4fb0dc01cabd844 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Mon, 26 Feb 2024 14:03:57 +0100 Subject: [PATCH 26/35] Update docs --- doc/sphinx/source/n3fit/hyperopt.rst | 41 ++++++++++++---------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index d0a2cebdee..e9ee7e1d86 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -494,34 +494,27 @@ This functionality is provided by the :class:`~n3fit.hyper_optimization.mongofil which extends the capabilities of `hyperopt `_'s `MongoTrials` and enables the simultaneous evaluation of multiple trials. -To set up and run a parallelized hyperopt search, follow these steps: +To run a parallelized hyperopt search, use the following command: - 1. **Instantiate the MongoDB database:** Start by setting up the database in your current directory. - This database is referred to as ``hyperopt-db`` in the following instructions. You can initiate it with the command: - - .. 
code-block:: bash - - mongod --dbpath ./hyperopt-db - - By default, ``mongod`` uses port ``27017``. This is also the default port for the ``n3fit --db-port`` option. - If you wish to use a different port, specify it as follows: ``mongod --dbpath ./hyperopt-db --db-port YOUR_PORT_NUMBER``. - - 2. **Launch NNPDF with MongoDB integration:** Open a new command prompt and run ``n3fit`` with the desired configuration: +.. code-block:: bash - .. code-block:: bash + n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N - n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N +Here, ``N`` represents the number of MongoDB workers you wish to launch in parallel. +Each mongo worker handles one trial in Hyperopt. So, launching more workers allows for the simultaneous calculation of a greater number of trials. +Note that there is no need to manually launch MongoDB databases or mongo workers prior to using ``n3fit``, +as the ``mongod`` and ``hyperopt-mongo-worker`` commands are automatically executed +by :meth:`~n3fit.hyper_optimization.mongofiletrials.MongodRunner.start` and +:meth:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials.start_mongo_workers` methods, respectivelly. +By default, the ``host`` and ``port`` arguments are set to ``localhost`` and ``27017``, while the database is named ``hyperopt-db``. +If necessary, you can modify these settings using the ``n3fit --db-host`` , ``n3fit --db-port`` and ``n3fit --db-name`` options. - Here, ``N`` represents the number of MongoDB workers you wish to launch in parallel. - Each mongo worker handles one trial in Hyperopt. So, launching more workers allows for the simultaneous calculation of a greater number of trials. 
- Note that there is no need to manually launch mongo workers, as the ``hyperopt-mongo-worker`` command is automatically - executed by the :meth:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials.start_mongo_workers` method. - By default, the ``host`` argument is set to ``localhost``, and the database is named ``hyperopt``. - If necessary, you can modify these settings using the ``n3fit --db-host`` or ``n3fit --db-name`` options. +To resume a hyperopt experiment, add the ``--restart`` option to the ``n3fit`` command: +.. code-block:: bash -.. note:: + n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N --restart - Unlike in serial execution, parallel hyperoptimization runs do not generate ``tries.pkl`` files. - To resume an experiment, simply retain the MongoDB database created during your previous run. - Then, follow steps 1 and 2 as described above to restart the experiment. +Note that, unlike in serial execution, parallel hyperoptimization runs do not generate ``tries.pkl`` files. +Instead, MongoDB databases are saved as ``hyperopt-db.tar.gz`` files inside ``replica_path`` directory. +These are conveniently extracted for reuse in restart runs. 
From 8cb95ccbf4c1782966e3dad8e4e0602f8c762aa2 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Mon, 26 Feb 2024 16:34:12 +0100 Subject: [PATCH 27/35] Updated 'max-consecutive-failures' and 'reserve-timeout' arguments as indicated by Aron --- n3fit/src/n3fit/hyper_optimization/mongofiletrials.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py index 6c32ee7185..c5e2d2cbd0 100644 --- a/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py +++ b/n3fit/src/n3fit/hyper_optimization/mongofiletrials.py @@ -219,9 +219,9 @@ def start_mongo_workers( workdir=None, exp_key=None, poll_interval=0.1, - use_subprocesses=False, - max_consecutive_failures=4, - reserve_timeout=None, + no_subprocesses=False, + max_consecutive_failures=10, + reserve_timeout=600, ): """Initiates all mongo workers simultaneously.""" # get the number of gpu cards, if any @@ -248,7 +248,7 @@ def start_mongo_workers( args.extend(["--max-consecutive-failures", str(max_consecutive_failures)]) if reserve_timeout: args.extend(["--reserve-timeout", str(reserve_timeout)]) - if use_subprocesses: + if no_subprocesses: args.append("--no-subprocesses") # start the worker as a subprocess From 279693963c0e33bac3e81bc409116f13a2428f90 Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Wed, 6 Mar 2024 09:09:18 +0100 Subject: [PATCH 28/35] Update hyperopt.rst Co-authored-by: Tanjona Rabemananjara --- doc/sphinx/source/n3fit/hyperopt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index e9ee7e1d86..8a1593155d 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -489,7 +489,7 @@ less than ``20``. 
If there are ``20`` or more saved trials, ``n3fit`` will simpl Running hyperoptimizations in parallel with MongoDB --------------------------------------------------- -In NNPDF, you can effectively run hyperoptimization experiments in parallel using `MongoDB `_. +It is possible to run hyperoptimization scans in parallel using `MongoDB `_. This functionality is provided by the :class:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials` class, which extends the capabilities of `hyperopt `_'s `MongoTrials` and enables the simultaneous evaluation of multiple trials. From 594f76bbc7a7a03070f7786e64697f375ed19379 Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Wed, 6 Mar 2024 09:09:39 +0100 Subject: [PATCH 29/35] Update hyperopt.rst Co-authored-by: Tanjona Rabemananjara --- doc/sphinx/source/n3fit/hyperopt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index 8a1593155d..dd82589ff5 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -507,7 +507,7 @@ as the ``mongod`` and ``hyperopt-mongo-worker`` commands are automatically execu by :meth:`~n3fit.hyper_optimization.mongofiletrials.MongodRunner.start` and :meth:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials.start_mongo_workers` methods, respectivelly. By default, the ``host`` and ``port`` arguments are set to ``localhost`` and ``27017``, while the database is named ``hyperopt-db``. -If necessary, you can modify these settings using the ``n3fit --db-host`` , ``n3fit --db-port`` and ``n3fit --db-name`` options. +If necessary, it is possible modify these settings using the ``n3fit --db-host`` , ``n3fit --db-port`` and ``n3fit --db-name`` options. 
To resume a hyperopt experiment, add the ``--restart`` option to the ``n3fit`` command: From 797eb98ca6de52ce1b212ee41aac71107daa7c0e Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Wed, 6 Mar 2024 09:10:11 +0100 Subject: [PATCH 30/35] Update hyper_scan.py Co-authored-by: Tanjona Rabemananjara --- n3fit/src/n3fit/hyper_optimization/hyper_scan.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 68e9215e09..c3c4e88171 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -132,8 +132,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= if hyperscanner.parallel_hyperopt: tar_file_to_extract = f"{replica_path_set}/{hyperscanner.db_name}.tar.gz" log.info("Restarting hyperopt run using the MongoDB database %s", tar_file_to_extract) - path = os.getcwd() - MongoFileTrials.extract_mongodb_database(tar_file_to_extract, path) + MongoFileTrials.extract_mongodb_database(tar_file_to_extract, path=os.getcwd()) else: # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file pickle_file_to_load = f"{replica_path_set}/tries.pkl" From 543d2d586a91b3820de2f1d70acdee570a354702 Mon Sep 17 00:00:00 2001 From: Aron Date: Thu, 7 Mar 2024 16:03:02 +0100 Subject: [PATCH 31/35] Add runcard to mongodb database name --- n3fit/src/n3fit/hyper_optimization/hyper_scan.py | 4 +++- n3fit/src/n3fit/scripts/n3fit_exec.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index c3c4e88171..0dddfc37d4 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -245,9 +245,11 @@ def __init__(self, parameters, 
sampling_dict, steps=5): # setting up MondoDB options if self.parallel_hyperopt: + # add output_path to db name to avoid conflicts + db_name = f'{sampling_dict.get("db_name")}-{sampling_dict.get("output_path")}' self.db_host = sampling_dict.get("db_host") self.db_port = sampling_dict.get("db_port") - self.db_name = sampling_dict.get("db_name") + self.db_name = db_name self.num_mongo_workers = sampling_dict.get("num_mongo_workers") self.mongod_runner = MongodRunner(self.db_name, self.db_port) diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index fb9de3d081..8364db1e3f 100755 --- a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -241,6 +241,7 @@ def produce_hyperscanner(self, parameters, hyperscan_config=None, hyperopt=None) 'db_host': self.environment.db_host, 'db_port': self.environment.db_port, 'db_name': self.environment.db_name, + 'output_path': self.environment.output_path.name, 'num_mongo_workers': self.environment.num_mongo_workers, } ) From 95e4d1988b1f7d341f50aff1b1c8b17185028f7f Mon Sep 17 00:00:00 2001 From: Aron Date: Thu, 7 Mar 2024 16:46:31 +0100 Subject: [PATCH 32/35] Fix restart test --- n3fit/src/n3fit/tests/test_hyperopt.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index a3e8c35c31..c400b33d1f 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -278,9 +278,9 @@ def test_restart_from_tar(tmp_path): check=True, ) json_path = f"{output}/nnfit/replica_{REPLICA}/tries.json" + tar_name = f"{output}/nnfit/replica_{REPLICA}/hyperopt-db-hyper-{QUICKNAME}.tar.gz" initial_json = load_data(json_path) - initial_tar = f"{output}/nnfit/replica_{REPLICA}/hyperopt-db.tar.gz" - initial_tar_size = get_tar_size(initial_tar) + initial_tar_size = get_tar_size(tar_name) # just in case, remove old database files to ensure that the 
restart occurs via tar file clean_up_database(tmp_path) @@ -294,16 +294,14 @@ def test_restart_from_tar(tmp_path): check=True, ) final_json = load_data(json_path) - final_tar = f"{output}/nnfit/replica_{REPLICA}/hyperopt-db.tar.gz" - final_tar_size = get_tar_size(final_tar) + final_tar_size = get_tar_size(tar_name) # check if the calculations went well assert len(initial_json) == n_trials_stop assert len(final_json) == n_trials_total # check if the tar files were generated correctly - assert tarfile.is_tarfile(initial_tar) is True - assert tarfile.is_tarfile(final_tar) is True + assert tarfile.is_tarfile(tar_name) is True # check if the final tar file was updated after restart assert final_tar_size > initial_tar_size From e122cb56376983127547217d3983e8ece2cb4ff6 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Thu, 7 Mar 2024 21:02:38 +0100 Subject: [PATCH 33/35] Fix test database name and path --- n3fit/src/n3fit/tests/test_hyperopt.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index c400b33d1f..294addb587 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -234,9 +234,9 @@ def test_parallel_hyperopt(tmp_path): # which is obviously different between parallel and sequential runs -def clean_up_database(tmp_path): +def clean_up_database(tmp_path, database_name): """Stops the MongoDB database.""" - directory_path = f"{tmp_path}/hyperopt-db" + directory_path = f"{tmp_path}/{database_name}" try: sp.run(f"rm -r {directory_path}", shell=True, check=True) except (sp.CalledProcessError, OSError) as err: @@ -266,6 +266,7 @@ def test_restart_from_tar(tmp_path): n_trials_stop = 3 n_trials_total = 6 output = tmp_path / "output" + database_name = f"hyperopt-db-{output.name}" # cp runcard to tmp folder shutil.copy(quickpath, tmp_path) @@ -278,12 +279,12 @@ def test_restart_from_tar(tmp_path): check=True, ) json_path = 
f"{output}/nnfit/replica_{REPLICA}/tries.json" - tar_name = f"{output}/nnfit/replica_{REPLICA}/hyperopt-db-hyper-{QUICKNAME}.tar.gz" + tar_name = f"{output}/nnfit/replica_{REPLICA}/{database_name}.tar.gz" initial_json = load_data(json_path) initial_tar_size = get_tar_size(tar_name) # just in case, remove old database files to ensure that the restart occurs via tar file - clean_up_database(tmp_path) + clean_up_database(tmp_path, database_name) # restart and calculate more trials sp.run( From 1fbbddb37ff43624b6ede3df86a2bd5051a1303b Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Thu, 7 Mar 2024 21:03:35 +0100 Subject: [PATCH 34/35] Update docs with database name changes --- doc/sphinx/source/n3fit/hyperopt.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index dd82589ff5..7c7ba903e9 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -506,8 +506,9 @@ Note that there is no need to manually launch MongoDB databases or mongo workers as the ``mongod`` and ``hyperopt-mongo-worker`` commands are automatically executed by :meth:`~n3fit.hyper_optimization.mongofiletrials.MongodRunner.start` and :meth:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials.start_mongo_workers` methods, respectivelly. -By default, the ``host`` and ``port`` arguments are set to ``localhost`` and ``27017``, while the database is named ``hyperopt-db``. -If necessary, it is possible modify these settings using the ``n3fit --db-host`` , ``n3fit --db-port`` and ``n3fit --db-name`` options. +By default, the ``host`` and ``port`` arguments are set to ``localhost`` and ``27017``. The database is named ``hyperopt-db-output_name``, where +``output_name`` is set to the name of the runcard. If the ``n3fit -o OUTPUT`` option is provided, ``output_name`` is set to ``OUTPUT``, with the database being referred to as ``hyperopt-db-OUTPUT``. 
+If necessary, it is possible to modify all the above settings using the ``n3fit --db-host``, ``n3fit --db-port`` and ``n3fit --db-name`` options.
 
 To resume a hyperopt experiment, add the ``--restart`` option to the ``n3fit`` command:
 
@@ -516,5 +517,5 @@ To resume a hyperopt experiment, add the ``--restart`` option to the ``n3fit`` c
     n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N --restart
 
 Note that, unlike in serial execution, parallel hyperoptimization runs do not generate ``tries.pkl`` files.
-Instead, MongoDB databases are saved as ``hyperopt-db.tar.gz`` files inside ``replica_path`` directory.
+Instead, MongoDB databases are saved as ``hyperopt-db-output_name.tar.gz`` files inside the ``replica_path`` directory.
 These are conveniently extracted for reuse in restart runs.

From 3b977519c371bd6663f86a0e6202cb4fad20f472 Mon Sep 17 00:00:00 2001
From: Cmurilochem
Date: Fri, 8 Mar 2024 07:51:35 +0100
Subject: [PATCH 35/35] Remove unused import

---
 n3fit/src/n3fit/hyper_optimization/hyper_scan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py
index 0dddfc37d4..58681c687f 100644
--- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py
+++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py
@@ -14,7 +14,7 @@
 """
 import copy
 import logging
-from typing import Callable
+import os
 
 import hyperopt
 from hyperopt.pyll.base import scope