Merged

35 commits
3af8d5a
Added 'MongoFileTrials' class
Cmurilochem Jan 26, 2024
eef95ca
Parsed mongodb option to 'n3fit' command and 'HyperScanner'
Cmurilochem Jan 29, 2024
fed400e
Adapted 'hyper_scan_wrapper' to allow for parallel evaluation of fmin…
Cmurilochem Jan 30, 2024
65f6ce8
Add mondodb and pymongo in conda recipe and pymongo in 'pyproject.toml'
Cmurilochem Jan 30, 2024
1440cee
Added integration test
Cmurilochem Jan 31, 2024
0bb29c6
Add documentation
Cmurilochem Feb 19, 2024
49ee8df
Refactored fmin call
Cmurilochem Feb 19, 2024
3b56939
Fix in fmin call comment format
Cmurilochem Feb 19, 2024
dd05559
Fix in MongoFileTrials logging
Cmurilochem Feb 19, 2024
c8e6461
Updated docs: replace 'initiate' by 'instantiate'
Cmurilochem Feb 21, 2024
0db5de9
Update doc/sphinx/source/n3fit/hyperopt.rst
Cmurilochem Feb 21, 2024
63c244d
Updated docs: replace database name to 'hyperopt-db'
Cmurilochem Feb 21, 2024
9576bdb
Update doc/sphinx/source/n3fit/hyperopt.rst
Cmurilochem Feb 21, 2024
1d673ec
Changed default database name to 'hyperopt-db'
Cmurilochem Feb 21, 2024
6c50237
Removed unused 'set_tf_visible_device' function
Cmurilochem Feb 21, 2024
fdf3458
Added validation to hyperopt-related arguments
Cmurilochem Feb 21, 2024
e901ea7
Moved 'get_physical_gpus' to keras_backend 'internal_state.py'
Cmurilochem Feb 21, 2024
6256aff
Added initial MongoFileTrials methods to allow for restarts
Cmurilochem Feb 21, 2024
c72cbad
Added 'MongodRunner' class to automate mongod launch and allow for re…
Cmurilochem Feb 22, 2024
1e5a0d4
Added new test
Cmurilochem Feb 22, 2024
13a98ea
Added directoryperdb option to mongod to eliminate the need for the '…
Cmurilochem Feb 23, 2024
1d21569
Added tarfile package to handle compression and extraction of tar files
Cmurilochem Feb 23, 2024
eb11340
Update and fix test
Cmurilochem Feb 24, 2024
5e8e48c
Set explicitly path to restart and additional keyword options to hype…
Cmurilochem Feb 24, 2024
f5c64f7
Fix in hyper_scan_wrapper
Cmurilochem Feb 26, 2024
7861c3e
Update docs
Cmurilochem Feb 26, 2024
8cb95cc
Updated 'max-consecutive-failures' and 'reserve-timeout' arguments as…
Cmurilochem Feb 26, 2024
2796939
Update hyperopt.rst
Cmurilochem Mar 6, 2024
594f76b
Update hyperopt.rst
Cmurilochem Mar 6, 2024
797eb98
Update hyper_scan.py
Cmurilochem Mar 6, 2024
543d2d5
Add runcard to mongodb database name
APJansen Mar 7, 2024
95e4d19
Fix restart test
APJansen Mar 7, 2024
e122cb5
Fix test database name and path
Cmurilochem Mar 7, 2024
1fbbddb
Update docs with database name changes
Cmurilochem Mar 7, 2024
3b97751
Remove unused import
Cmurilochem Mar 8, 2024
5 changes: 5 additions & 0 deletions .github/workflows/python_installation.yml
Original file line number Diff line number Diff line change
@@ -34,6 +34,11 @@ jobs:
conda config --append channels conda-forge
conda config --set show_channel_urls true
conda install lhapdf pandoc
- name: Install MongoDB for parallel hyperopts
shell: bash -l {0}
run: |
conda install mongodb
mongod --version
- name: Install nnpdf with testing and qed extras
shell: bash -l {0}
run: |
2 changes: 2 additions & 0 deletions conda-recipe/meta.yaml
@@ -26,6 +26,8 @@ requirements:
- psutil # to ensure n3fit affinity is with the right processors
- blas==1.0 *mkl* # [osx] # Host's blas is mkl, force also runtime blas to be
- hyperopt
- mongodb
- pymongo <4
- seaborn
- lhapdf
- sqlite
35 changes: 35 additions & 0 deletions doc/sphinx/source/n3fit/hyperopt.rst
@@ -484,3 +484,38 @@ To achieve this, you can use the ``--restart`` option within the ``n3fit`` comma

The above command example is effective when the number of saved trials in the ``test_run/nnfit/replica_1/tries.pkl`` is
less than ``20``. If there are ``20`` or more saved trials, ``n3fit`` will simply terminate, displaying the best results.
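The ``tries.pkl`` mechanism described above can be sketched with Python's ``pickle`` module. This is a simplified illustration only; the actual ``FileTrials``/``FileTrials.from_pkl`` implementation may differ:

```python
import pickle

def save_trials(trials, path):
    # Persist the current trials state so a later run can resume from it
    with open(path, "wb") as f:
        pickle.dump(trials, f)

def load_trials(path):
    # Restore a previously saved trials state, e.g. for a --restart run
    with open(path, "rb") as f:
        return pickle.load(f)
```
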


Running hyperoptimizations in parallel with MongoDB
---------------------------------------------------

It is possible to run hyperoptimization scans in parallel using `MongoDB <https://www.mongodb.com>`_.
This functionality is provided by the :class:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials` class,
which extends the capabilities of `hyperopt <https://github.com/hyperopt/hyperopt>`_'s `MongoTrials` and enables the
simultaneous evaluation of multiple trials.

To run a parallelized hyperopt search, use the following command:

.. code-block:: bash

n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N

Here, ``N`` represents the number of MongoDB workers you wish to launch in parallel.
Each mongo worker handles one trial at a time, so launching more workers allows more trials to be evaluated simultaneously.
Note that there is no need to manually launch MongoDB databases or mongo workers prior to using ``n3fit``,
as the ``mongod`` and ``hyperopt-mongo-worker`` commands are automatically executed
by the :meth:`~n3fit.hyper_optimization.mongofiletrials.MongodRunner.start` and
:meth:`~n3fit.hyper_optimization.mongofiletrials.MongoFileTrials.start_mongo_workers` methods, respectively.
By default, the ``host`` and ``port`` arguments are set to ``localhost`` and ``27017``. The database is named ``hyperopt-db-output_name``, where
``output_name`` is the name of the runcard. If the ``n3fit -o OUTPUT`` option is provided, ``output_name`` is set to ``OUTPUT`` and the database is named ``hyperopt-db-OUTPUT``.
If necessary, all of the above settings can be modified with the ``--db-host``, ``--db-port``, and ``--db-name`` options of ``n3fit``.

To resume a hyperopt experiment, add the ``--restart`` option to the ``n3fit`` command:

.. code-block:: bash

n3fit hyper-quickcard.yml 1 -r N_replicas --hyperopt N_trials --parallel-hyperopt --num-mongo-workers N --restart

Note that, unlike in serial execution, parallel hyperoptimization runs do not generate ``tries.pkl`` files.
Instead, MongoDB databases are saved as ``hyperopt-db-output_name.tar.gz`` files inside the ``replica_path`` directory.
These are automatically extracted for reuse in restarted runs.
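The compression and extraction of the database archive can be sketched with Python's ``tarfile`` module. This is a simplified sketch; the actual ``MongoFileTrials`` helpers may behave differently:

```python
import tarfile
from pathlib import Path

def compress_db(db_dir, tar_path):
    # Pack the whole database directory into a .tar.gz archive
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(db_dir, arcname=Path(db_dir).name)

def extract_db(tar_path, dest):
    # Unpack the archive so a restarted run can reuse the database files
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(path=dest)
```
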
1 change: 1 addition & 0 deletions n3fit/src/n3fit/backends/__init__.py
@@ -15,6 +15,7 @@
)
from n3fit.backends.keras_backend.internal_state import (
clear_backend_state,
get_physical_gpus,
set_eager,
set_initial_state,
)
11 changes: 11 additions & 0 deletions n3fit/src/n3fit/backends/keras_backend/internal_state.py
@@ -143,3 +143,14 @@ def set_initial_state(debug=False, external_seed=None, max_cores=None, double_pr
# Once again, if in debug mode or external_seed set, set also the TF seed
if debug or external_seed:
tf.random.set_seed(use_seed)


def get_physical_gpus():
"""
Retrieve a list of all physical GPU devices available in the system.

Returns
-------
list
    A list of TensorFlow physical devices of type 'GPU'.
"""
return tf.config.list_physical_devices('GPU')
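In a multi-worker setup, a helper like ``get_physical_gpus`` can be combined with per-process device selection. The sketch below is purely hypothetical (``pin_worker_to_gpu`` is not part of this codebase) and shows one common approach using the ``CUDA_VISIBLE_DEVICES`` environment variable:

```python
import os

def pin_worker_to_gpu(worker_index, num_gpus):
    # Hypothetical helper: round-robin assignment of workers to GPUs.
    # Must run before TensorFlow initializes its CUDA context.
    if num_gpus > 0:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(worker_index % num_gpus)
```
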
72 changes: 60 additions & 12 deletions n3fit/src/n3fit/hyper_optimization/hyper_scan.py
@@ -14,14 +14,15 @@
"""
import copy
import logging
from typing import Callable
import os

import hyperopt
from hyperopt.pyll.base import scope
import numpy as np

from n3fit.backends import MetaLayer, MetaModel
from n3fit.hyper_optimization.filetrials import FileTrials
from n3fit.hyper_optimization.mongofiletrials import MongodRunner, MongoFileTrials

log = logging.getLogger(__name__)

@@ -125,28 +126,61 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
"""
# Tell the trainer we are doing hpyeropt
model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys)  # Tell the trainer we are doing hyperopt

if hyperscanner.restart_hyperopt:
# For parallel hyperopt restarts, extract the database tar file
if hyperscanner.parallel_hyperopt:
tar_file_to_extract = f"{replica_path_set}/{hyperscanner.db_name}.tar.gz"
log.info("Restarting hyperopt run using the MongoDB database %s", tar_file_to_extract)
MongoFileTrials.extract_mongodb_database(tar_file_to_extract, path=os.getcwd())
else:
# For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file
pickle_file_to_load = f"{replica_path_set}/tries.pkl"
log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
trials = FileTrials.from_pkl(pickle_file_to_load)

if hyperscanner.parallel_hyperopt:
# start MongoDB database by launching `mongod`
hyperscanner.mongod_runner.ensure_database_dir_exists()
mongod = hyperscanner.mongod_runner.start()

# Generate the trials object
trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
if hyperscanner.parallel_hyperopt:
# Instantiate `MongoFileTrials`
# The Mongo database should already have been started at this point
trials = MongoFileTrials(
replica_path_set,
db_host=hyperscanner.db_host,
db_port=hyperscanner.db_port,
db_name=hyperscanner.db_name,
num_workers=hyperscanner.num_mongo_workers,
parameters=hyperscanner.as_dict(),
)
else:
# Instantiate `FileTrials`
trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())

# Initialize seed for hyperopt
trials.rstate = np.random.default_rng(HYPEROPT_SEED)

# For restarts, reset the state of `FileTrials` saved in the pickle file
if hyperscanner.restart_hyperopt:
pickle_file_to_load = f"{replica_path_set}/tries.pkl"
log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
trials = FileTrials.from_pkl(pickle_file_to_load)

# Perform the scan
best = hyperopt.fmin(
# Call to hyperopt.fmin
fmin_args = dict(
fn=model_trainer.hyperparametrizable,
space=hyperscanner.as_dict(),
algo=hyperopt.tpe.suggest,
max_evals=max_evals,
show_progressbar=False,
trials=trials,
rstate=trials.rstate,
trials_save_file=trials.pkl_file,
)
if hyperscanner.parallel_hyperopt:
trials.start_mongo_workers()
best = hyperopt.fmin(**fmin_args, show_progressbar=True, max_queue_len=trials.num_workers)
trials.stop_mongo_workers()
# stop mongod command and compress database
hyperscanner.mongod_runner.stop(mongod)
trials.compress_mongodb_database()
else:
best = hyperopt.fmin(**fmin_args, show_progressbar=False, trials_save_file=trials.pkl_file)
return hyperscanner.space_eval(best)
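The branching around ``hyperopt.fmin`` above builds the shared keyword arguments once and adds mode-specific extras on each branch. A minimal sketch of that pattern, with a stand-in objective and made-up values rather than the actual n3fit call:

```python
def build_fmin_args(objective, parallel):
    # Shared keyword arguments for both execution modes
    common = dict(fn=objective, max_evals=5)
    if parallel:
        # Parallel mode sizes the worker queue; progress bar is useful here
        return dict(**common, show_progressbar=True, max_queue_len=4)
    # Sequential mode persists trials to a pickle file instead
    return dict(**common, show_progressbar=False, trials_save_file="tries.pkl")
```
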


@@ -205,6 +239,20 @@ def __init__(self, parameters, sampling_dict, steps=5):
restart_config = sampling_dict.get("restart")
self.restart_hyperopt = True if restart_config else False

# adding extra options for parallel execution
parallel_config = sampling_dict.get("parallel")
self.parallel_hyperopt = True if parallel_config else False

# setting up MongoDB options
if self.parallel_hyperopt:
# add output_path to db name to avoid conflicts
db_name = f'{sampling_dict.get("db_name")}-{sampling_dict.get("output_path")}'
self.db_host = sampling_dict.get("db_host")
self.db_port = sampling_dict.get("db_port")
self.db_name = db_name
self.num_mongo_workers = sampling_dict.get("num_mongo_workers")
self.mongod_runner = MongodRunner(self.db_name, self.db_port)

self.hyper_keys = set([])

if "parameters" in sampling_dict:
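The database-name composition in ``HyperScanner.__init__`` (appending the runcard/output name so that concurrent scans use distinct databases) amounts to a simple string join:

```python
def make_db_name(base, output_path):
    # e.g. base "hyperopt-db" plus runcard name "test_run"
    # yields a per-run database name, avoiding clashes between scans
    return f"{base}-{output_path}"
```
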